1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.util; 18 19 import com.android.annotations.NonNull; 20 import com.android.annotations.Nullable; 21 22 import org.w3c.dom.Attr; 23 import org.w3c.dom.Document; 24 import org.w3c.dom.Element; 25 import org.w3c.dom.Node; 26 import org.w3c.dom.Text; 27 import org.xml.sax.Attributes; 28 import org.xml.sax.InputSource; 29 import org.xml.sax.Locator; 30 import org.xml.sax.SAXException; 31 import org.xml.sax.helpers.DefaultHandler; 32 33 import java.io.ByteArrayOutputStream; 34 import java.io.IOException; 35 import java.io.InputStream; 36 import java.io.StringReader; 37 import java.io.UnsupportedEncodingException; 38 import java.util.ArrayList; 39 import java.util.List; 40 import java.util.regex.Matcher; 41 import java.util.regex.Pattern; 42 43 import javax.xml.parsers.DocumentBuilder; 44 import javax.xml.parsers.DocumentBuilderFactory; 45 import javax.xml.parsers.ParserConfigurationException; 46 import javax.xml.parsers.SAXParser; 47 import javax.xml.parsers.SAXParserFactory; 48 49 /** 50 * A simple DOM XML parser which can retrieve exact beginning and end offsets 51 * (and line and column numbers) for element nodes as well as attribute nodes. 52 */ 53 public class PositionXmlParser { 54 private static final String UTF_8 = "UTF-8"; //$NON-NLS-1$ 55 private static final String UTF_16 = "UTF_16"; //$NON-NLS-1$ 56 private static final String UTF_16LE = "UTF_16LE"; //$NON-NLS-1$ 57 private static final String CONTENT_KEY = "contents"; //$NON-NLS-1$ 58 private final static String POS_KEY = "offsets"; //$NON-NLS-1$ 59 private static final String NAMESPACE_PREFIX_FEATURE = 60 "http://xml.org/sax/features/namespace-prefixes"; //$NON-NLS-1$ 61 private static final String NAMESPACE_FEATURE = 62 "http://xml.org/sax/features/namespaces"; //$NON-NLS-1$ 63 /** See http://www.w3.org/TR/REC-xml/#NT-EncodingDecl */ 64 private static final Pattern ENCODING_PATTERN = 65 Pattern.compile("encoding=['\"](\\S*)['\"]");//$NON-NLS-1$ 66 67 /** 68 * Parses the XML content from the given input stream. 69 * 70 * @param input the input stream containing the XML to be parsed 71 * @return the corresponding document 72 * @throws ParserConfigurationException if a SAX parser is not available 73 * @throws SAXException if the document contains a parsing error 74 * @throws IOException if something is seriously wrong. This should not 75 * happen since the input source is known to be constructed from 76 * a string. 77 */ 78 @Nullable 79 public Document parse(@NonNull InputStream input) 80 throws ParserConfigurationException, SAXException, IOException { 81 // Read in all the data 82 ByteArrayOutputStream out = new ByteArrayOutputStream(); 83 byte[] buf = new byte[1024]; 84 while (true) { 85 int r = input.read(buf); 86 if (r == -1) { 87 break; 88 } 89 out.write(buf, 0, r); 90 } 91 input.close(); 92 return parse(out.toByteArray()); 93 } 94 95 /** 96 * Parses the XML content from the given byte array 97 * 98 * @param data the raw XML data (with unknown encoding) 99 * @return the corresponding document 100 * @throws ParserConfigurationException if a SAX parser is not available 101 * @throws SAXException if the document contains a parsing error 102 * @throws IOException if something is seriously wrong. This should not 103 * happen since the input source is known to be constructed from 104 * a string. 105 */ 106 @Nullable 107 public Document parse(@NonNull byte[] data) 108 throws ParserConfigurationException, SAXException, IOException { 109 String xml = getXmlString(data); 110 return parse(xml, new InputSource(new StringReader(xml)), true); 111 } 112 113 /** 114 * Parses the given XML content. 115 * 116 * @param xml the XML string to be parsed. This must be in the correct 117 * encoding already. 118 * @return the corresponding document 119 * @throws ParserConfigurationException if a SAX parser is not available 120 * @throws SAXException if the document contains a parsing error 121 * @throws IOException if something is seriously wrong. This should not 122 * happen since the input source is known to be constructed from 123 * a string. 124 */ 125 @Nullable 126 public Document parse(@NonNull String xml) 127 throws ParserConfigurationException, SAXException, IOException { 128 return parse(xml, new InputSource(new StringReader(xml)), true); 129 } 130 131 @NonNull 132 private Document parse(@NonNull String xml, @NonNull InputSource input, boolean checkBom) 133 throws ParserConfigurationException, SAXException, IOException { 134 try { 135 SAXParserFactory factory = SAXParserFactory.newInstance(); 136 factory.setFeature(NAMESPACE_FEATURE, true); 137 factory.setFeature(NAMESPACE_PREFIX_FEATURE, true); 138 SAXParser parser = factory.newSAXParser(); 139 DomBuilder handler = new DomBuilder(xml); 140 parser.parse(input, handler); 141 return handler.getDocument(); 142 } catch (SAXException e) { 143 if (checkBom && e.getMessage().contains("Content is not allowed in prolog")) { 144 // Byte order mark in the string? Skip it. There are many markers 145 // (see http://en.wikipedia.org/wiki/Byte_order_mark) so here we'll 146 // just skip those up to the XML prolog beginning character, < 147 xml = xml.replaceFirst("^([\\W]+)<","<"); //$NON-NLS-1$ //$NON-NLS-2$ 148 return parse(xml, new InputSource(new StringReader(xml)), false); 149 } 150 throw e; 151 } 152 } 153 154 /** 155 * Returns the String corresponding to the given byte array of XML data 156 * (with unknown encoding). This method attempts to guess the encoding based 157 * on the XML prologue. 158 * @param data the XML data to be decoded into a string 159 * @return a string corresponding to the XML data 160 */ 161 public static String getXmlString(byte[] data) { 162 int offset = 0; 163 164 String defaultCharset = UTF_8; 165 String charset = null; 166 // Look for the byte order mark, to see if we need to remove bytes from 167 // the input stream (and to determine whether files are big endian or little endian) etc 168 // for files which do not specify the encoding. 169 // See http://unicode.org/faq/utf_bom.html#BOM for more. 170 if (data.length > 4) { 171 if (data[0] == (byte)0xef && data[1] == (byte)0xbb && data[2] == (byte)0xbf) { 172 // UTF-8 173 defaultCharset = charset = UTF_8; 174 offset += 3; 175 } else if (data[0] == (byte)0xfe && data[1] == (byte)0xff) { 176 // UTF-16, big-endian 177 defaultCharset = charset = UTF_16; 178 offset += 2; 179 } else if (data[0] == (byte)0x0 && data[1] == (byte)0x0 180 && data[2] == (byte)0xfe && data[3] == (byte)0xff) { 181 // UTF-32, big-endian 182 defaultCharset = charset = "UTF_32"; //$NON-NLS-1$ 183 offset += 4; 184 } else if (data[0] == (byte)0xff && data[1] == (byte)0xfe 185 && data[2] == (byte)0x0 && data[3] == (byte)0x0) { 186 // UTF-32, little-endian. We must check for this *before* looking for 187 // UTF_16LE since UTF_32LE has the same prefix! 188 defaultCharset = charset = "UTF_32LE"; //$NON-NLS-1$ 189 offset += 4; 190 } else if (data[0] == (byte)0xff && data[1] == (byte)0xfe) { 191 // UTF-16, little-endian 192 defaultCharset = charset = UTF_16LE; 193 offset += 2; 194 } 195 } 196 int length = data.length - offset; 197 198 // Guess encoding by searching for an encoding= entry in the first line. 199 // The prologue, and the encoding names, will always be in ASCII - which means 200 // we don't need to worry about strange character encodings for the prologue characters. 201 // However, one wrinkle is that the whole file may be encoded in something like UTF-16 202 // where there are two bytes per character, so we can't just look for 203 // ['e','n','c','o','d','i','n','g'] etc in the byte array since there could be 204 // multiple bytes for each character. However, since again the prologue is in ASCII, 205 // we can just drop the zeroes. 206 boolean seenOddZero = false; 207 boolean seenEvenZero = false; 208 int prologueStart = -1; 209 for (int lineEnd = offset; lineEnd < data.length; lineEnd++) { 210 if (data[lineEnd] == 0) { 211 if ((lineEnd - offset) % 1 == 0) { 212 seenEvenZero = true; 213 } else { 214 seenOddZero = true; 215 } 216 } else if (data[lineEnd] == '\n' || data[lineEnd] == '\r') { 217 break; 218 } else if (data[lineEnd] == '<') { 219 prologueStart = lineEnd; 220 } else if (data[lineEnd] == '>') { 221 // End of prologue. Quick check to see if this is a utf-8 file since that's 222 // common 223 for (int i = lineEnd - 4; i >= 0; i--) { 224 if ((data[i] == 'u' || data[i] == 'U') 225 && (data[i + 1] == 't' || data[i + 1] == 'T') 226 && (data[i + 2] == 'f' || data[i + 2] == 'F') 227 && (data[i + 3] == '-' || data[i + 3] == '_') 228 && (data[i + 4] == '8') 229 ) { 230 charset = UTF_8; 231 break; 232 } 233 } 234 235 if (charset == null) { 236 StringBuilder sb = new StringBuilder(); 237 for (int i = prologueStart; i <= lineEnd; i++) { 238 if (data[i] != 0) { 239 sb.append((char) data[i]); 240 } 241 } 242 String prologue = sb.toString(); 243 int encodingIndex = prologue.indexOf("encoding"); //$NON-NLS-1$ 244 if (encodingIndex != -1) { 245 Matcher matcher = ENCODING_PATTERN.matcher(prologue); 246 if (matcher.find(encodingIndex)) { 247 charset = matcher.group(1); 248 } 249 } 250 } 251 252 break; 253 } 254 } 255 256 // No prologue on the first line, and no byte order mark: Assume UTF-8/16 257 if (charset == null) { 258 charset = seenOddZero ? UTF_16 : seenEvenZero ? UTF_16LE : UTF_8; 259 } 260 261 String xml = null; 262 try { 263 xml = new String(data, offset, length, charset); 264 } catch (UnsupportedEncodingException e) { 265 try { 266 if (charset != defaultCharset) { 267 xml = new String(data, offset, length, defaultCharset); 268 } 269 } catch (UnsupportedEncodingException u) { 270 // Just use the default encoding below 271 } 272 } 273 if (xml == null) { 274 xml = new String(data, offset, length); 275 } 276 return xml; 277 } 278 279 /** 280 * Returns the position for the given node. This is the start position. The 281 * end position can be obtained via {@link Position#getEnd()}. 282 * 283 * @param node the node to look up position for 284 * @return the position, or null if the node type is not supported for 285 * position info 286 */ 287 @Nullable 288 public Position getPosition(@NonNull Node node) { 289 // Look up the position information stored while parsing for the given node. 290 // Note however that we only store position information for elements (because 291 // there is no SAX callback for individual attributes). 292 // Therefore, this method special cases this: 293 // -- First, it looks at the owner element and uses its position 294 // information as a first approximation. 295 // -- Second, it uses that, as well as the original XML text, to search 296 // within the node range for an exact text match on the attribute name 297 // and if found uses that as the exact node offsets instead. 298 if (node instanceof Attr) { 299 Attr attr = (Attr) node; 300 Position pos = (Position) attr.getOwnerElement().getUserData(POS_KEY); 301 if (pos != null) { 302 int startOffset = pos.getOffset(); 303 int endOffset = pos.getEnd().getOffset(); 304 305 // Find attribute in the text 306 String contents = (String) node.getOwnerDocument().getUserData(CONTENT_KEY); 307 if (contents == null) { 308 return null; 309 } 310 311 // Locate the name=value attribute in the source text 312 // Fast string check first for the common occurrence 313 String name = attr.getName(); 314 Pattern pattern = Pattern.compile( 315 String.format("%1$s\\s*=\\s*[\"'].*[\"']", name)); //$NON-NLS-1$ 316 Matcher matcher = pattern.matcher(contents); 317 if (matcher.find(startOffset) && matcher.start() <= endOffset) { 318 int index = matcher.start(); 319 // Adjust the line and column to this new offset 320 int line = pos.getLine(); 321 int column = pos.getColumn(); 322 for (int offset = pos.getOffset(); offset < index; offset++) { 323 char t = contents.charAt(offset); 324 if (t == '\n') { 325 line++; 326 column = 0; 327 } else { 328 column++; 329 } 330 } 331 332 Position attributePosition = createPosition(line, column, index); 333 // Also set end range for retrieval in getLocation 334 attributePosition.setEnd(createPosition(line, column, matcher.end())); 335 return attributePosition; 336 } else { 337 // No regexp match either: just fall back to element position 338 return pos; 339 } 340 } 341 } else if (node instanceof Text) { 342 // Position of parent element, if any 343 Position pos = null; 344 if (node.getPreviousSibling() != null) { 345 pos = (Position) node.getPreviousSibling().getUserData(POS_KEY); 346 } 347 if (pos == null) { 348 pos = (Position) node.getParentNode().getUserData(POS_KEY); 349 } 350 if (pos != null) { 351 // Attempt to point forward to the actual text node 352 int startOffset = pos.getOffset(); 353 int endOffset = pos.getEnd().getOffset(); 354 int line = pos.getLine(); 355 int column = pos.getColumn(); 356 357 // Find attribute in the text 358 String contents = (String) node.getOwnerDocument().getUserData(CONTENT_KEY); 359 if (contents == null || contents.length() < endOffset) { 360 return null; 361 } 362 363 boolean inAttribute = false; 364 for (int offset = startOffset; offset <= endOffset; offset++) { 365 char c = contents.charAt(offset); 366 if (c == '>' && !inAttribute) { 367 // Found the end of the element open tag: this is where the 368 // text begins. 369 370 // Skip > 371 offset++; 372 373 // Skip text whitespace prefix, if the text node contains non-whitespace 374 // characters 375 String text = node.getNodeValue(); 376 int textIndex = 0; 377 int textLength = text.length(); 378 int newLine = line; 379 int newColumn = column; 380 for (; textIndex < text.length(); textIndex++) { 381 char t = text.charAt(textIndex); 382 if (t == '\n') { 383 newLine++; 384 newColumn = 0; 385 } else if (!Character.isWhitespace(t)) { 386 break; 387 } else { 388 newColumn++; 389 } 390 } 391 if (textIndex == textLength) { 392 textIndex = 0; // Whitespace node 393 } else { 394 line = newLine; 395 column = newColumn; 396 } 397 398 Position attributePosition = createPosition(line, column, 399 offset + textIndex); 400 // Also set end range for retrieval in getLocation 401 attributePosition.setEnd(createPosition(line, column, 402 offset + textLength)); 403 return attributePosition; 404 } else if (c == '"') { 405 inAttribute = !inAttribute; 406 } else if (c == '\n') { 407 line++; 408 column = -1; // pre-subtract column added below 409 } 410 column++; 411 } 412 413 return pos; 414 } 415 } 416 417 return (Position) node.getUserData(POS_KEY); 418 } 419 420 /** 421 * SAX parser handler which incrementally builds up a DOM document as we go 422 * along, and updates position information along the way. Position 423 * information is attached to the DOM nodes by setting user data with the 424 * {@link POS_KEY} key. 425 */ 426 private final class DomBuilder extends DefaultHandler { 427 private final String mXml; 428 private final Document mDocument; 429 private Locator mLocator; 430 private int mCurrentLine = 0; 431 private int mCurrentOffset; 432 private int mCurrentColumn; 433 private final List<Element> mStack = new ArrayList<Element>(); 434 private final StringBuilder mPendingText = new StringBuilder(); 435 436 private DomBuilder(String xml) throws ParserConfigurationException { 437 mXml = xml; 438 439 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); 440 factory.setNamespaceAware(true); 441 factory.setValidating(false); 442 DocumentBuilder docBuilder = factory.newDocumentBuilder(); 443 mDocument = docBuilder.newDocument(); 444 mDocument.setUserData(CONTENT_KEY, xml, null); 445 } 446 447 /** Returns the document parsed by the handler */ 448 Document getDocument() { 449 return mDocument; 450 } 451 452 @Override 453 public void setDocumentLocator(Locator locator) { 454 this.mLocator = locator; 455 } 456 457 @Override 458 public void startElement(String uri, String localName, String qName, 459 Attributes attributes) throws SAXException { 460 try { 461 flushText(); 462 Element element = mDocument.createElement(qName); 463 for (int i = 0; i < attributes.getLength(); i++) { 464 if (attributes.getURI(i) != null && attributes.getURI(i).length() > 0) { 465 Attr attr = mDocument.createAttributeNS(attributes.getURI(i), 466 attributes.getQName(i)); 467 attr.setValue(attributes.getValue(i)); 468 element.setAttributeNodeNS(attr); 469 assert attr.getOwnerElement() == element; 470 } else { 471 Attr attr = mDocument.createAttribute(attributes.getQName(i)); 472 attr.setValue(attributes.getValue(i)); 473 element.setAttributeNode(attr); 474 assert attr.getOwnerElement() == element; 475 } 476 } 477 478 Position pos = getCurrentPosition(); 479 480 // The starting position reported to us by SAX is really the END of the 481 // open tag in an element, when all the attributes have been processed. 482 // We have to scan backwards to find the real beginning. We'll do that 483 // by scanning backwards. 484 // -1: Make sure that when we have <foo></foo> we don't consider </foo> 485 // the beginning since pos.offset will typically point to the first character 486 // AFTER the element open tag, which could be a closing tag or a child open 487 // tag 488 489 for (int offset = pos.getOffset() - 1; offset >= 0; offset--) { 490 char c = mXml.charAt(offset); 491 // < cannot appear in attribute values or anywhere else within 492 // an element open tag, so we know the first occurrence is the real 493 // element start 494 if (c == '<') { 495 // Adjust line position 496 int line = pos.getLine(); 497 for (int i = offset, n = pos.getOffset(); i < n; i++) { 498 if (mXml.charAt(i) == '\n') { 499 line--; 500 } 501 } 502 503 // Compute new column position 504 int column = 0; 505 for (int i = offset - 1; i >= 0; i--, column++) { 506 if (mXml.charAt(i) == '\n') { 507 break; 508 } 509 } 510 511 pos = createPosition(line, column, offset); 512 break; 513 } 514 } 515 516 element.setUserData(POS_KEY, pos, null); 517 mStack.add(element); 518 } catch (Exception t) { 519 throw new SAXException(t); 520 } 521 } 522 523 @Override 524 public void endElement(String uri, String localName, String qName) { 525 flushText(); 526 Element element = mStack.remove(mStack.size() - 1); 527 528 Position pos = (Position) element.getUserData(POS_KEY); 529 assert pos != null; 530 pos.setEnd(getCurrentPosition()); 531 532 if (mStack.isEmpty()) { 533 mDocument.appendChild(element); 534 } else { 535 Element parent = mStack.get(mStack.size() - 1); 536 parent.appendChild(element); 537 } 538 } 539 540 /** 541 * Returns a position holder for the current position. The most 542 * important part of this function is to incrementally compute the 543 * offset as well, by counting forwards until it reaches the new line 544 * number and column position of the XML parser, counting characters as 545 * it goes along. 546 */ 547 private Position getCurrentPosition() { 548 int line = mLocator.getLineNumber() - 1; 549 int column = mLocator.getColumnNumber() - 1; 550 551 // Compute offset incrementally now that we have the new line and column 552 // numbers 553 int xmlLength = mXml.length(); 554 while (mCurrentLine < line && mCurrentOffset < xmlLength) { 555 char c = mXml.charAt(mCurrentOffset); 556 if (c == '\r' && mCurrentOffset < xmlLength - 1) { 557 if (mXml.charAt(mCurrentOffset + 1) != '\n') { 558 mCurrentLine++; 559 mCurrentColumn = 0; 560 } 561 } else if (c == '\n') { 562 mCurrentLine++; 563 mCurrentColumn = 0; 564 } else { 565 mCurrentColumn++; 566 } 567 mCurrentOffset++; 568 } 569 570 mCurrentOffset += column - mCurrentColumn; 571 if (mCurrentOffset >= xmlLength) { 572 // The parser sometimes passes wrong column numbers at the 573 // end of the file: Ensure that the offset remains valid. 574 mCurrentOffset = xmlLength; 575 } 576 mCurrentColumn = column; 577 578 return createPosition(mCurrentLine, mCurrentColumn, mCurrentOffset); 579 } 580 581 @Override 582 public void characters(char c[], int start, int length) throws SAXException { 583 mPendingText.append(c, start, length); 584 } 585 586 private void flushText() { 587 if (mPendingText.length() > 0 && !mStack.isEmpty()) { 588 Element element = mStack.get(mStack.size() - 1); 589 Node textNode = mDocument.createTextNode(mPendingText.toString()); 590 element.appendChild(textNode); 591 mPendingText.setLength(0); 592 } 593 } 594 } 595 596 /** 597 * Creates a position while constructing the DOM document. This method 598 * allows a subclass to create a custom implementation of the position 599 * class. 600 * 601 * @param line the line number for the position 602 * @param column the column number for the position 603 * @param offset the character offset 604 * @return a new position 605 */ 606 @NonNull 607 protected Position createPosition(int line, int column, int offset) { 608 return new DefaultPosition(line, column, offset); 609 } 610 611 protected interface Position { 612 /** 613 * Linked position: for a begin position this will point to the 614 * corresponding end position. For an end position this will be null. 615 * 616 * @return the end position, or null 617 */ 618 @Nullable 619 public Position getEnd(); 620 621 /** 622 * Linked position: for a begin position this will point to the 623 * corresponding end position. For an end position this will be null. 624 * 625 * @param end the end position 626 */ 627 public void setEnd(@NonNull Position end); 628 629 /** @return the line number, 0-based */ 630 public int getLine(); 631 632 /** @return the offset number, 0-based */ 633 public int getOffset(); 634 635 /** @return the column number, 0-based, and -1 if the column number if not known */ 636 public int getColumn(); 637 } 638 639 protected static class DefaultPosition implements Position { 640 /** The line number (0-based where the first line is line 0) */ 641 private final int mLine; 642 private final int mColumn; 643 private final int mOffset; 644 private Position mEnd; 645 646 /** 647 * Creates a new {@link Position} 648 * 649 * @param line the 0-based line number, or -1 if unknown 650 * @param column the 0-based column number, or -1 if unknown 651 * @param offset the offset, or -1 if unknown 652 */ 653 public DefaultPosition(int line, int column, int offset) { 654 this.mLine = line; 655 this.mColumn = column; 656 this.mOffset = offset; 657 } 658 659 @Override 660 public int getLine() { 661 return mLine; 662 } 663 664 @Override 665 public int getOffset() { 666 return mOffset; 667 } 668 669 @Override 670 public int getColumn() { 671 return mColumn; 672 } 673 674 @Override 675 public Position getEnd() { 676 return mEnd; 677 } 678 679 @Override 680 public void setEnd(Position end) { 681 mEnd = end; 682 } 683 } 684 } 685