android/util/PositionXmlParser.java

/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.util;

import com.android.annotations.NonNull;
import com.android.annotations.Nullable;

import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.Text;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

/**
 * A simple DOM XML parser which can retrieve exact beginning and end offsets
 * (and line and column numbers) for element nodes as well as attribute nodes.
 */
public class PositionXmlParser {
    private static final String UTF_8 = "UTF-8";                 //$NON-NLS-1$
    private static final String UTF_16 = "UTF_16";               //$NON-NLS-1$
    private static final String UTF_16LE = "UTF_16LE";           //$NON-NLS-1$
    private static final String CONTENT_KEY = "contents";        //$NON-NLS-1$
    private final static String POS_KEY = "offsets";             //$NON-NLS-1$
    private static final String NAMESPACE_PREFIX_FEATURE =
            "http://xml.org/sax/features/namespace-prefixes";    //$NON-NLS-1$
    private static final String NAMESPACE_FEATURE =
            "http://xml.org/sax/features/namespaces";            //$NON-NLS-1$
    /** See http://www.w3.org/TR/REC-xml/#NT-EncodingDecl */
    private static final Pattern ENCODING_PATTERN =
            Pattern.compile("encoding=['\"](\\S*)['\"]");//$NON-NLS-1$

    /**
     * Parses the XML content from the given input stream.
     *
     * @param input the input stream containing the XML to be parsed
     * @return the corresponding document
     * @throws ParserConfigurationException if a SAX parser is not available
     * @throws SAXException if the document contains a parsing error
     * @throws IOException if something is seriously wrong. This should not
     *             happen since the input source is known to be constructed from
     *             a string.
     */
    @Nullable
    public Document parse(@NonNull InputStream input)
            throws ParserConfigurationException, SAXException, IOException {
        // Read in all the data
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] buf = new byte[1024];
        while (true) {
          int r = input.read(buf);
          if (r == -1) {
            break;
          }
          out.write(buf, 0, r);
        }
        input.close();
        return parse(out.toByteArray());
    }

    /**
     * Parses the XML content from the given byte array
     *
     * @param data the raw XML data (with unknown encoding)
     * @return the corresponding document
     * @throws ParserConfigurationException if a SAX parser is not available
     * @throws SAXException if the document contains a parsing error
     * @throws IOException if something is seriously wrong. This should not
     *             happen since the input source is known to be constructed from
     *             a string.
     */
    @Nullable
    public Document parse(@NonNull byte[] data)
            throws ParserConfigurationException, SAXException, IOException {
        String xml = getXmlString(data);
        return parse(xml, new InputSource(new StringReader(xml)), true);
    }

    /**
     * Parses the given XML content.
     *
     * @param xml the XML string to be parsed. This must be in the correct
     *     encoding already.
     * @return the corresponding document
     * @throws ParserConfigurationException if a SAX parser is not available
     * @throws SAXException if the document contains a parsing error
     * @throws IOException if something is seriously wrong. This should not
     *             happen since the input source is known to be constructed from
     *             a string.
     */
    @Nullable
    public Document parse(@NonNull String xml)
            throws ParserConfigurationException, SAXException, IOException {
        return parse(xml, new InputSource(new StringReader(xml)), true);
    }

    @NonNull
    private Document parse(@NonNull String xml, @NonNull InputSource input, boolean checkBom)
            throws ParserConfigurationException, SAXException, IOException {
        try {
            SAXParserFactory factory = SAXParserFactory.newInstance();
            factory.setFeature(NAMESPACE_FEATURE, true);
            factory.setFeature(NAMESPACE_PREFIX_FEATURE, true);
            SAXParser parser = factory.newSAXParser();
            DomBuilder handler = new DomBuilder(xml);
            parser.parse(input, handler);
            return handler.getDocument();
        } catch (SAXException e) {
            if (checkBom && e.getMessage().contains("Content is not allowed in prolog")) {
                // Byte order mark in the string? Skip it. There are many markers
                // (see http://en.wikipedia.org/wiki/Byte_order_mark) so here we'll
                // just skip those up to the XML prolog beginning character, <
                xml = xml.replaceFirst("^([\\W]+)<","<");  //$NON-NLS-1$ //$NON-NLS-2$
                return parse(xml, new InputSource(new StringReader(xml)), false);
            }
            throw e;
        }
    }

    /**
     * Returns the String corresponding to the given byte array of XML data
     * (with unknown encoding). This method attempts to guess the encoding based
     * on the XML prologue.
     * @param data the XML data to be decoded into a string
     * @return a string corresponding to the XML data
     */
    public static String getXmlString(byte[] data) {
        int offset = 0;

        String defaultCharset = UTF_8;
        String charset = null;
        // Look for the byte order mark, to see if we need to remove bytes from
        // the input stream (and to determine whether files are big endian or little endian) etc
        // for files which do not specify the encoding.
        // See http://unicode.org/faq/utf_bom.html#BOM for more.
        if (data.length > 4) {
            if (data[0] == (byte)0xef && data[1] == (byte)0xbb && data[2] == (byte)0xbf) {
                // UTF-8
                defaultCharset = charset = UTF_8;
                offset += 3;
            } else if (data[0] == (byte)0xfe && data[1] == (byte)0xff) {
                //  UTF-16, big-endian
                defaultCharset = charset = UTF_16;
                offset += 2;
            } else if (data[0] == (byte)0x0 && data[1] == (byte)0x0
                    && data[2] == (byte)0xfe && data[3] == (byte)0xff) {
                // UTF-32, big-endian
                defaultCharset = charset = "UTF_32";    //$NON-NLS-1$
                offset += 4;
            } else if (data[0] == (byte)0xff && data[1] == (byte)0xfe
                    && data[2] == (byte)0x0 && data[3] == (byte)0x0) {
                // UTF-32, little-endian. We must check for this *before* looking for
                // UTF_16LE since UTF_32LE has the same prefix!
                defaultCharset = charset = "UTF_32LE";  //$NON-NLS-1$
                offset += 4;
            } else if (data[0] == (byte)0xff && data[1] == (byte)0xfe) {
                //  UTF-16, little-endian
                defaultCharset = charset = UTF_16LE;
                offset += 2;
            }
        }
        int length = data.length - offset;

        // Guess encoding by searching for an encoding= entry in the first line.
        // The prologue, and the encoding names, will always be in ASCII - which means
        // we don't need to worry about strange character encodings for the prologue characters.
        // However, one wrinkle is that the whole file may be encoded in something like UTF-16
        // where there are two bytes per character, so we can't just look for
        //  ['e','n','c','o','d','i','n','g'] etc in the byte array since there could be
        // multiple bytes for each character. However, since again the prologue is in ASCII,
        // we can just drop the zeroes.
        boolean seenOddZero = false;
        boolean seenEvenZero = false;
        int prologueStart = -1;
        for (int lineEnd = offset; lineEnd < data.length; lineEnd++) {
            if (data[lineEnd] == 0) {
                if ((lineEnd - offset) % 1 == 0) {
                    seenEvenZero = true;
                } else {
                    seenOddZero = true;
                }
            } else if (data[lineEnd] == '\n' || data[lineEnd] == '\r') {
                break;
            } else if (data[lineEnd] == '<') {
                prologueStart = lineEnd;
            } else if (data[lineEnd] == '>') {
                // End of prologue. Quick check to see if this is a utf-8 file since that's
                // common
                for (int i = lineEnd - 4; i >= 0; i--) {
                    if ((data[i] == 'u' || data[i] == 'U')
                            && (data[i + 1] == 't' || data[i + 1] == 'T')
                            && (data[i + 2] == 'f' || data[i + 2] == 'F')
                            && (data[i + 3] == '-' || data[i + 3] == '_')
                            && (data[i + 4] == '8')
                            ) {
                        charset = UTF_8;
                        break;
                    }
                }

                if (charset == null) {
                    StringBuilder sb = new StringBuilder();
                    for (int i = prologueStart; i <= lineEnd; i++) {
                        if (data[i] != 0) {
                            sb.append((char) data[i]);
                        }
                    }
                    String prologue = sb.toString();
                    int encodingIndex = prologue.indexOf("encoding"); //$NON-NLS-1$
                    if (encodingIndex != -1) {
                        Matcher matcher = ENCODING_PATTERN.matcher(prologue);
                        if (matcher.find(encodingIndex)) {
                            charset = matcher.group(1);
                        }
                    }
                }

                break;
            }
        }

        // No prologue on the first line, and no byte order mark: Assume UTF-8/16
        if (charset == null) {
            charset = seenOddZero ? UTF_16 : seenEvenZero ? UTF_16LE : UTF_8;
        }

        String xml = null;
        try {
            xml = new String(data, offset, length, charset);
        } catch (UnsupportedEncodingException e) {
            try {
                if (charset != defaultCharset) {
                    xml = new String(data, offset, length, defaultCharset);
                }
            } catch (UnsupportedEncodingException u) {
                // Just use the default encoding below
            }
        }
        if (xml == null) {
            xml = new String(data, offset, length);
        }
        return xml;
    }

    /**
     * Returns the position for the given node. This is the start position. The
     * end position can be obtained via {@link Position#getEnd()}.
     *
     * @param node the node to look up position for
     * @return the position, or null if the node type is not supported for
     *         position info
     */
    @Nullable
    public Position getPosition(@NonNull Node node) {
        // Look up the position information stored while parsing for the given node.
        // Note however that we only store position information for elements (because
        // there is no SAX callback for individual attributes).
        // Therefore, this method special cases this:
        //  -- First, it looks at the owner element and uses its position
        //     information as a first approximation.
        //  -- Second, it uses that, as well as the original XML text, to search
        //     within the node range for an exact text match on the attribute name
        //     and if found uses that as the exact node offsets instead.
        if (node instanceof Attr) {
            Attr attr = (Attr) node;
            Position pos = (Position) attr.getOwnerElement().getUserData(POS_KEY);
            if (pos != null) {
                int startOffset = pos.getOffset();
                int endOffset = pos.getEnd().getOffset();

                // Find attribute in the text
                String contents = (String) node.getOwnerDocument().getUserData(CONTENT_KEY);
                if (contents == null) {
                    return null;
                }

                // Locate the name=value attribute in the source text
                // Fast string check first for the common occurrence
                String name = attr.getName();
                Pattern pattern = Pattern.compile(
                        String.format("%1$s\\s*=\\s*[\"'].*[\"']", name)); //$NON-NLS-1$
                Matcher matcher = pattern.matcher(contents);
                if (matcher.find(startOffset) && matcher.start() <= endOffset) {
                    int index = matcher.start();
                    // Adjust the line and column to this new offset
                    int line = pos.getLine();
                    int column = pos.getColumn();
                    for (int offset = pos.getOffset(); offset < index; offset++) {
                        char t = contents.charAt(offset);
                        if (t == '\n') {
                            line++;
                            column = 0;
                        } else {
                            column++;
                        }
                    }

                    Position attributePosition = createPosition(line, column, index);
                    // Also set end range for retrieval in getLocation
                    attributePosition.setEnd(createPosition(line, column, matcher.end()));
                    return attributePosition;
                } else {
                    // No regexp match either: just fall back to element position
                    return pos;
                }
            }
        } else if (node instanceof Text) {
            // Position of parent element, if any
            Position pos = null;
            if (node.getPreviousSibling() != null) {
                pos = (Position) node.getPreviousSibling().getUserData(POS_KEY);
            }
            if (pos == null) {
                pos = (Position) node.getParentNode().getUserData(POS_KEY);
            }
            if (pos != null) {
                // Attempt to point forward to the actual text node
                int startOffset = pos.getOffset();
                int endOffset = pos.getEnd().getOffset();
                int line = pos.getLine();
                int column = pos.getColumn();

                // Find attribute in the text
                String contents = (String) node.getOwnerDocument().getUserData(CONTENT_KEY);
                if (contents == null || contents.length() < endOffset) {
                    return null;
                }

                boolean inAttribute = false;
                for (int offset = startOffset; offset <= endOffset; offset++) {
                    char c = contents.charAt(offset);
                    if (c == '>' && !inAttribute) {
                        // Found the end of the element open tag: this is where the
                        // text begins.

                        // Skip >
                        offset++;

                        // Skip text whitespace prefix, if the text node contains non-whitespace
                        // characters
                        String text = node.getNodeValue();
                        int textIndex = 0;
                        int textLength = text.length();
                        int newLine = line;
                        int newColumn = column;
                        for (; textIndex < text.length(); textIndex++) {
                            char t = text.charAt(textIndex);
                            if (t == '\n') {
                                newLine++;
                                newColumn = 0;
                            } else if (!Character.isWhitespace(t)) {
                                break;
                            } else {
                                newColumn++;
                            }
                        }
                        if (textIndex == textLength) {
                            textIndex = 0; // Whitespace node
                        } else {
                            line = newLine;
                            column = newColumn;
                        }

                        Position attributePosition = createPosition(line, column,
                                offset + textIndex);
                        // Also set end range for retrieval in getLocation
                        attributePosition.setEnd(createPosition(line, column,
                                offset + textLength));
                        return attributePosition;
                    } else if (c == '"') {
                        inAttribute = !inAttribute;
                    } else if (c == '\n') {
                        line++;
                        column = -1; // pre-subtract column added below
                    }
                    column++;
                }

                return pos;
            }
        }

        return (Position) node.getUserData(POS_KEY);
    }

    /**
     * SAX parser handler which incrementally builds up a DOM document as we go
     * along, and updates position information along the way. Position
     * information is attached to the DOM nodes by setting user data with the
     * {@link POS_KEY} key.
     */
    private final class DomBuilder extends DefaultHandler {
        private final String mXml;
        private final Document mDocument;
        private Locator mLocator;
        private int mCurrentLine = 0;
        private int mCurrentOffset;
        private int mCurrentColumn;
        private final List<Element> mStack = new ArrayList<Element>();
        private final StringBuilder mPendingText = new StringBuilder();

        private DomBuilder(String xml) throws ParserConfigurationException {
            mXml = xml;

            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            factory.setNamespaceAware(true);
            factory.setValidating(false);
            DocumentBuilder docBuilder = factory.newDocumentBuilder();
            mDocument = docBuilder.newDocument();
            mDocument.setUserData(CONTENT_KEY, xml, null);
        }

        /** Returns the document parsed by the handler */
        Document getDocument() {
            return mDocument;
        }

        @Override
        public void setDocumentLocator(Locator locator) {
            this.mLocator = locator;
        }

        @Override
        public void startElement(String uri, String localName, String qName,
                Attributes attributes) throws SAXException {
            try {
                flushText();
                Element element = mDocument.createElement(qName);
                for (int i = 0; i < attributes.getLength(); i++) {
                    if (attributes.getURI(i) != null && attributes.getURI(i).length() > 0) {
                        Attr attr = mDocument.createAttributeNS(attributes.getURI(i),
                                attributes.getQName(i));
                        attr.setValue(attributes.getValue(i));
                        element.setAttributeNodeNS(attr);
                        assert attr.getOwnerElement() == element;
                    } else {
                        Attr attr = mDocument.createAttribute(attributes.getQName(i));
                        attr.setValue(attributes.getValue(i));
                        element.setAttributeNode(attr);
                        assert attr.getOwnerElement() == element;
                    }
                }

                Position pos = getCurrentPosition();

                // The starting position reported to us by SAX is really the END of the
                // open tag in an element, when all the attributes have been processed.
                // We have to scan backwards to find the real beginning. We'll do that
                // by scanning backwards.
                // -1: Make sure that when we have <foo></foo> we don't consider </foo>
                // the beginning since pos.offset will typically point to the first character
                // AFTER the element open tag, which could be a closing tag or a child open
                // tag

                for (int offset = pos.getOffset() - 1; offset >= 0; offset--) {
                    char c = mXml.charAt(offset);
                    // < cannot appear in attribute values or anywhere else within
                    // an element open tag, so we know the first occurrence is the real
                    // element start
                    if (c == '<') {
                        // Adjust line position
                        int line = pos.getLine();
                        for (int i = offset, n = pos.getOffset(); i < n; i++) {
                            if (mXml.charAt(i) == '\n') {
                                line--;
                            }
                        }

                        // Compute new column position
                        int column = 0;
                        for (int i = offset - 1; i >= 0; i--, column++) {
                            if (mXml.charAt(i) == '\n') {
                                break;
                            }
                        }

                        pos = createPosition(line, column, offset);
                        break;
                    }
                }

                element.setUserData(POS_KEY, pos, null);
                mStack.add(element);
            } catch (Exception t) {
                throw new SAXException(t);
            }
        }

        @Override
        public void endElement(String uri, String localName, String qName) {
            flushText();
            Element element = mStack.remove(mStack.size() - 1);

            Position pos = (Position) element.getUserData(POS_KEY);
            assert pos != null;
            pos.setEnd(getCurrentPosition());

            if (mStack.isEmpty()) {
                mDocument.appendChild(element);
            } else {
                Element parent = mStack.get(mStack.size() - 1);
                parent.appendChild(element);
            }
        }

        /**
         * Returns a position holder for the current position. The most
         * important part of this function is to incrementally compute the
         * offset as well, by counting forwards until it reaches the new line
         * number and column position of the XML parser, counting characters as
         * it goes along.
         */
        private Position getCurrentPosition() {
            int line = mLocator.getLineNumber() - 1;
            int column = mLocator.getColumnNumber() - 1;

            // Compute offset incrementally now that we have the new line and column
            // numbers
            int xmlLength = mXml.length();
            while (mCurrentLine < line && mCurrentOffset < xmlLength) {
                char c = mXml.charAt(mCurrentOffset);
                if (c == '\r' && mCurrentOffset < xmlLength - 1) {
                    if (mXml.charAt(mCurrentOffset + 1) != '\n') {
                        mCurrentLine++;
                        mCurrentColumn = 0;
                    }
                } else if (c == '\n') {
                    mCurrentLine++;
                    mCurrentColumn = 0;
                } else {
                    mCurrentColumn++;
                }
                mCurrentOffset++;
            }

            mCurrentOffset += column - mCurrentColumn;
            if (mCurrentOffset >= xmlLength) {
                // The parser sometimes passes wrong column numbers at the
                // end of the file: Ensure that the offset remains valid.
                mCurrentOffset = xmlLength;
            }
            mCurrentColumn = column;

            return createPosition(mCurrentLine, mCurrentColumn, mCurrentOffset);
        }

        @Override
        public void characters(char c[], int start, int length) throws SAXException {
            mPendingText.append(c, start, length);
        }

        private void flushText() {
            if (mPendingText.length() > 0 && !mStack.isEmpty()) {
                Element element = mStack.get(mStack.size() - 1);
                Node textNode = mDocument.createTextNode(mPendingText.toString());
                element.appendChild(textNode);
                mPendingText.setLength(0);
            }
        }
    }

    /**
     * Creates a position while constructing the DOM document. This method
     * allows a subclass to create a custom implementation of the position
     * class.
     *
     * @param line the line number for the position
     * @param column the column number for the position
     * @param offset the character offset
     * @return a new position
     */
    @NonNull
    protected Position createPosition(int line, int column, int offset) {
        return new DefaultPosition(line, column, offset);
    }

    protected interface Position {
        /**
         * Linked position: for a begin position this will point to the
         * corresponding end position. For an end position this will be null.
         *
         * @return the end position, or null
         */
        @Nullable
        public Position getEnd();

        /**
         * Linked position: for a begin position this will point to the
         * corresponding end position. For an end position this will be null.
         *
         * @param end the end position
         */
        public void setEnd(@NonNull Position end);

        /** @return the line number, 0-based */
        public int getLine();

        /** @return the offset number, 0-based */
        public int getOffset();

        /** @return the column number, 0-based, and -1 if the column number if not known */
        public int getColumn();
    }

    protected static class DefaultPosition implements Position {
        /** The line number (0-based where the first line is line 0) */
        private final int mLine;
        private final int mColumn;
        private final int mOffset;
        private Position mEnd;

        /**
         * Creates a new {@link Position}
         *
         * @param line the 0-based line number, or -1 if unknown
         * @param column the 0-based column number, or -1 if unknown
         * @param offset the offset, or -1 if unknown
         */
        public DefaultPosition(int line, int column, int offset) {
            this.mLine = line;
            this.mColumn = column;
            this.mOffset = offset;
        }

        @Override
        public int getLine() {
            return mLine;
        }

        @Override
        public int getOffset() {
            return mOffset;
        }

        @Override
        public int getColumn() {
            return mColumn;
        }

        @Override
        public Position getEnd() {
            return mEnd;
        }

        @Override
        public void setEnd(Position end) {
            mEnd = end;
        }
    }
}