Home | History | Annotate | Download | only in parsers
      1 /*
      2  * Copyright (C) 2007 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package org.apache.harmony.xml.parsers;
     18 
     19 import java.io.IOException;
     20 import java.net.URL;
     21 import java.net.URLConnection;
     22 import javax.xml.parsers.DocumentBuilder;
     23 import libcore.io.IoUtils;
     24 import org.apache.harmony.xml.dom.CDATASectionImpl;
     25 import org.apache.harmony.xml.dom.DOMImplementationImpl;
     26 import org.apache.harmony.xml.dom.DocumentImpl;
     27 import org.apache.harmony.xml.dom.DocumentTypeImpl;
     28 import org.apache.harmony.xml.dom.TextImpl;
     29 import org.kxml2.io.KXmlParser;
     30 import org.w3c.dom.Attr;
     31 import org.w3c.dom.DOMImplementation;
     32 import org.w3c.dom.Document;
     33 import org.w3c.dom.DocumentType;
     34 import org.w3c.dom.Element;
     35 import org.w3c.dom.Node;
     36 import org.w3c.dom.Text;
     37 import org.xml.sax.EntityResolver;
     38 import org.xml.sax.ErrorHandler;
     39 import org.xml.sax.InputSource;
     40 import org.xml.sax.SAXException;
     41 import org.xml.sax.SAXParseException;
     42 import org.xml.sax.helpers.LocatorImpl;
     43 import org.xmlpull.v1.XmlPullParser;
     44 import org.xmlpull.v1.XmlPullParserException;
     45 
     46 /**
     47  * Builds a DOM using KXmlParser.
     48  */
     49 class DocumentBuilderImpl extends DocumentBuilder {
     50 
     51     private static DOMImplementationImpl dom = DOMImplementationImpl.getInstance();
     52 
     53     private boolean coalescing;
     54     private EntityResolver entityResolver;
     55     private ErrorHandler errorHandler;
     56     private boolean ignoreComments;
     57     private boolean ignoreElementContentWhitespace;
     58     private boolean namespaceAware;
     59     // adding a new field? don't forget to update reset().
     60 
     61     @Override public void reset() {
     62         coalescing = false;
     63         entityResolver = null;
     64         errorHandler = null;
     65         ignoreComments = false;
     66         ignoreElementContentWhitespace = false;
     67         namespaceAware = false;
     68     }
     69 
     70     @Override
     71     public DOMImplementation getDOMImplementation() {
     72         return dom;
     73     }
     74 
     75     @Override
     76     public boolean isNamespaceAware() {
     77         return namespaceAware;
     78     }
     79 
     80     @Override
     81     public boolean isValidating() {
     82         return false;
     83     }
     84 
     85     @Override
     86     public Document newDocument() {
     87         return dom.createDocument(null, null, null);
     88     }
     89 
     90     @Override
     91     public Document parse(InputSource source) throws SAXException, IOException {
     92         if (source == null) {
     93             throw new IllegalArgumentException("source == null");
     94         }
     95 
     96         String namespaceURI = null;
     97         String qualifiedName = null;
     98         DocumentType doctype = null;
     99         String inputEncoding = source.getEncoding();
    100         String systemId = source.getSystemId();
    101         DocumentImpl document = new DocumentImpl(
    102                 dom, namespaceURI, qualifiedName, doctype, inputEncoding);
    103         document.setDocumentURI(systemId);
    104 
    105         KXmlParser parser = new KXmlParser();
    106         try {
    107             parser.keepNamespaceAttributes();
    108             parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, namespaceAware);
    109 
    110             if (source.getByteStream() != null) {
    111                 parser.setInput(source.getByteStream(), inputEncoding);
    112             } else if (source.getCharacterStream() != null) {
    113                 parser.setInput(source.getCharacterStream());
    114             } else if (systemId != null) {
    115                 URL url = new URL(systemId);
    116                 URLConnection urlConnection = url.openConnection();
    117                 urlConnection.connect();
    118                 // TODO: if null, extract the inputEncoding from the Content-Type header?
    119                 parser.setInput(urlConnection.getInputStream(), inputEncoding);
    120             } else {
    121                 throw new SAXParseException("InputSource needs a stream, reader or URI", null);
    122             }
    123 
    124             if (parser.nextToken() == XmlPullParser.END_DOCUMENT) {
    125                 throw new SAXParseException("Unexpected end of document", null);
    126             }
    127 
    128             parse(parser, document, document, XmlPullParser.END_DOCUMENT);
    129 
    130             parser.require(XmlPullParser.END_DOCUMENT, null, null);
    131         } catch (XmlPullParserException ex) {
    132             Throwable detail = ex.getDetail();
    133             if (detail instanceof IOException) {
    134                 throw (IOException) detail;
    135             }
    136             if (detail instanceof RuntimeException) {
    137                 throw (RuntimeException) detail;
    138             }
    139 
    140             LocatorImpl locator = new LocatorImpl();
    141 
    142             locator.setPublicId(source.getPublicId());
    143             locator.setSystemId(systemId);
    144             locator.setLineNumber(ex.getLineNumber());
    145             locator.setColumnNumber(ex.getColumnNumber());
    146 
    147             SAXParseException newEx = new SAXParseException(ex.getMessage(), locator);
    148 
    149             if (errorHandler != null) {
    150                 errorHandler.error(newEx);
    151             }
    152 
    153             throw newEx;
    154         } finally {
    155             IoUtils.closeQuietly(parser);
    156         }
    157 
    158         return document;
    159     }
    160 
    161     /**
    162      * Implements the whole parsing of the XML document. The XML pull parser is
    163      * actually more of a tokenizer, and we are doing a classical recursive
    164      * descent parsing (the method invokes itself for XML elements). Our
    165      * approach to parsing does accept some illegal documents (more than one
    166      * root element, for example). The assumption is that the DOM implementation
    167      * throws the proper exceptions in these cases.
    168      *
    169      * @param parser The XML pull parser we're reading from.
    170      * @param document The document we're building.
    171      * @param node The node we're currently on (initially the document itself).
    172      * @param endToken The token that will end this recursive call. Either
    173      *        XmlPullParser.END_DOCUMENT or XmlPullParser.END_TAG.
    174      *
    175      * @throws XmlPullParserException If a parsing error occurs.
    176      * @throws IOException If a general IO error occurs.
    177      */
    178     private void parse(KXmlParser parser, DocumentImpl document, Node node,
    179             int endToken) throws XmlPullParserException, IOException {
    180 
    181         int token = parser.getEventType();
    182 
    183         /*
    184          * The main parsing loop. The precondition is that we are already on the
    185          * token to be processed. This holds for each iteration of the loop, so
    186          * the inner statements have to ensure that (in particular the recursive
    187          * call).
    188          */
    189         while (token != endToken && token != XmlPullParser.END_DOCUMENT) {
    190             if (token == XmlPullParser.PROCESSING_INSTRUCTION) {
    191                 /*
    192                  * Found a processing instructions. We need to split the token
    193                  * text at the first whitespace character.
    194                  */
    195                 String text = parser.getText();
    196 
    197                 int dot = text.indexOf(' ');
    198 
    199                 String target = (dot != -1 ? text.substring(0, dot) : text);
    200                 String data = (dot != -1 ? text.substring(dot + 1) : "");
    201 
    202                 node.appendChild(document.createProcessingInstruction(target,
    203                         data));
    204             } else if (token == XmlPullParser.DOCDECL) {
    205                 String name = parser.getRootElementName();
    206                 String publicId = parser.getPublicId();
    207                 String systemId = parser.getSystemId();
    208                 document.appendChild(new DocumentTypeImpl(document, name, publicId, systemId));
    209 
    210             } else if (token == XmlPullParser.COMMENT) {
    211                 /*
    212                  * Found a comment. We simply take the token text, but we only
    213                  * create a node if the client wants to see comments at all.
    214                  */
    215                 if (!ignoreComments) {
    216                     node.appendChild(document.createComment(parser.getText()));
    217                 }
    218             } else if (token == XmlPullParser.IGNORABLE_WHITESPACE) {
    219                 /*
    220                  * Found some ignorable whitespace. We only add it if the client
    221                  * wants to see whitespace. Whitespace before and after the
    222                  * document element is always ignored.
    223                  */
    224                 if (!ignoreElementContentWhitespace && document != node) {
    225                     appendText(document, node, token, parser.getText());
    226                 }
    227             } else if (token == XmlPullParser.TEXT || token == XmlPullParser.CDSECT) {
    228                 /*
    229                  * Found a piece of text (possibly encoded as a CDATA section).
    230                  * That's the easiest case. We simply take it and create a new text node,
    231                  * or merge with an adjacent text node.
    232                  */
    233                 appendText(document, node, token, parser.getText());
    234             } else if (token == XmlPullParser.ENTITY_REF) {
    235                 /*
    236                  * Found an entity reference. If an entity resolver is
    237                  * installed, we replace it by text (if possible). Otherwise we
    238                  * add an entity reference node.
    239                  */
    240                 String entity = parser.getName();
    241 
    242                 if (entityResolver != null) {
    243                     // TODO Implement this...
    244                 }
    245 
    246                 String resolved = resolvePredefinedOrCharacterEntity(entity);
    247                 if (resolved != null) {
    248                     appendText(document, node, token, resolved);
    249                 } else {
    250                     node.appendChild(document.createEntityReference(entity));
    251                 }
    252             } else if (token == XmlPullParser.START_TAG) {
    253                 /*
    254                  * Found an element start tag. We create an element node with
    255                  * the proper info and attributes. We then invoke parse()
    256                  * recursively to handle the next level of nesting. When we
    257                  * return from this call, we check that we are on the proper
    258                  * element end tag. The whole handling differs somewhat
    259                  * depending on whether the parser is namespace-aware or not.
    260                  */
    261                 if (namespaceAware) {
    262                     // Collect info for element node
    263                     String namespace = parser.getNamespace();
    264                     String name = parser.getName();
    265                     String prefix = parser.getPrefix();
    266 
    267                     if ("".equals(namespace)) {
    268                         namespace = null;
    269                     }
    270 
    271                     // Create element node and wire it correctly
    272                     Element element = document.createElementNS(namespace, name);
    273                     element.setPrefix(prefix);
    274                     node.appendChild(element);
    275 
    276                     for (int i = 0; i < parser.getAttributeCount(); i++) {
    277                         // Collect info for a single attribute node
    278                         String attrNamespace = parser.getAttributeNamespace(i);
    279                         String attrPrefix = parser.getAttributePrefix(i);
    280                         String attrName = parser.getAttributeName(i);
    281                         String attrValue = parser.getAttributeValue(i);
    282 
    283                         if ("".equals(attrNamespace)) {
    284                             attrNamespace = null;
    285                         }
    286 
    287                         // Create attribute node and wire it correctly
    288                         Attr attr = document.createAttributeNS(attrNamespace, attrName);
    289                         attr.setPrefix(attrPrefix);
    290                         attr.setValue(attrValue);
    291                         element.setAttributeNodeNS(attr);
    292                     }
    293 
    294                     // Recursive descent
    295                     token = parser.nextToken();
    296                     parse(parser, document, element, XmlPullParser.END_TAG);
    297 
    298                     // Expect the element's end tag here
    299                     parser.require(XmlPullParser.END_TAG, namespace, name);
    300 
    301                 } else {
    302                     // Collect info for element node
    303                     String name = parser.getName();
    304 
    305                     // Create element node and wire it correctly
    306                     Element element = document.createElement(name);
    307                     node.appendChild(element);
    308 
    309                     for (int i = 0; i < parser.getAttributeCount(); i++) {
    310                         // Collect info for a single attribute node
    311                         String attrName = parser.getAttributeName(i);
    312                         String attrValue = parser.getAttributeValue(i);
    313 
    314                         // Create attribute node and wire it correctly
    315                         Attr attr = document.createAttribute(attrName);
    316                         attr.setValue(attrValue);
    317                         element.setAttributeNode(attr);
    318                     }
    319 
    320                     // Recursive descent
    321                     token = parser.nextToken();
    322                     parse(parser, document, element, XmlPullParser.END_TAG);
    323 
    324                     // Expect the element's end tag here
    325                     parser.require(XmlPullParser.END_TAG, "", name);
    326                 }
    327             }
    328 
    329             token = parser.nextToken();
    330         }
    331     }
    332 
    333     /**
    334      * @param token the XML pull parser token type, such as XmlPullParser.CDSECT
    335      *      or XmlPullParser.ENTITY_REF.
    336      */
    337     private void appendText(DocumentImpl document, Node parent, int token, String text) {
    338         // Ignore empty runs.
    339         if (text.isEmpty()) {
    340             return;
    341         }
    342         // Merge with any previous text node if possible.
    343         if (coalescing || token != XmlPullParser.CDSECT) {
    344             Node lastChild = parent.getLastChild();
    345             if (lastChild != null && lastChild.getNodeType() == Node.TEXT_NODE) {
    346                 Text textNode = (Text) lastChild;
    347                 textNode.appendData(text);
    348                 return;
    349             }
    350         }
    351         // Okay, we really do need a new text node
    352         parent.appendChild(token == XmlPullParser.CDSECT
    353                 ? new CDATASectionImpl(document, text)
    354                 : new TextImpl(document, text));
    355     }
    356 
    357     @Override
    358     public void setEntityResolver(EntityResolver resolver) {
    359         entityResolver = resolver;
    360     }
    361 
    362     @Override
    363     public void setErrorHandler(ErrorHandler handler) {
    364         errorHandler = handler;
    365     }
    366 
    367     /**
    368      * Controls whether this DocumentBuilder ignores comments.
    369      */
    370     public void setIgnoreComments(boolean value) {
    371         ignoreComments = value;
    372     }
    373 
    374     public void setCoalescing(boolean value) {
    375         coalescing = value;
    376     }
    377 
    378     /**
    379      * Controls whether this DocumentBuilder ignores element content whitespace.
    380      */
    381     public void setIgnoreElementContentWhitespace(boolean value) {
    382         ignoreElementContentWhitespace = value;
    383     }
    384 
    385     /**
    386      * Controls whether this DocumentBuilder is namespace-aware.
    387      */
    388     public void setNamespaceAware(boolean value) {
    389         namespaceAware = value;
    390     }
    391 
    392     /**
    393      * Returns the replacement text or null if {@code entity} isn't predefined.
    394      */
    395     private String resolvePredefinedOrCharacterEntity(String entityName) {
    396         // Character references, section 4.1 of the XML specification.
    397         if (entityName.startsWith("#x")) {
    398             return resolveCharacterReference(entityName.substring(2), 16);
    399         } else if (entityName.startsWith("#")) {
    400             return resolveCharacterReference(entityName.substring(1), 10);
    401         }
    402         // Predefined entities, section 4.6 of the XML specification.
    403         if ("lt".equals(entityName)) {
    404             return "<";
    405         } else if ("gt".equals(entityName)) {
    406             return ">";
    407         } else if ("amp".equals(entityName)) {
    408             return "&";
    409         } else if ("apos".equals(entityName)) {
    410             return "'";
    411         } else if ("quot".equals(entityName)) {
    412             return "\"";
    413         } else {
    414             return null;
    415         }
    416     }
    417 
    418     private String resolveCharacterReference(String value, int base) {
    419         try {
    420             int codePoint = Integer.parseInt(value, base);
    421             if (Character.isBmpCodePoint(codePoint)) {
    422                 return String.valueOf((char) codePoint);
    423             } else {
    424                 char[] surrogatePair = Character.toChars(codePoint);
    425                 return new String(surrogatePair);
    426             }
    427         } catch (NumberFormatException ex) {
    428             return null;
    429         }
    430     }
    431 }
    432