Home | History | Annotate | Download | only in parsers
      1 /*
      2  * Copyright (C) 2007 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package org.apache.harmony.xml.parsers;
     18 
     19 import java.io.IOException;
     20 import java.net.URL;
     21 import java.net.URLConnection;
     22 import javax.xml.parsers.DocumentBuilder;
     23 import libcore.io.IoUtils;
     24 import org.apache.harmony.xml.dom.CDATASectionImpl;
     25 import org.apache.harmony.xml.dom.DOMImplementationImpl;
     26 import org.apache.harmony.xml.dom.DocumentImpl;
     27 import org.apache.harmony.xml.dom.DocumentTypeImpl;
     28 import org.apache.harmony.xml.dom.TextImpl;
     29 import org.kxml2.io.KXmlParser;
     30 import org.w3c.dom.Attr;
     31 import org.w3c.dom.DOMImplementation;
     32 import org.w3c.dom.Document;
     33 import org.w3c.dom.DocumentType;
     34 import org.w3c.dom.Element;
     35 import org.w3c.dom.Node;
     36 import org.w3c.dom.Text;
     37 import org.xml.sax.EntityResolver;
     38 import org.xml.sax.ErrorHandler;
     39 import org.xml.sax.InputSource;
     40 import org.xml.sax.SAXException;
     41 import org.xml.sax.SAXParseException;
     42 import org.xml.sax.helpers.LocatorImpl;
     43 import org.xmlpull.v1.XmlPullParser;
     44 import org.xmlpull.v1.XmlPullParserException;
     45 
     46 /**
     47  * Builds a DOM using KXmlParser.
     48  */
     49 class DocumentBuilderImpl extends DocumentBuilder {
     50 
     51     private static DOMImplementationImpl dom = DOMImplementationImpl.getInstance();
     52 
     53     private boolean coalescing;
     54     private EntityResolver entityResolver;
     55     private ErrorHandler errorHandler;
     56     private boolean ignoreComments;
     57     private boolean ignoreElementContentWhitespace;
     58     private boolean namespaceAware;
     59     // adding a new field? don't forget to update reset().
     60 
     61     @Override public void reset() {
     62         coalescing = false;
     63         entityResolver = null;
     64         errorHandler = null;
     65         ignoreComments = false;
     66         ignoreElementContentWhitespace = false;
     67         namespaceAware = false;
     68     }
     69 
     70     @Override
     71     public DOMImplementation getDOMImplementation() {
     72         return dom;
     73     }
     74 
     75     @Override
     76     public boolean isNamespaceAware() {
     77         return namespaceAware;
     78     }
     79 
     80     @Override
     81     public boolean isValidating() {
     82         return false;
     83     }
     84 
     85     @Override
     86     public Document newDocument() {
     87         return dom.createDocument(null, null, null);
     88     }
     89 
     90     @Override
     91     public Document parse(InputSource source) throws SAXException, IOException {
     92         if (source == null) {
     93             throw new IllegalArgumentException("source == null");
     94         }
     95 
     96         String namespaceURI = null;
     97         String qualifiedName = null;
     98         DocumentType doctype = null;
     99         String inputEncoding = source.getEncoding();
    100         String systemId = source.getSystemId();
    101         DocumentImpl document = new DocumentImpl(
    102                 dom, namespaceURI, qualifiedName, doctype, inputEncoding);
    103         document.setDocumentURI(systemId);
    104 
    105         KXmlParser parser = new KXmlParser();
    106         try {
    107             parser.keepNamespaceAttributes();
    108             parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, namespaceAware);
    109 
    110             if (source.getByteStream() != null) {
    111                 parser.setInput(source.getByteStream(), inputEncoding);
    112             } else if (source.getCharacterStream() != null) {
    113                 parser.setInput(source.getCharacterStream());
    114             } else if (systemId != null) {
    115                 URL url = new URL(systemId);
    116                 URLConnection urlConnection = url.openConnection();
    117                 urlConnection.connect();
    118                 // TODO: if null, extract the inputEncoding from the Content-Type header?
    119                 parser.setInput(urlConnection.getInputStream(), inputEncoding);
    120             } else {
    121                 throw new SAXParseException("InputSource needs a stream, reader or URI", null);
    122             }
    123 
    124             if (parser.nextToken() == XmlPullParser.END_DOCUMENT) {
    125                 throw new SAXParseException("Unexpected end of document", null);
    126             }
    127 
    128             parse(parser, document, document, XmlPullParser.END_DOCUMENT);
    129 
    130             parser.require(XmlPullParser.END_DOCUMENT, null, null);
    131         } catch (XmlPullParserException ex) {
    132             if (ex.getDetail() instanceof IOException) {
    133                 throw (IOException) ex.getDetail();
    134             }
    135             if (ex.getDetail() instanceof RuntimeException) {
    136                 throw (RuntimeException) ex.getDetail();
    137             }
    138 
    139             LocatorImpl locator = new LocatorImpl();
    140 
    141             locator.setPublicId(source.getPublicId());
    142             locator.setSystemId(systemId);
    143             locator.setLineNumber(ex.getLineNumber());
    144             locator.setColumnNumber(ex.getColumnNumber());
    145 
    146             SAXParseException newEx = new SAXParseException(ex.getMessage(), locator);
    147 
    148             if (errorHandler != null) {
    149                 errorHandler.error(newEx);
    150             }
    151 
    152             throw newEx;
    153         } finally {
    154             IoUtils.closeQuietly(parser);
    155         }
    156 
    157         return document;
    158     }
    159 
    160     /**
    161      * Implements the whole parsing of the XML document. The XML pull parser is
    162      * actually more of a tokenizer, and we are doing a classical recursive
    163      * descent parsing (the method invokes itself for XML elements). Our
    164      * approach to parsing does accept some illegal documents (more than one
    165      * root element, for example). The assumption is that the DOM implementation
    166      * throws the proper exceptions in these cases.
    167      *
    168      * @param parser The XML pull parser we're reading from.
    169      * @param document The document we're building.
    170      * @param node The node we're currently on (initially the document itself).
    171      * @param endToken The token that will end this recursive call. Either
    172      *        XmlPullParser.END_DOCUMENT or XmlPullParser.END_TAG.
    173      *
    174      * @throws XmlPullParserException If a parsing error occurs.
    175      * @throws IOException If a general IO error occurs.
    176      */
    177     private void parse(KXmlParser parser, DocumentImpl document, Node node,
    178             int endToken) throws XmlPullParserException, IOException {
    179 
    180         int token = parser.getEventType();
    181 
    182         /*
    183          * The main parsing loop. The precondition is that we are already on the
    184          * token to be processed. This holds for each iteration of the loop, so
    185          * the inner statements have to ensure that (in particular the recursive
    186          * call).
    187          */
    188         while (token != endToken && token != XmlPullParser.END_DOCUMENT) {
    189             if (token == XmlPullParser.PROCESSING_INSTRUCTION) {
    190                 /*
    191                  * Found a processing instructions. We need to split the token
    192                  * text at the first whitespace character.
    193                  */
    194                 String text = parser.getText();
    195 
    196                 int dot = text.indexOf(' ');
    197 
    198                 String target = (dot != -1 ? text.substring(0, dot) : text);
    199                 String data = (dot != -1 ? text.substring(dot + 1) : "");
    200 
    201                 node.appendChild(document.createProcessingInstruction(target,
    202                         data));
    203             } else if (token == XmlPullParser.DOCDECL) {
    204                 String name = parser.getRootElementName();
    205                 String publicId = parser.getPublicId();
    206                 String systemId = parser.getSystemId();
    207                 document.appendChild(new DocumentTypeImpl(document, name, publicId, systemId));
    208 
    209             } else if (token == XmlPullParser.COMMENT) {
    210                 /*
    211                  * Found a comment. We simply take the token text, but we only
    212                  * create a node if the client wants to see comments at all.
    213                  */
    214                 if (!ignoreComments) {
    215                     node.appendChild(document.createComment(parser.getText()));
    216                 }
    217             } else if (token == XmlPullParser.IGNORABLE_WHITESPACE) {
    218                 /*
    219                  * Found some ignorable whitespace. We only add it if the client
    220                  * wants to see whitespace. Whitespace before and after the
    221                  * document element is always ignored.
    222                  */
    223                 if (!ignoreElementContentWhitespace && document != node) {
    224                     appendText(document, node, token, parser.getText());
    225                 }
    226             } else if (token == XmlPullParser.TEXT || token == XmlPullParser.CDSECT) {
    227                 /*
    228                  * Found a piece of text (possibly encoded as a CDATA section).
    229                  * That's the easiest case. We simply take it and create a new text node,
    230                  * or merge with an adjacent text node.
    231                  */
    232                 appendText(document, node, token, parser.getText());
    233             } else if (token == XmlPullParser.ENTITY_REF) {
    234                 /*
    235                  * Found an entity reference. If an entity resolver is
    236                  * installed, we replace it by text (if possible). Otherwise we
    237                  * add an entity reference node.
    238                  */
    239                 String entity = parser.getName();
    240 
    241                 if (entityResolver != null) {
    242                     // TODO Implement this...
    243                 }
    244 
    245                 String resolved = resolvePredefinedOrCharacterEntity(entity);
    246                 if (resolved != null) {
    247                     appendText(document, node, token, resolved);
    248                 } else {
    249                     node.appendChild(document.createEntityReference(entity));
    250                 }
    251             } else if (token == XmlPullParser.START_TAG) {
    252                 /*
    253                  * Found an element start tag. We create an element node with
    254                  * the proper info and attributes. We then invoke parse()
    255                  * recursively to handle the next level of nesting. When we
    256                  * return from this call, we check that we are on the proper
    257                  * element end tag. The whole handling differs somewhat
    258                  * depending on whether the parser is namespace-aware or not.
    259                  */
    260                 if (namespaceAware) {
    261                     // Collect info for element node
    262                     String namespace = parser.getNamespace();
    263                     String name = parser.getName();
    264                     String prefix = parser.getPrefix();
    265 
    266                     if ("".equals(namespace)) {
    267                         namespace = null;
    268                     }
    269 
    270                     // Create element node and wire it correctly
    271                     Element element = document.createElementNS(namespace, name);
    272                     element.setPrefix(prefix);
    273                     node.appendChild(element);
    274 
    275                     for (int i = 0; i < parser.getAttributeCount(); i++) {
    276                         // Collect info for a single attribute node
    277                         String attrNamespace = parser.getAttributeNamespace(i);
    278                         String attrPrefix = parser.getAttributePrefix(i);
    279                         String attrName = parser.getAttributeName(i);
    280                         String attrValue = parser.getAttributeValue(i);
    281 
    282                         if ("".equals(attrNamespace)) {
    283                             attrNamespace = null;
    284                         }
    285 
    286                         // Create attribute node and wire it correctly
    287                         Attr attr = document.createAttributeNS(attrNamespace, attrName);
    288                         attr.setPrefix(attrPrefix);
    289                         attr.setValue(attrValue);
    290                         element.setAttributeNodeNS(attr);
    291                     }
    292 
    293                     // Recursive descent
    294                     token = parser.nextToken();
    295                     parse(parser, document, element, XmlPullParser.END_TAG);
    296 
    297                     // Expect the element's end tag here
    298                     parser.require(XmlPullParser.END_TAG, namespace, name);
    299 
    300                 } else {
    301                     // Collect info for element node
    302                     String name = parser.getName();
    303 
    304                     // Create element node and wire it correctly
    305                     Element element = document.createElement(name);
    306                     node.appendChild(element);
    307 
    308                     for (int i = 0; i < parser.getAttributeCount(); i++) {
    309                         // Collect info for a single attribute node
    310                         String attrName = parser.getAttributeName(i);
    311                         String attrValue = parser.getAttributeValue(i);
    312 
    313                         // Create attribute node and wire it correctly
    314                         Attr attr = document.createAttribute(attrName);
    315                         attr.setValue(attrValue);
    316                         element.setAttributeNode(attr);
    317                     }
    318 
    319                     // Recursive descent
    320                     token = parser.nextToken();
    321                     parse(parser, document, element, XmlPullParser.END_TAG);
    322 
    323                     // Expect the element's end tag here
    324                     parser.require(XmlPullParser.END_TAG, "", name);
    325                 }
    326             }
    327 
    328             token = parser.nextToken();
    329         }
    330     }
    331 
    332     /**
    333      * @param token the XML pull parser token type, such as XmlPullParser.CDSECT
    334      *      or XmlPullParser.ENTITY_REF.
    335      */
    336     private void appendText(DocumentImpl document, Node parent, int token, String text) {
    337         // Ignore empty runs.
    338         if (text.isEmpty()) {
    339             return;
    340         }
    341         // Merge with any previous text node if possible.
    342         if (coalescing || token != XmlPullParser.CDSECT) {
    343             Node lastChild = parent.getLastChild();
    344             if (lastChild != null && lastChild.getNodeType() == Node.TEXT_NODE) {
    345                 Text textNode = (Text) lastChild;
    346                 textNode.appendData(text);
    347                 return;
    348             }
    349         }
    350         // Okay, we really do need a new text node
    351         parent.appendChild(token == XmlPullParser.CDSECT
    352                 ? new CDATASectionImpl(document, text)
    353                 : new TextImpl(document, text));
    354     }
    355 
    356     @Override
    357     public void setEntityResolver(EntityResolver resolver) {
    358         entityResolver = resolver;
    359     }
    360 
    361     @Override
    362     public void setErrorHandler(ErrorHandler handler) {
    363         errorHandler = handler;
    364     }
    365 
    366     /**
    367      * Controls whether this DocumentBuilder ignores comments.
    368      */
    369     public void setIgnoreComments(boolean value) {
    370         ignoreComments = value;
    371     }
    372 
    373     public void setCoalescing(boolean value) {
    374         coalescing = value;
    375     }
    376 
    377     /**
    378      * Controls whether this DocumentBuilder ignores element content whitespace.
    379      */
    380     public void setIgnoreElementContentWhitespace(boolean value) {
    381         ignoreElementContentWhitespace = value;
    382     }
    383 
    384     /**
    385      * Controls whether this DocumentBuilder is namespace-aware.
    386      */
    387     public void setNamespaceAware(boolean value) {
    388         namespaceAware = value;
    389     }
    390 
    391     /**
    392      * Returns the replacement text or null if {@code entity} isn't predefined.
    393      */
    394     private String resolvePredefinedOrCharacterEntity(String entityName) {
    395         // Character references, section 4.1 of the XML specification.
    396         if (entityName.startsWith("#x")) {
    397             return resolveCharacterReference(entityName.substring(2), 16);
    398         } else if (entityName.startsWith("#")) {
    399             return resolveCharacterReference(entityName.substring(1), 10);
    400         }
    401         // Predefined entities, section 4.6 of the XML specification.
    402         if ("lt".equals(entityName)) {
    403             return "<";
    404         } else if ("gt".equals(entityName)) {
    405             return ">";
    406         } else if ("amp".equals(entityName)) {
    407             return "&";
    408         } else if ("apos".equals(entityName)) {
    409             return "'";
    410         } else if ("quot".equals(entityName)) {
    411             return "\"";
    412         } else {
    413             return null;
    414         }
    415     }
    416 
    417     private String resolveCharacterReference(String value, int base) {
    418         try {
    419             int ch = Integer.parseInt(value, base);
    420             if (ch < 0 || ch > Character.MAX_VALUE) {
    421                 return null;
    422             }
    423             return String.valueOf((char) ch);
    424         } catch (NumberFormatException ex) {
    425             return null;
    426         }
    427     }
    428 }
    429