Home | History | Annotate | Download | only in toolutil
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2004-2005, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  xmlparser.h
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2004jul21
     14 *   created by: Andy Heninger
     15 *
     16 * Tiny XML parser using ICU and intended for use in ICU tests and in build tools.
     17 * Not suitable for production use. Not supported.
     18 * Not conformant. Not efficient.
     19 * But very small.
     20 */
     21 
     22 #ifndef __XMLPARSER_H__
     23 #define __XMLPARSER_H__
     24 
     25 #include "unicode/uobject.h"
     26 #include "unicode/unistr.h"
     27 #include "unicode/regex.h"
     28 #include "uvector.h"
     29 #include "hash.h"
     30 
     31 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
     32 
     33 enum UXMLNodeType {  // Node kinds reported through the `type` parameter of UXMLElement::getChild().
     34     /** Node type string (text contents), stored as a UnicodeString. */
     35     UXML_NODE_TYPE_STRING,
     36     /** Node type element, stored as a UXMLElement. */
     37     UXML_NODE_TYPE_ELEMENT,
            /**
             * Number of node types declared above (it is the last enumerator).
             * NOTE(review): presumably a sentinel only, not a valid node type - confirm.
             */
     38     UXML_NODE_TYPE_COUNT
     39 };
     40 
     41 U_NAMESPACE_BEGIN
     42 
     43 class UXMLParser;
     44 
     45 /**
     46  * This class represents an element node in a parsed XML tree.
         *
         * All constructors are private and UXMLParser is declared a friend, so
         * element objects can only be created by UXMLParser. The copy constructor
         * and the assignment operator are private as well, so client code cannot
         * copy elements.
     47  */
     48 class U_TOOLUTIL_API UXMLElement : public UObject {
     49 public:
     50     /**
     51      * Destructor.
     52      */
     53     virtual ~UXMLElement();
     54 
     55     /**
     56      * Get the tag name of this element.
             * @return A reference to the tag name string.
     57      */
     58     const UnicodeString &getTagName() const;
     59     /**
     60      * Get the text contents of the element.
     61      * Append the contents of all text child nodes.
     62      * @param recurse If TRUE, also recursively appends the contents of all
     63      *        text child nodes of element children.
     64      * @return The text contents.
     65      */
     66     UnicodeString getText(UBool recurse) const;
     67     /**
     68      * Get the number of attributes.
     69      */
     70     int32_t countAttributes() const;
     71     /**
     72      * Get the i-th attribute.
     73      * @param i Index of the attribute.
     74      * @param name Output parameter, receives the attribute name.
     75      * @param value Output parameter, receives the attribute value.
     76      * @return A pointer to the attribute value (may be &value or a pointer to an
     77      *         internal string object), or NULL if i is out of bounds.
     78      */
     79     const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const;
     80     /**
     81      * Get the value of the attribute with the given name.
     82      * @param name Attribute name to be looked up.
     83      * @return A pointer to the attribute value, or NULL if this element
     84      * does not have this attribute.
     85      */
     86     const UnicodeString *getAttribute(const UnicodeString &name) const;
     87     /**
     88      * Get the number of child nodes.
     89      */
     90     int32_t countChildren() const;
     91     /**
     92      * Get the i-th child node.
     93      * @param i Index of the child node.
     94      * @param type Output parameter (passed by non-const reference), receives
             *        the child node type. The UXMLNodeType documentation says how
             *        each type is stored (UnicodeString or UXMLElement).
     95      * @return A pointer to the child node object, or NULL if i is out of bounds.
     96      */
     97     const UObject *getChild(int32_t i, UXMLNodeType &type) const;
     98     /**
     99      * Get the next child element node, skipping non-element child nodes.
    100      * @param i Enumeration index; initialize to 0 before getting the first child element.
             *        It is passed by non-const reference so that the enumeration
             *        position can be carried from one call to the next.
    101      * @return A pointer to the next child element, or NULL if there is none.
    102      */
    103     const UXMLElement *nextChildElement(int32_t &i) const;
    104     /**
    105      * Get the immediate child element with the given name.
    106      * If there are multiple child elements with this name, then return
    107      * the first one.
    108      * @param name Element name to be looked up.
    109      * @return A pointer to the element node, or NULL if this element
    110      * does not have this immediate child element.
    111      */
    112     const UXMLElement *getChildElement(const UnicodeString &name) const;
    113 
    114     /**
    115      * ICU "poor man's RTTI", returns a UClassID for the actual class.
    116      */
    117     virtual UClassID getDynamicClassID() const;
    118 
    119     /**
    120      * ICU "poor man's RTTI", returns a UClassID for this class.
    121      */
    122     static UClassID U_EXPORT2 getStaticClassID();
    123 
    124 private:
    125     // prevent default construction etc.
            //   (default constructor, copy constructor and assignment operator are
            //    declared private so that client code cannot use them)
    126     UXMLElement();
    127     UXMLElement(const UXMLElement &other);
    128     UXMLElement &operator=(const UXMLElement &other);
    129 
            // Private helper taking the same recurse flag as getText().
            // NOTE(review): presumably appends this element's text contents to
            //   `text` on behalf of getText() - confirm in the implementation.
    130     void appendText(UnicodeString &text, UBool recurse) const;
    131 
            // UXMLParser needs access to the private constructor and members below.
    132     friend class UXMLParser;
    133 
            // The constructor that is reachable from UXMLParser (a friend).
            // NOTE(review): presumably parser and name initialize fParser and fName - confirm.
    134     UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode);
    135 
    136     const UXMLParser *fParser;          // NOTE(review): presumably the parser passed to the constructor - confirm
    137     const UnicodeString *fName;          // The tag name of this element (owned by the UXMLParser)
    138     UnicodeString       fContent;        // The text content of this node.  All element content is
    139                                          //   concatenated even when there are intervening nested elements
    140                                          //   (which doesn't happen with most xml files we care about)
    141                                          //   Sections of content containing only white space are dropped,
    142                                          //   which gets rid of the bogus white space content from
    143                                          //   elements which are primarily containers for nested elements.
    144     UVector             fAttNames;       // A vector containing the names of this element's attributes
    145                                          //    The names are UnicodeString objects, owned by the UXMLParser.
    146     UVector             fAttValues;      // A vector containing the attribute values for
    147                                          //    this element's attributes.  The order is the same
    148                                          //    as that of the attribute name vector.
    149 
    150     UVector             fChildren;       // The child nodes of this element (a Vector)
                                                 //    NOTE(review): presumably holds the UnicodeString / UXMLElement
                                                 //    objects that getChild() returns - confirm.
    151 
    152     UXMLElement        *fParent;         // A pointer to the parent element of this element.
    153 };
    154 
    155 /**
    156  * A simple XML parser; it is neither efficient nor conformant and only useful for
    157  * restricted types of XML documents.
    158  *
    159  * The parse methods parse whole documents and return the parse trees via their
    160  * root elements.
         *
         * All constructors are private; the static createParser() function is the
         * only public way declared here to obtain a parser object.
    161  */
    162 class U_TOOLUTIL_API UXMLParser : public UObject {
    163 public:
    164     /**
    165      * Create an XML parser.
             * @param errorCode Reference to a UErrorCode.
             *        NOTE(review): presumably set to a failure code if the parser
             *        cannot be created - confirm in the implementation.
             * @return A pointer to the parser.
             *        NOTE(review): presumably a new object owned by the caller, and
             *        NULL on failure - confirm in the implementation.
    166      */
    167     static UXMLParser *createParser(UErrorCode &errorCode);
    168     /**
    169      * Destructor.
    170      */
    171     virtual ~UXMLParser();
    172 
    173     /**
    174      * Parse an XML document, create the entire document tree, and
    175      * return a pointer to the root element of the parsed tree.
    176      * The caller must delete the element.
             * @param src The XML document text to parse.
             * @param errorCode Reference to a UErrorCode.
             *        NOTE(review): presumably reports parse failures - confirm.
             * @return A pointer to the root element; the caller owns it.
    177      */
    178     UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode);
    179     /**
    180      * Parse an XML file, create the entire document tree, and
    181      * return a pointer to the root element of the parsed tree.
    182      * The caller must delete the element.
             * @param filename Name of the XML file to parse.
             * @param errorCode Reference to a UErrorCode.
             *        NOTE(review): presumably reports file and parse failures - confirm.
             * @return A pointer to the root element; the caller owns it.
    183      */
    184     UXMLElement *parseFile(const char *filename, UErrorCode &errorCode);
    185 
    186     /**
    187      * ICU "poor man's RTTI", returns a UClassID for the actual class.
    188      */
    189     virtual UClassID getDynamicClassID() const;
    190 
    191     /**
    192      * ICU "poor man's RTTI", returns a UClassID for this class.
    193      */
    194     static UClassID U_EXPORT2 getStaticClassID();
    195 
    196 private:
    197     // prevent default construction etc.
            //   (default constructor, copy constructor and assignment operator are
            //    declared private so that client code cannot use them)
    198     UXMLParser();
    199     UXMLParser(const UXMLParser &other);
    200     UXMLParser &operator=(const UXMLParser &other);
    201 
    202     // constructor
            //   (private; NOTE(review): presumably the one createParser() uses - confirm)
    203     UXMLParser(UErrorCode &status);
    204 
            // Internal parsing helpers. Their definitions are not part of this
            // header; see the implementation file for their exact behavior.
    205     void           parseMisc(UErrorCode &status);
    206     UXMLElement   *createElement(RegexMatcher &mEl, UErrorCode &status);
    207     void           error(const char *message, UErrorCode &status);
    208     UnicodeString  scanContent(UErrorCode &status);
    209     void           replaceCharRefs(UnicodeString &s, UErrorCode &status);
    210 
            // NOTE(review): presumably returns the interned copy of s that is kept
            //   in fNames, adding it if necessary - confirm in the implementation.
    211     const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode);
    212 public:
    213     // public for UXMLElement only
            // NOTE(review): presumably looks s up among the interned names in fNames
            //   and returns NULL if it is not there - confirm in the implementation.
    214     const UnicodeString *findName(const UnicodeString &s) const;
    215 private:
    216 
    217     // There is one ICU regex matcher for each of the major XML syntax items
    218     //  that are recognized.
    219     RegexMatcher mXMLDecl;
    220     RegexMatcher mXMLComment;
    221     RegexMatcher mXMLSP;
    222     RegexMatcher mXMLDoctype;
    223     RegexMatcher mXMLPI;
    224     RegexMatcher mXMLElemStart;
    225     RegexMatcher mXMLElemEnd;
    226     RegexMatcher mXMLElemEmpty;
    227     RegexMatcher mXMLCharData;
    228     RegexMatcher mAttrValue;
    229     RegexMatcher mAttrNormalizer;
    230     RegexMatcher mNewLineNormalizer;
    231     RegexMatcher mAmps;
    232 
    233     Hashtable             fNames;           // interned element/attribute name strings
    234     UStack                fElementStack;    // Stack holds the parent elements when nested
    235                                             //    elements are being parsed.  All items on this
    236                                             //    stack are of type UXMLElement.
    237     int32_t               fPos;             // String index of the current scan position in
    238                                             //    xml source (in fSrc).
                                                    //    NOTE(review): this class declares no fSrc
                                                    //    member; the reference may be stale.
    239     UnicodeString         fOneLF;           // NOTE(review): presumably a string holding a single
                                                    //    line feed - confirm in the implementation.
    240 };
    241 
    242 U_NAMESPACE_END
    243 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
    244 
    245 #endif
    246