Home | History | Annotate | Download | only in toolutil
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2004-2005, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  xmlparser.h
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2004jul21
     16 *   created by: Andy Heninger
     17 *
     18 * Tiny XML parser using ICU and intended for use in ICU tests and in build tools.
     19 * Not suitable for production use. Not supported.
     20 * Not conformant. Not efficient.
     21 * But very small.
     22 */
     23 
     24 #ifndef __XMLPARSER_H__
     25 #define __XMLPARSER_H__
     26 
     27 #include "unicode/uobject.h"
     28 #include "unicode/unistr.h"
     29 #include "unicode/regex.h"
     30 #include "uvector.h"
     31 #include "hash.h"
     32 
     33 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
     34 
     35 enum UXMLNodeType {   // Type tag for a child node of a UXMLElement; see UXMLElement::getChild().
     36     /** Node type string (text contents), stored as a UnicodeString. */
     37     UXML_NODE_TYPE_STRING,
     38     /** Node type element, stored as a UXMLElement. */
     39     UXML_NODE_TYPE_ELEMENT,
     40     UXML_NODE_TYPE_COUNT   /**< Last enumerator; its value equals the number of node types listed above. */
     41 };
     42 
     43 U_NAMESPACE_BEGIN
     44 
     45 class UXMLParser;
     46 
     47 /**
     48  * This class represents an element node in a parsed XML tree; all constructors are private and UXMLParser is a friend.
     49  */
     50 class U_TOOLUTIL_API UXMLElement : public UObject {
     51 public:
     52     /**
     53      * Destructor.
     54      */
     55     virtual ~UXMLElement();
     56 
     57     /**
     58      * Get the tag name of this element.
     59      */
     60     const UnicodeString &getTagName() const;
     61     /**
     62      * Get the text contents of the element.
     63      * Append the contents of all text child nodes.
     64      * @param recurse If TRUE, also recursively appends the contents of all
     65      *        text child nodes of element children.
     66      * @return The text contents.
     67      */
     68     UnicodeString getText(UBool recurse) const;
     69     /**
     70      * Get the number of attributes.
     71      */
     72     int32_t countAttributes() const;
     73     /**
     74      * Get the i-th attribute.
     75      * @param i Index of the attribute.
     76      * @param name Output parameter, receives the attribute name.
     77      * @param value Output parameter, receives the attribute value.
     78      * @return A pointer to the attribute value (may be &value or a pointer to an
     79      *         internal string object), or NULL if i is out of bounds.
     80      */
     81     const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const;
     82     /**
     83      * Get the value of the attribute with the given name.
     84      * @param name Attribute name to be looked up.
     85      * @return A pointer to the attribute value, or NULL if this element
     86      * does not have this attribute.
     87      */
     88     const UnicodeString *getAttribute(const UnicodeString &name) const;
     89     /**
     90      * Get the number of child nodes.
     91      */
     92     int32_t countChildren() const;
     93     /**
     94      * Get the i-th child node.
     95      * @param i Index of the child node.
     96      * @param type The child node type (a UXMLNodeType passed by non-const reference; presumably an output parameter - TODO confirm).
     97      * @return A pointer to the child node object, or NULL if i is out of bounds.
     98      */
     99     const UObject *getChild(int32_t i, UXMLNodeType &type) const;
    100     /**
    101      * Get the next child element node, skipping non-element child nodes.
    102      * @param i Enumeration index; initialize to 0 before getting the first child element.
    103      * @return A pointer to the next child element, or NULL if there is none.
    104      */
    105     const UXMLElement *nextChildElement(int32_t &i) const;
    106     /**
    107      * Get the immediate child element with the given name.
    108      * If there are multiple child elements with this name, then return
    109      * the first one.
    110      * @param name Element name to be looked up.
    111      * @return A pointer to the element node, or NULL if this element
    112      * does not have this immediate child element.
    113      */
    114     const UXMLElement *getChildElement(const UnicodeString &name) const;
    115 
    116     /**
    117      * ICU "poor man's RTTI", returns a UClassID for the actual class.
    118      */
    119     virtual UClassID getDynamicClassID() const;
    120 
    121     /**
    122      * ICU "poor man's RTTI", returns a UClassID for this class.
    123      */
    124     static UClassID U_EXPORT2 getStaticClassID();
    125 
    126 private:
    127     // Declared private to prevent default construction, copy construction and assignment by other code.
    128     UXMLElement();
    129     UXMLElement(const UXMLElement &other);
    130     UXMLElement &operator=(const UXMLElement &other);
    131 
    132     void appendText(UnicodeString &text, UBool recurse) const;   // NOTE(review): presumably the worker behind getText() - confirm in the .cpp
    133 
    134     friend class UXMLParser;   // lets UXMLParser reach the private constructor and data members below
    135 
    136     UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode);
    137 
    138     const UXMLParser *fParser;           // NOTE(review): presumably the parser given to the constructor - confirm
    139     const UnicodeString *fName;          // The tag name of this element (owned by the UXMLParser)
    140     UnicodeString       fContent;        // The text content of this node.  All element content is
    141                                          //   concatenated even when there are intervening nested elements
    142                                          //   (which doesn't happen with most xml files we care about)
    143                                          //   Sections of content containing only white space are dropped,
    144                                          //   which gets rid of the bogus white space content from
    145                                          //   elements which are primarily containers for nested elements.
    146     UVector             fAttNames;       // A vector containing the names of this element's attributes
    147                                          //    The names are UnicodeString objects, owned by the UXMLParser.
    148     UVector             fAttValues;      // A vector containing the attribute values for
    149                                          //    this element's attributes.  The order is the same
    150                                          //    as that of the attribute name vector.
    151 
    152     UVector             fChildren;       // The child nodes of this element (a UVector)
    153 
    154     UXMLElement        *fParent;         // A pointer to the parent element of this element.
    155 };
    156 
    157 /**
    158  * A simple XML parser; it is neither efficient nor conformant and only useful for
    159  * restricted types of XML documents.
    160  *
    161  * The parse methods parse whole documents and return the parse trees via their
    162  * root elements.
    163  */
    164 class U_TOOLUTIL_API UXMLParser : public UObject {
    165 public:
    166     /**
    167      * Create an XML parser.
    168      */
    169     static UXMLParser *createParser(UErrorCode &errorCode);
    170     /**
    171      * Destructor.
    172      */
    173     virtual ~UXMLParser();
    174 
    175     /**
    176      * Parse an XML document, create the entire document tree, and
    177      * return a pointer to the root element of the parsed tree.
    178      * The caller must delete the element.
    179      */
    180     UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode);
    181     /**
    182      * Parse an XML file, create the entire document tree, and
    183      * return a pointer to the root element of the parsed tree.
    184      * The caller must delete the element.
    185      */
    186     UXMLElement *parseFile(const char *filename, UErrorCode &errorCode);
    187 
    188     /**
    189      * ICU "poor man's RTTI", returns a UClassID for the actual class.
    190      */
    191     virtual UClassID getDynamicClassID() const;
    192 
    193     /**
    194      * ICU "poor man's RTTI", returns a UClassID for this class.
    195      */
    196     static UClassID U_EXPORT2 getStaticClassID();
    197 
    198 private:
    199     // Declared private to prevent default construction, copy construction and assignment by other code.
    200     UXMLParser();
    201     UXMLParser(const UXMLParser &other);
    202     UXMLParser &operator=(const UXMLParser &other);
    203 
    204     // Constructor taking an error code; private, so createParser() is the public way to obtain an instance.
    205     UXMLParser(UErrorCode &status);
    206 
    207     void           parseMisc(UErrorCode &status);
    208     UXMLElement   *createElement(RegexMatcher &mEl, UErrorCode &status);
    209     void           error(const char *message, UErrorCode &status);
    210     UnicodeString  scanContent(UErrorCode &status);
    211     void           replaceCharRefs(UnicodeString &s, UErrorCode &status);
    212 
    213     const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode);   // NOTE(review): presumably adds s to fNames - confirm
    214 public:
    215     // Public only so that UXMLElement can call it; not intended for other callers.
    216     const UnicodeString *findName(const UnicodeString &s) const;
    217 private:
    218 
    219     // There is one ICU regex matcher for each of the major XML syntax items
    220     //  that are recognized.
    221     RegexMatcher mXMLDecl;
    222     RegexMatcher mXMLComment;
    223     RegexMatcher mXMLSP;
    224     RegexMatcher mXMLDoctype;
    225     RegexMatcher mXMLPI;
    226     RegexMatcher mXMLElemStart;
    227     RegexMatcher mXMLElemEnd;
    228     RegexMatcher mXMLElemEmpty;
    229     RegexMatcher mXMLCharData;
    230     RegexMatcher mAttrValue;
    231     RegexMatcher mAttrNormalizer;
    232     RegexMatcher mNewLineNormalizer;
    233     RegexMatcher mAmps;
    234 
    235     Hashtable             fNames;           // interned element/attribute name strings
    236     UStack                fElementStack;    // Stack holds the parent elements when nested
    237                                             //    elements are being parsed.  All items on this
    238                                             //    stack are of type UXMLElement.
    239     int32_t               fPos;             // String index of the current scan position in
    240                                             //    xml source (in fSrc).  NOTE(review): no fSrc member is declared in this class.
    241     UnicodeString         fOneLF;           // NOTE(review): presumably a string holding a single line feed - confirm in the .cpp
    242 };
    243 
    244 U_NAMESPACE_END
    245 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
    246 
    247 #endif
    248