1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2004-2005, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: xmlparser.h 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2004jul21 14 * created by: Andy Heninger 15 * 16 * Tiny XML parser using ICU and intended for use in ICU tests and in build tools. 17 * Not suitable for production use. Not supported. 18 * Not conformant. Not efficient. 19 * But very small. 20 */ 21 22 #ifndef __XMLPARSER_H__ 23 #define __XMLPARSER_H__ 24 25 #include "unicode/uobject.h" 26 #include "unicode/unistr.h" 27 #include "unicode/regex.h" 28 #include "uvector.h" 29 #include "hash.h" 30 31 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION 32 33 enum UXMLNodeType { 34 /** Node type string (text contents), stored as a UnicodeString. */ 35 UXML_NODE_TYPE_STRING, 36 /** Node type element, stored as a UXMLElement. */ 37 UXML_NODE_TYPE_ELEMENT, 38 UXML_NODE_TYPE_COUNT 39 }; 40 41 U_NAMESPACE_BEGIN 42 43 class UXMLParser; 44 45 /** 46 * This class represents an element node in a parsed XML tree. 47 */ 48 class U_TOOLUTIL_API UXMLElement : public UObject { 49 public: 50 /** 51 * Destructor. 52 */ 53 virtual ~UXMLElement(); 54 55 /** 56 * Get the tag name of this element. 57 */ 58 const UnicodeString &getTagName() const; 59 /** 60 * Get the text contents of the element. 61 * Append the contents of all text child nodes. 62 * @param recurse If TRUE, also recursively appends the contents of all 63 * text child nodes of element children. 64 * @return The text contents. 65 */ 66 UnicodeString getText(UBool recurse) const; 67 /** 68 * Get the number of attributes. 69 */ 70 int32_t countAttributes() const; 71 /** 72 * Get the i-th attribute. 73 * @param i Index of the attribute. 74 * @param name Output parameter, receives the attribute name. 75 * @param value Output parameter, receives the attribute value. 76 * @return A pointer to the attribute value (may be &value or a pointer to an 77 * internal string object), or NULL if i is out of bounds. 78 */ 79 const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const; 80 /** 81 * Get the value of the attribute with the given name. 82 * @param name Attribute name to be looked up. 83 * @return A pointer to the attribute value, or NULL if this element 84 * does not have this attribute. 85 */ 86 const UnicodeString *getAttribute(const UnicodeString &name) const; 87 /** 88 * Get the number of child nodes. 89 */ 90 int32_t countChildren() const; 91 /** 92 * Get the i-th child node. 93 * @param i Index of the child node. 94 * @param type The child node type. 95 * @return A pointer to the child node object, or NULL if i is out of bounds. 96 */ 97 const UObject *getChild(int32_t i, UXMLNodeType &type) const; 98 /** 99 * Get the next child element node, skipping non-element child nodes. 100 * @param i Enumeration index; initialize to 0 before getting the first child element. 101 * @return A pointer to the next child element, or NULL if there is none. 102 */ 103 const UXMLElement *nextChildElement(int32_t &i) const; 104 /** 105 * Get the immediate child element with the given name. 106 * If there are multiple child elements with this name, then return 107 * the first one. 108 * @param name Element name to be looked up. 109 * @return A pointer to the element node, or NULL if this element 110 * does not have this immediate child element. 111 */ 112 const UXMLElement *getChildElement(const UnicodeString &name) const; 113 114 /** 115 * ICU "poor man's RTTI", returns a UClassID for the actual class. 116 */ 117 virtual UClassID getDynamicClassID() const; 118 119 /** 120 * ICU "poor man's RTTI", returns a UClassID for this class. 121 */ 122 static UClassID U_EXPORT2 getStaticClassID(); 123 124 private: 125 // prevent default construction etc. 126 UXMLElement(); 127 UXMLElement(const UXMLElement &other); 128 UXMLElement &operator=(const UXMLElement &other); 129 130 void appendText(UnicodeString &text, UBool recurse) const; 131 132 friend class UXMLParser; 133 134 UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode); 135 136 const UXMLParser *fParser; 137 const UnicodeString *fName; // The tag name of this element (owned by the UXMLParser) 138 UnicodeString fContent; // The text content of this node. All element content is 139 // concatenated even when there are intervening nested elements 140 // (which doesn't happen with most xml files we care about) 141 // Sections of content containing only white space are dropped, 142 // which gets rid the bogus white space content from 143 // elements which are primarily containers for nested elements. 144 UVector fAttNames; // A vector containing the names of this element's attributes 145 // The names are UnicodeString objects, owned by the UXMLParser. 146 UVector fAttValues; // A vector containing the attribute values for 147 // this element's attributes. The order is the same 148 // as that of the attribute name vector. 149 150 UVector fChildren; // The child nodes of this element (a Vector) 151 152 UXMLElement *fParent; // A pointer to the parent element of this element. 153 }; 154 155 /** 156 * A simple XML parser; it is neither efficient nor conformant and only useful for 157 * restricted types of XML documents. 158 * 159 * The parse methods parse whole documents and return the parse trees via their 160 * root elements. 161 */ 162 class U_TOOLUTIL_API UXMLParser : public UObject { 163 public: 164 /** 165 * Create an XML parser. 166 */ 167 static UXMLParser *createParser(UErrorCode &errorCode); 168 /** 169 * Destructor. 170 */ 171 virtual ~UXMLParser(); 172 173 /** 174 * Parse an XML document, create the entire document tree, and 175 * return a pointer to the root element of the parsed tree. 176 * The caller must delete the element. 177 */ 178 UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode); 179 /** 180 * Parse an XML file, create the entire document tree, and 181 * return a pointer to the root element of the parsed tree. 182 * The caller must delete the element. 183 */ 184 UXMLElement *parseFile(const char *filename, UErrorCode &errorCode); 185 186 /** 187 * ICU "poor man's RTTI", returns a UClassID for the actual class. 188 */ 189 virtual UClassID getDynamicClassID() const; 190 191 /** 192 * ICU "poor man's RTTI", returns a UClassID for this class. 193 */ 194 static UClassID U_EXPORT2 getStaticClassID(); 195 196 private: 197 // prevent default construction etc. 198 UXMLParser(); 199 UXMLParser(const UXMLParser &other); 200 UXMLParser &operator=(const UXMLParser &other); 201 202 // constructor 203 UXMLParser(UErrorCode &status); 204 205 void parseMisc(UErrorCode &status); 206 UXMLElement *createElement(RegexMatcher &mEl, UErrorCode &status); 207 void error(const char *message, UErrorCode &status); 208 UnicodeString scanContent(UErrorCode &status); 209 void replaceCharRefs(UnicodeString &s, UErrorCode &status); 210 211 const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode); 212 public: 213 // public for UXMLElement only 214 const UnicodeString *findName(const UnicodeString &s) const; 215 private: 216 217 // There is one ICU regex matcher for each of the major XML syntax items 218 // that are recognized. 219 RegexMatcher mXMLDecl; 220 RegexMatcher mXMLComment; 221 RegexMatcher mXMLSP; 222 RegexMatcher mXMLDoctype; 223 RegexMatcher mXMLPI; 224 RegexMatcher mXMLElemStart; 225 RegexMatcher mXMLElemEnd; 226 RegexMatcher mXMLElemEmpty; 227 RegexMatcher mXMLCharData; 228 RegexMatcher mAttrValue; 229 RegexMatcher mAttrNormalizer; 230 RegexMatcher mNewLineNormalizer; 231 RegexMatcher mAmps; 232 233 Hashtable fNames; // interned element/attribute name strings 234 UStack fElementStack; // Stack holds the parent elements when nested 235 // elements are being parsed. All items on this 236 // stack are of type UXMLElement. 237 int32_t fPos; // String index of the current scan position in 238 // xml source (in fSrc). 239 UnicodeString fOneLF; 240 }; 241 242 U_NAMESPACE_END 243 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 244 245 #endif 246