Home | History | Annotate | Download | only in toolutil
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2004-2008, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  xmlparser.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2004jul21
     14 *   created by: Andy Heninger
     15 */
     16 
     17 #include <stdio.h>
     18 #include "unicode/uchar.h"
     19 #include "unicode/ucnv.h"
     20 #include "unicode/regex.h"
     21 #include "filestrm.h"
     22 #include "xmlparser.h"
     23 
     24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
     25 
// character constants
//   UTF-16 code unit values used by this file: the characters produced for the
//   predefined entities in replaceCharRefs(), and 'l' (x_l), which parseFile()
//   searches for to step past the leading "<?xml".
enum {
    x_QUOT=0x22,
    x_AMP=0x26,
    x_APOS=0x27,
    x_LT=0x3c,
    x_GT=0x3e,
    x_l=0x6c
};

// Regular expression fragment: a character class matching one XML white space
//   character (space, tab, CR, LF).  Callers append their own quantifier.
#define  XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML #4
//   Character class for the first character of an XML name.
#define  XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
                    "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
                    "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
                    "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"

//  XML #5
//   Character class for the second and later characters of an XML name:
//   everything in XML_NAMESTARTCHAR plus '-', '.', digits and a few others.
#define  XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"

//  XML #6
//   A complete XML name: one start character followed by zero or more name
//   characters.  Contains no capture group of its own.
#define  XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
     49 
     50 U_NAMESPACE_BEGIN
     51 
// ICU "poor man's RTTI" boilerplate for the two classes implemented in this
// file.  Presumably this is what supplies UXMLElement::getStaticClassID(),
// which the child-node accessors below compare against getDynamicClassID().
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
     54 
     55 //
     56 //   UXMLParser constructor.   Mostly just initializes the ICU regexes that are
     57 //                             used for parsing.
     58 //
     59 UXMLParser::UXMLParser(UErrorCode &status) :
     60       //  XML Declaration.  XML Production #23.
     61       //      example:  "<?xml version=1.0 encoding="utf-16" ?>
     62       //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
     63       //            allow for a possible leading BOM.
     64       mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
     65 
     66       //  XML Comment   production #15
     67       //     example:  "<!-- whatever -->
     68       //       note, does not detect an illegal "--" within comments
     69       mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),
     70 
     71       //  XML Spaces
     72       //      production [3]
     73       mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
     74 
     75       //  XML Doctype decl  production #28
     76       //     example   "<!DOCTYPE foo SYSTEM "somewhere" >
     77       //       or      "<!DOCTYPE foo [internal dtd]>
     78       //    TODO:  we don't actually parse the DOCTYPE or internal subsets.
     79       //           Some internal dtd subsets could confuse this simple-minded
     80       //           attempt at skipping over them, specifically, occcurences
     81       //           of closeing square brackets.  These could appear in comments,
     82       //           or in parameter entity declarations, for example.
     83       mXMLDoctype(UnicodeString(
     84            "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
     85            ), 0, status),
     86 
     87       //  XML PI     production #16
     88       //     example   "<?target stuff?>
     89       mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
     90 
     91       //  XML Element Start   Productions #40, #41
     92       //          example   <foo att1='abc'  att2="d e f" >
     93       //      capture #1:  the tag name
     94       //
     95       mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
     96           "(?:"
     97                 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
     98                 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
     99           ")*"                                                             //   * for zero or more attributes.
    100           XML_SPACES "*?>", -1, US_INV), 0, status),                               // match " >"
    101 
    102       //  XML Element End     production #42
    103       //     example   </foo>
    104       mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
    105 
    106       // XML Element Empty    production #44
    107       //     example   <foo att1="abc"   att2="d e f" />
    108       mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
    109           "(?:"
    110                 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
    111                 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
    112           ")*"                                                             //   * for zero or more attributes.
    113           XML_SPACES "*?/>", -1, US_INV), 0, status),                              // match " />"
    114 
    115 
    116       // XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
    117       mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),
    118 
    119       // Attribute name = "value".  XML Productions 10, 40/41
    120       //  Capture group 1 is name,
    121       //                2 is the attribute value, including the quotes.
    122       //
    123       //   Note that attributes are scanned twice.  The first time is with
    124       //        the regex for an entire element start.  There, the attributes
    125       //        are checked syntactically, but not separted out one by one.
    126       //        Here, we match a single attribute, and make its name and
    127       //        attribute value available to the parser code.
    128       mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
    129          "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),
    130 
    131 
    132       mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),
    133 
    134       // Match any of the new-line sequences in content.
    135       //   All are changed to \u000a.
    136       mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),
    137 
    138       // & char references
    139       //   We will figure out what we've got based on which capture group has content.
    140       //   The last one is a catchall for unrecognized entity references..
    141       //             1     2     3      4      5           6                    7          8
    142       mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
    143                 0, status),
    144 
    145       fNames(status),
    146       fElementStack(status),
    147       fOneLF((UChar)0x0a)        // Plain new-line string, used in new line normalization.
    148       {
    149       }
    150 
    151 UXMLParser *
    152 UXMLParser::createParser(UErrorCode &errorCode) {
    153     if (U_FAILURE(errorCode)) {
    154         return NULL;
    155     } else {
    156         return new UXMLParser(errorCode);
    157     }
    158 }
    159 
    160 UXMLParser::~UXMLParser() {}
    161 
    162 UXMLElement *
    163 UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
    164     char bytes[4096], charsetBuffer[100];
    165     FileStream *f;
    166     const char *charset, *pb;
    167     UnicodeString src;
    168     UConverter *cnv;
    169     UChar *buffer, *pu;
    170     int32_t fileLength, bytesLength, length, capacity;
    171     UBool flush;
    172 
    173     if(U_FAILURE(errorCode)) {
    174         return NULL;
    175     }
    176 
    177     f=T_FileStream_open(filename, "rb");
    178     if(f==NULL) {
    179         errorCode=U_FILE_ACCESS_ERROR;
    180         return NULL;
    181     }
    182 
    183     bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
    184     if(bytesLength<(int32_t)sizeof(bytes)) {
    185         // we have already read the entire file
    186         fileLength=bytesLength;
    187     } else {
    188         // get the file length
    189         fileLength=T_FileStream_size(f);
    190     }
    191 
    192     /*
    193      * get the charset:
    194      * 1. Unicode signature
    195      * 2. treat as ISO-8859-1 and read XML encoding="charser"
    196      * 3. default to UTF-8
    197      */
    198     charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
    199     if(U_SUCCESS(errorCode) && charset!=NULL) {
    200         // open converter according to Unicode signature
    201         cnv=ucnv_open(charset, &errorCode);
    202     } else {
    203         // read as Latin-1 and parse the XML declaration and encoding
    204         cnv=ucnv_open("ISO-8859-1", &errorCode);
    205         if(U_FAILURE(errorCode)) {
    206             // unexpected error opening Latin-1 converter
    207             goto exit;
    208         }
    209 
    210         buffer=src.getBuffer(bytesLength);
    211         if(buffer==NULL) {
    212             // unexpected failure to reserve some string capacity
    213             errorCode=U_MEMORY_ALLOCATION_ERROR;
    214             goto exit;
    215         }
    216         pb=bytes;
    217         pu=buffer;
    218         ucnv_toUnicode(
    219             cnv,
    220             &pu, buffer+src.getCapacity(),
    221             &pb, bytes+bytesLength,
    222             NULL, TRUE, &errorCode);
    223         src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
    224         ucnv_close(cnv);
    225         cnv=NULL;
    226         if(U_FAILURE(errorCode)) {
    227             // unexpected error in conversion from Latin-1
    228             src.remove();
    229             goto exit;
    230         }
    231 
    232         // parse XML declaration
    233         if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
    234             int32_t declEnd=mXMLDecl.end(errorCode);
    235             // go beyond <?xml
    236             int32_t pos=src.indexOf((UChar)x_l)+1;
    237 
    238             mAttrValue.reset(src);
    239             while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.
    240                 UnicodeString attName  = mAttrValue.group(1, errorCode);
    241                 UnicodeString attValue = mAttrValue.group(2, errorCode);
    242 
    243                 // Trim the quotes from the att value.  These are left over from the original regex
    244                 //   that parsed the attribue, which couldn't conveniently strip them.
    245                 attValue.remove(0,1);                    // one char from the beginning
    246                 attValue.truncate(attValue.length()-1);  // and one from the end.
    247 
    248                 if(attName==UNICODE_STRING("encoding", 8)) {
    249                     length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
    250                     charset=charsetBuffer;
    251                     break;
    252                 }
    253                 pos = mAttrValue.end(2, errorCode);
    254             }
    255 
    256             if(charset==NULL) {
    257                 // default to UTF-8
    258                 charset="UTF-8";
    259             }
    260             cnv=ucnv_open(charset, &errorCode);
    261         }
    262     }
    263 
    264     if(U_FAILURE(errorCode)) {
    265         // unable to open the converter
    266         goto exit;
    267     }
    268 
    269     // convert the file contents
    270     capacity=fileLength;        // estimated capacity
    271     src.getBuffer(capacity);
    272     src.releaseBuffer(0);       // zero length
    273     flush=FALSE;
    274     for(;;) {
    275         // convert contents of bytes[bytesLength]
    276         pb=bytes;
    277         for(;;) {
    278             length=src.length();
    279             buffer=src.getBuffer(capacity);
    280             if(buffer==NULL) {
    281                 // unexpected failure to reserve some string capacity
    282                 errorCode=U_MEMORY_ALLOCATION_ERROR;
    283                 goto exit;
    284             }
    285 
    286             pu=buffer+length;
    287             ucnv_toUnicode(
    288                 cnv, &pu, buffer+src.getCapacity(),
    289                 &pb, bytes+bytesLength,
    290                 NULL, FALSE, &errorCode);
    291             src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
    292             if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
    293                 errorCode=U_ZERO_ERROR;
    294                 capacity=(3*src.getCapacity())/2; // increase capacity by 50%
    295             } else {
    296                 break;
    297             }
    298         }
    299 
    300         if(U_FAILURE(errorCode)) {
    301             break; // conversion error
    302         }
    303 
    304         if(flush) {
    305             break; // completely converted the file
    306         }
    307 
    308         // read next block
    309         bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
    310         if(bytesLength==0) {
    311             // reached end of file, convert once more to flush the converter
    312             flush=TRUE;
    313         }
    314     };
    315 
    316 exit:
    317     ucnv_close(cnv);
    318     T_FileStream_close(f);
    319 
    320     if(U_SUCCESS(errorCode)) {
    321         return parse(src, errorCode);
    322     } else {
    323         return NULL;
    324     }
    325 }
    326 
    327 UXMLElement *
    328 UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
    329     if(U_FAILURE(status)) {
    330         return NULL;
    331     }
    332 
    333     UXMLElement   *root = NULL;
    334     fPos = 0; // TODO use just a local pos variable and pass it into functions
    335               // where necessary?
    336 
    337     // set all matchers to work on the input string
    338     mXMLDecl.reset(src);
    339     mXMLComment.reset(src);
    340     mXMLSP.reset(src);
    341     mXMLDoctype.reset(src);
    342     mXMLPI.reset(src);
    343     mXMLElemStart.reset(src);
    344     mXMLElemEnd.reset(src);
    345     mXMLElemEmpty.reset(src);
    346     mXMLCharData.reset(src);
    347     mAttrValue.reset(src);
    348     mAttrNormalizer.reset(src);
    349     mNewLineNormalizer.reset(src);
    350     mAmps.reset(src);
    351 
    352     // Consume the XML Declaration, if present.
    353     if (mXMLDecl.lookingAt(fPos, status)) {
    354         fPos = mXMLDecl.end(status);
    355     }
    356 
    357     // Consume "misc" [XML production 27] appearing before DocType
    358     parseMisc(status);
    359 
    360     // Consume a DocType declaration, if present.
    361     if (mXMLDoctype.lookingAt(fPos, status)) {
    362         fPos = mXMLDoctype.end(status);
    363     }
    364 
    365     // Consume additional "misc" [XML production 27] appearing after the DocType
    366     parseMisc(status);
    367 
    368     // Get the root element
    369     if (mXMLElemEmpty.lookingAt(fPos, status)) {
    370         // Root is an empty element (no nested elements or content)
    371         root = createElement(mXMLElemEmpty, status);
    372         fPos = mXMLElemEmpty.end(status);
    373     } else {
    374         if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
    375             error("Root Element expected", status);
    376             goto errorExit;
    377         }
    378         root = createElement(mXMLElemStart, status);
    379         UXMLElement  *el = root;
    380 
    381         //
    382         // This is the loop that consumes the root element of the document,
    383         //      including all nested content.   Nested elements are handled by
    384         //      explicit pushes/pops of the element stack; there is no recursion
    385         //      in the control flow of this code.
    386         //      "el" always refers to the current element, the one to which content
    387         //      is being added.  It is above the top of the element stack.
    388         for (;;) {
    389             // Nested Element Start
    390             if (mXMLElemStart.lookingAt(fPos, status)) {
    391                 UXMLElement *t = createElement(mXMLElemStart, status);
    392                 el->fChildren.addElement(t, status);
    393                 t->fParent = el;
    394                 fElementStack.push(el, status);
    395                 el = t;
    396                 continue;
    397             }
    398 
    399             // Text Content.  String is concatenated onto the current node's content,
    400             //                but only if it contains something other than spaces.
    401             UnicodeString s = scanContent(status);
    402             if (s.length() > 0) {
    403                 mXMLSP.reset(s);
    404                 if (mXMLSP.matches(status) == FALSE) {
    405                     // This chunk of text contains something other than just
    406                     //  white space. Make a child node for it.
    407                     replaceCharRefs(s, status);
    408                     el->fChildren.addElement(s.clone(), status);
    409                 }
    410                 mXMLSP.reset(src);    // The matchers need to stay set to the main input string.
    411                 continue;
    412             }
    413 
    414             // Comments.  Discard.
    415             if (mXMLComment.lookingAt(fPos, status)) {
    416                 fPos = mXMLComment.end(status);
    417                 continue;
    418             }
    419 
    420             // PIs.  Discard.
    421             if (mXMLPI.lookingAt(fPos, status)) {
    422                 fPos = mXMLPI.end(status);
    423                 continue;
    424             }
    425 
    426             // Element End
    427             if (mXMLElemEnd.lookingAt(fPos, status)) {
    428                 fPos = mXMLElemEnd.end(0, status);
    429                 const UnicodeString name = mXMLElemEnd.group(1, status);
    430                 if (name != *el->fName) {
    431                     error("Element start / end tag mismatch", status);
    432                     goto errorExit;
    433                 }
    434                 if (fElementStack.empty()) {
    435                     // Close of the root element.  We're done with the doc.
    436                     el = NULL;
    437                     break;
    438                 }
    439                 el = (UXMLElement *)fElementStack.pop();
    440                 continue;
    441             }
    442 
    443             // Empty Element.  Stored as a child of the current element, but not stacked.
    444             if (mXMLElemEmpty.lookingAt(fPos, status)) {
    445                 UXMLElement *t = createElement(mXMLElemEmpty, status);
    446                 el->fChildren.addElement(t, status);
    447                 continue;
    448             }
    449 
    450             // Hit something within the document that doesn't match anything.
    451             //   It's an error.
    452             error("Unrecognized markup", status);
    453             break;
    454         }
    455 
    456         if (el != NULL || !fElementStack.empty()) {
    457             // We bailed out early, for some reason.
    458             error("Root element not closed.", status);
    459             goto errorExit;
    460         }
    461     }
    462 
    463     // Root Element parse is complete.
    464     // Consume the annoying xml "Misc" that can appear at the end of the doc.
    465     parseMisc(status);
    466 
    467     // We should have reached the end of the input
    468     if (fPos != src.length()) {
    469         error("Extra content at the end of the document", status);
    470         goto errorExit;
    471     }
    472 
    473     // Success!
    474     return root;
    475 
    476 errorExit:
    477     delete root;
    478     return NULL;
    479 }
    480 
    481 //
    482 //  createElement
    483 //      We've just matched an element start tag.  Create and fill in a UXMLElement object
    484 //      for it.
    485 //
    486 UXMLElement *
    487 UXMLParser::createElement(RegexMatcher  &mEl, UErrorCode &status) {
    488     // First capture group is the element's name.
    489     UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);
    490 
    491     // Scan for attributes.
    492     int32_t   pos = mEl.end(1, status);  // The position after the end of the tag name
    493 
    494     while (mAttrValue.lookingAt(pos, status)) {  // loop runs once per attribute on this element.
    495         UnicodeString attName  = mAttrValue.group(1, status);
    496         UnicodeString attValue = mAttrValue.group(2, status);
    497 
    498         // Trim the quotes from the att value.  These are left over from the original regex
    499         //   that parsed the attribue, which couldn't conveniently strip them.
    500         attValue.remove(0,1);                    // one char from the beginning
    501         attValue.truncate(attValue.length()-1);  // and one from the end.
    502 
    503         // XML Attribue value normalization.
    504         // This is one of the really screwy parts of the XML spec.
    505         // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
    506         // Note that non-validating parsers must treat all entities as type CDATA
    507         //   which simplifies things some.
    508 
    509         // Att normalization step 1:  normalize any newlines in the attribute value
    510         mNewLineNormalizer.reset(attValue);
    511         attValue = mNewLineNormalizer.replaceAll(fOneLF, status);
    512 
    513         // Next change all xml white space chars to plain \u0020 spaces.
    514         mAttrNormalizer.reset(attValue);
    515         UnicodeString oneSpace((UChar)0x0020);
    516         attValue = mAttrNormalizer.replaceAll(oneSpace, status);
    517 
    518         // Replace character entities.
    519         replaceCharRefs(attValue, status);
    520 
    521         // Save the attribute name and value in our document structure.
    522         el->fAttNames.addElement((void *)intern(attName, status), status);
    523         el->fAttValues.addElement(attValue.clone(), status);
    524         pos = mAttrValue.end(2, status);
    525     }
    526     fPos = mEl.end(0, status);
    527     return el;
    528 }
    529 
    530 //
    531 //  parseMisc
    532 //     Consume XML "Misc" [production #27]
    533 //        which is any combination of space, PI and comments
    534 //      Need to watch end-of-input because xml MISC stuff is allowed after
    535 //        the document element, so we WILL scan off the end in this function
    536 //
    537 void
    538 UXMLParser::parseMisc(UErrorCode &status)  {
    539     for (;;) {
    540         if (fPos >= mXMLPI.input().length()) {
    541             break;
    542         }
    543         if (mXMLPI.lookingAt(fPos, status)) {
    544             fPos = mXMLPI.end(status);
    545             continue;
    546         }
    547         if (mXMLSP.lookingAt(fPos, status)) {
    548             fPos = mXMLSP.end(status);
    549             continue;
    550         }
    551         if (mXMLComment.lookingAt(fPos, status)) {
    552             fPos = mXMLComment.end(status);
    553             continue;
    554         }
    555         break;
    556     }
    557 }
    558 
    559 //
    560 //  Scan for document content.
    561 //
    562 UnicodeString
    563 UXMLParser::scanContent(UErrorCode &status) {
    564     UnicodeString  result;
    565     if (mXMLCharData.lookingAt(fPos, status)) {
    566         result = mXMLCharData.group(0, status);
    567         // Normalize the new-lines.  (Before char ref substitution)
    568         mNewLineNormalizer.reset(result);
    569         result = mNewLineNormalizer.replaceAll(fOneLF, status);
    570 
    571         // TODO:  handle CDATA
    572         fPos = mXMLCharData.end(0, status);
    573     }
    574 
    575     return result;
    576 }
    577 
    578 //
    579 //   replaceCharRefs
    580 //
    581 //      replace the char entities &lt;  &amp; &#123; &#x12ab; etc. in a string
    582 //       with the corresponding actual character.
    583 //
    584 void
    585 UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
    586     UnicodeString result;
    587     UnicodeString replacement;
    588     int     i;
    589 
    590     mAmps.reset(s);
    591     // See the initialization for the regex matcher mAmps.
    592     //    Which entity we've matched is determined by which capture group has content,
    593     //      which is flaged by start() of that group not being -1.
    594     while (mAmps.find()) {
    595         if (mAmps.start(1, status) != -1) {
    596             replacement.setTo((UChar)x_AMP);
    597         } else if (mAmps.start(2, status) != -1) {
    598             replacement.setTo((UChar)x_LT);
    599         } else if (mAmps.start(3, status) != -1) {
    600             replacement.setTo((UChar)x_GT);
    601         } else if (mAmps.start(4, status) != -1) {
    602             replacement.setTo((UChar)x_APOS);
    603         } else if (mAmps.start(5, status) != -1) {
    604             replacement.setTo((UChar)x_QUOT);
    605         } else if (mAmps.start(6, status) != -1) {
    606             UnicodeString hexString = mAmps.group(6, status);
    607             UChar32 val = 0;
    608             for (i=0; i<hexString.length(); i++) {
    609                 val = (val << 4) + u_digit(hexString.charAt(i), 16);
    610             }
    611             // TODO:  some verification that the character is valid
    612             replacement.setTo(val);
    613         } else if (mAmps.start(7, status) != -1) {
    614             UnicodeString decimalString = mAmps.group(7, status);
    615             UChar32 val = 0;
    616             for (i=0; i<decimalString.length(); i++) {
    617                 val = val*10 + u_digit(decimalString.charAt(i), 10);
    618             }
    619             // TODO:  some verification that the character is valid
    620             replacement.setTo(val);
    621         } else {
    622             // An unrecognized &entity;  Leave it alone.
    623             //  TODO:  check that it really looks like an entity, and is not some
    624             //         random & in the text.
    625             replacement = mAmps.group(0, status);
    626         }
    627         mAmps.appendReplacement(result, replacement, status);
    628     }
    629     mAmps.appendTail(result);
    630     s = result;
    631 }
    632 
    633 void
    634 UXMLParser::error(const char *message, UErrorCode &status) {
    635     // TODO:  something better here...
    636     const UnicodeString &src=mXMLDecl.input();
    637     int  line = 0;
    638     int  ci = 0;
    639     while (ci < fPos && ci>=0) {
    640         ci = src.indexOf((UChar)0x0a, ci+1);
    641         line++;
    642     }
    643     fprintf(stderr, "Error: %s at line %d\n", message, line);
    644     if (U_SUCCESS(status)) {
    645         status = U_PARSE_ERROR;
    646     }
    647 }
    648 
    649 // intern strings like in Java
    650 
    651 const UnicodeString *
    652 UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
    653     const UHashElement *he=fNames.find(s);
    654     if(he!=NULL) {
    655         // already a known name, return its hashed key pointer
    656         return (const UnicodeString *)he->key.pointer;
    657     } else {
    658         // add this new name and return its hashed key pointer
    659         fNames.puti(s, 0, errorCode);
    660         he=fNames.find(s);
    661         return (const UnicodeString *)he->key.pointer;
    662     }
    663 }
    664 
    665 const UnicodeString *
    666 UXMLParser::findName(const UnicodeString &s) const {
    667     const UHashElement *he=fNames.find(s);
    668     if(he!=NULL) {
    669         // a known name, return its hashed key pointer
    670         return (const UnicodeString *)he->key.pointer;
    671     } else {
    672         // unknown name
    673         return NULL;
    674     }
    675 }
    676 
    677 // UXMLElement ------------------------------------------------------------- ***
    678 
    679 UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
    680    fParser(parser),
    681    fName(name),
    682    fAttNames(errorCode),
    683    fAttValues(errorCode),
    684    fChildren(errorCode),
    685    fParent(NULL)
    686 {
    687 }
    688 
    689 UXMLElement::~UXMLElement() {
    690     int   i;
    691     // attribute names are owned by the UXMLParser, don't delete them here
    692     for (i=fAttValues.size()-1; i>=0; i--) {
    693         delete (UObject *)fAttValues.elementAt(i);
    694     }
    695     for (i=fChildren.size()-1; i>=0; i--) {
    696         delete (UObject *)fChildren.elementAt(i);
    697     }
    698 }
    699 
    700 const UnicodeString &
    701 UXMLElement::getTagName() const {
    702     return *fName;
    703 }
    704 
    705 UnicodeString
    706 UXMLElement::getText(UBool recurse) const {
    707     UnicodeString text;
    708     appendText(text, recurse);
    709     return text;
    710 }
    711 
    712 void
    713 UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
    714     const UObject *node;
    715     int32_t i, count=fChildren.size();
    716     for(i=0; i<count; ++i) {
    717         node=(const UObject *)fChildren.elementAt(i);
    718         if(node->getDynamicClassID()==UnicodeString::getStaticClassID()) {
    719             text.append(*(const UnicodeString *)node);
    720         } else if(recurse) /* must be a UXMLElement */ {
    721             ((const UXMLElement *)node)->appendText(text, recurse);
    722         }
    723     }
    724 }
    725 
    726 int32_t
    727 UXMLElement::countAttributes() const {
    728     return fAttNames.size();
    729 }
    730 
    731 const UnicodeString *
    732 UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
    733     if(0<=i && i<fAttNames.size()) {
    734         name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));
    735         value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));
    736         return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
    737     } else {
    738         return NULL;
    739     }
    740 }
    741 
    742 const UnicodeString *
    743 UXMLElement::getAttribute(const UnicodeString &name) const {
    744     // search for the attribute name by comparing the interned pointer,
    745     // not the string contents
    746     const UnicodeString *p=fParser->findName(name);
    747     if(p==NULL) {
    748         return NULL; // no such attribute seen by the parser at all
    749     }
    750 
    751     int32_t i, count=fAttNames.size();
    752     for(i=0; i<count; ++i) {
    753         if(p==(const UnicodeString *)fAttNames.elementAt(i)) {
    754             return (const UnicodeString *)fAttValues.elementAt(i);
    755         }
    756     }
    757     return NULL;
    758 }
    759 
    760 int32_t
    761 UXMLElement::countChildren() const {
    762     return fChildren.size();
    763 }
    764 
    765 const UObject *
    766 UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
    767     if(0<=i && i<fChildren.size()) {
    768         const UObject *node=(const UObject *)fChildren.elementAt(i);
    769         if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
    770             type=UXML_NODE_TYPE_ELEMENT;
    771         } else {
    772             type=UXML_NODE_TYPE_STRING;
    773         }
    774         return node;
    775     } else {
    776         return NULL;
    777     }
    778 }
    779 
    780 const UXMLElement *
    781 UXMLElement::nextChildElement(int32_t &i) const {
    782     if(i<0) {
    783         return NULL;
    784     }
    785 
    786     const UObject *node;
    787     int32_t count=fChildren.size();
    788     while(i<count) {
    789         node=(const UObject *)fChildren.elementAt(i++);
    790         // TODO: see if ICU can use C++ instanceof instead of its own poor man's RTTI
    791         // if(node instanceof UXMLElement) {
    792         if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
    793             return (const UXMLElement *)node;
    794         }
    795     }
    796     return NULL;
    797 }
    798 
    799 const UXMLElement *
    800 UXMLElement::getChildElement(const UnicodeString &name) const {
    801     // search for the element name by comparing the interned pointer,
    802     // not the string contents
    803     const UnicodeString *p=fParser->findName(name);
    804     if(p==NULL) {
    805         return NULL; // no such element seen by the parser at all
    806     }
    807 
    808     const UObject *node;
    809     int32_t i, count=fChildren.size();
    810     for(i=0; i<count; ++i) {
    811         node=(const UObject *)fChildren.elementAt(i);
    812         if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
    813             const UXMLElement *elem=(const UXMLElement *)node;
    814             if(p==elem->fName) {
    815                 return elem;
    816             }
    817         }
    818     }
    819     return NULL;
    820 }
    821 
    822 U_NAMESPACE_END
    823 
    824 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
    825 
    826