Home | History | Annotate | Download | only in toolutil
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2004-2010, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  xmlparser.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2004jul21
     16 *   created by: Andy Heninger
     17 */
     18 
     19 #include <stdio.h>
     20 #include "unicode/uchar.h"
     21 #include "unicode/ucnv.h"
     22 #include "unicode/regex.h"
     23 #include "filestrm.h"
     24 #include "xmlparser.h"
     25 
     26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
     27 
     28 // character constants
     29 enum {
     30     x_QUOT=0x22,
     31     x_AMP=0x26,
     32     x_APOS=0x27,
     33     x_LT=0x3c,
     34     x_GT=0x3e,
     35     x_l=0x6c
     36 };
     37 
     38 #define  XML_SPACES "[ \\u0009\\u000d\\u000a]"
     39 
     40 // XML #4
     41 #define  XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
     42                     "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
     43                     "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
     44                     "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"
     45 
     46 //  XML #5
     47 #define  XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"
     48 
     49 //  XML #6
     50 #define  XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
     51 
     52 U_NAMESPACE_BEGIN
     53 
     54 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
     55 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
     56 
     57 //
     58 //   UXMLParser constructor.   Mostly just initializes the ICU regexes that are
     59 //                             used for parsing.
     60 //
     61 UXMLParser::UXMLParser(UErrorCode &status) :
     62       //  XML Declaration.  XML Production #23.
     63       //      example:  "<?xml version=1.0 encoding="utf-16" ?>
     64       //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
     65       //            allow for a possible leading BOM.
     66       mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
     67 
     68       //  XML Comment   production #15
     69       //     example:  "<!-- whatever -->
     70       //       note, does not detect an illegal "--" within comments
     71       mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),
     72 
     73       //  XML Spaces
     74       //      production [3]
     75       mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
     76 
     77       //  XML Doctype decl  production #28
     78       //     example   "<!DOCTYPE foo SYSTEM "somewhere" >
     79       //       or      "<!DOCTYPE foo [internal dtd]>
     80       //    TODO:  we don't actually parse the DOCTYPE or internal subsets.
     81       //           Some internal dtd subsets could confuse this simple-minded
     82       //           attempt at skipping over them, specifically, occcurences
     83       //           of closeing square brackets.  These could appear in comments,
     84       //           or in parameter entity declarations, for example.
     85       mXMLDoctype(UnicodeString(
     86            "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
     87            ), 0, status),
     88 
     89       //  XML PI     production #16
     90       //     example   "<?target stuff?>
     91       mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
     92 
     93       //  XML Element Start   Productions #40, #41
     94       //          example   <foo att1='abc'  att2="d e f" >
     95       //      capture #1:  the tag name
     96       //
     97       mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
     98           "(?:"
     99                 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
    100                 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
    101           ")*"                                                             //   * for zero or more attributes.
    102           XML_SPACES "*?>", -1, US_INV), 0, status),                               // match " >"
    103 
    104       //  XML Element End     production #42
    105       //     example   </foo>
    106       mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
    107 
    108       // XML Element Empty    production #44
    109       //     example   <foo att1="abc"   att2="d e f" />
    110       mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
    111           "(?:"
    112                 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
    113                 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
    114           ")*"                                                             //   * for zero or more attributes.
    115           XML_SPACES "*?/>", -1, US_INV), 0, status),                              // match " />"
    116 
    117 
    118       // XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
    119       mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),
    120 
    121       // Attribute name = "value".  XML Productions 10, 40/41
    122       //  Capture group 1 is name,
    123       //                2 is the attribute value, including the quotes.
    124       //
    125       //   Note that attributes are scanned twice.  The first time is with
    126       //        the regex for an entire element start.  There, the attributes
    127       //        are checked syntactically, but not separted out one by one.
    128       //        Here, we match a single attribute, and make its name and
    129       //        attribute value available to the parser code.
    130       mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
    131          "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),
    132 
    133 
    134       mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),
    135 
    136       // Match any of the new-line sequences in content.
    137       //   All are changed to \u000a.
    138       mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),
    139 
    140       // & char references
    141       //   We will figure out what we've got based on which capture group has content.
    142       //   The last one is a catchall for unrecognized entity references..
    143       //             1     2     3      4      5           6                    7          8
    144       mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
    145                 0, status),
    146 
    147       fNames(status),
    148       fElementStack(status),
    149       fOneLF((UChar)0x0a)        // Plain new-line string, used in new line normalization.
    150       {
    151       }
    152 
    153 UXMLParser *
    154 UXMLParser::createParser(UErrorCode &errorCode) {
    155     if (U_FAILURE(errorCode)) {
    156         return NULL;
    157     } else {
    158         return new UXMLParser(errorCode);
    159     }
    160 }
    161 
    162 UXMLParser::~UXMLParser() {}
    163 
    164 UXMLElement *
    165 UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
    166     char bytes[4096], charsetBuffer[100];
    167     FileStream *f;
    168     const char *charset, *pb;
    169     UnicodeString src;
    170     UConverter *cnv;
    171     UChar *buffer, *pu;
    172     int32_t fileLength, bytesLength, length, capacity;
    173     UBool flush;
    174 
    175     if(U_FAILURE(errorCode)) {
    176         return NULL;
    177     }
    178 
    179     f=T_FileStream_open(filename, "rb");
    180     if(f==NULL) {
    181         errorCode=U_FILE_ACCESS_ERROR;
    182         return NULL;
    183     }
    184 
    185     bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
    186     if(bytesLength<(int32_t)sizeof(bytes)) {
    187         // we have already read the entire file
    188         fileLength=bytesLength;
    189     } else {
    190         // get the file length
    191         fileLength=T_FileStream_size(f);
    192     }
    193 
    194     /*
    195      * get the charset:
    196      * 1. Unicode signature
    197      * 2. treat as ISO-8859-1 and read XML encoding="charser"
    198      * 3. default to UTF-8
    199      */
    200     charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
    201     if(U_SUCCESS(errorCode) && charset!=NULL) {
    202         // open converter according to Unicode signature
    203         cnv=ucnv_open(charset, &errorCode);
    204     } else {
    205         // read as Latin-1 and parse the XML declaration and encoding
    206         cnv=ucnv_open("ISO-8859-1", &errorCode);
    207         if(U_FAILURE(errorCode)) {
    208             // unexpected error opening Latin-1 converter
    209             goto exit;
    210         }
    211 
    212         buffer=toUCharPtr(src.getBuffer(bytesLength));
    213         if(buffer==NULL) {
    214             // unexpected failure to reserve some string capacity
    215             errorCode=U_MEMORY_ALLOCATION_ERROR;
    216             goto exit;
    217         }
    218         pb=bytes;
    219         pu=buffer;
    220         ucnv_toUnicode(
    221             cnv,
    222             &pu, buffer+src.getCapacity(),
    223             &pb, bytes+bytesLength,
    224             NULL, TRUE, &errorCode);
    225         src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
    226         ucnv_close(cnv);
    227         cnv=NULL;
    228         if(U_FAILURE(errorCode)) {
    229             // unexpected error in conversion from Latin-1
    230             src.remove();
    231             goto exit;
    232         }
    233 
    234         // parse XML declaration
    235         if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
    236             int32_t declEnd=mXMLDecl.end(errorCode);
    237             // go beyond <?xml
    238             int32_t pos=src.indexOf((UChar)x_l)+1;
    239 
    240             mAttrValue.reset(src);
    241             while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.
    242                 UnicodeString attName  = mAttrValue.group(1, errorCode);
    243                 UnicodeString attValue = mAttrValue.group(2, errorCode);
    244 
    245                 // Trim the quotes from the att value.  These are left over from the original regex
    246                 //   that parsed the attribue, which couldn't conveniently strip them.
    247                 attValue.remove(0,1);                    // one char from the beginning
    248                 attValue.truncate(attValue.length()-1);  // and one from the end.
    249 
    250                 if(attName==UNICODE_STRING("encoding", 8)) {
    251                     length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
    252                     charset=charsetBuffer;
    253                     break;
    254                 }
    255                 pos = mAttrValue.end(2, errorCode);
    256             }
    257 
    258             if(charset==NULL) {
    259                 // default to UTF-8
    260                 charset="UTF-8";
    261             }
    262             cnv=ucnv_open(charset, &errorCode);
    263         }
    264     }
    265 
    266     if(U_FAILURE(errorCode)) {
    267         // unable to open the converter
    268         goto exit;
    269     }
    270 
    271     // convert the file contents
    272     capacity=fileLength;        // estimated capacity
    273     src.getBuffer(capacity);
    274     src.releaseBuffer(0);       // zero length
    275     flush=FALSE;
    276     for(;;) {
    277         // convert contents of bytes[bytesLength]
    278         pb=bytes;
    279         for(;;) {
    280             length=src.length();
    281             buffer=toUCharPtr(src.getBuffer(capacity));
    282             if(buffer==NULL) {
    283                 // unexpected failure to reserve some string capacity
    284                 errorCode=U_MEMORY_ALLOCATION_ERROR;
    285                 goto exit;
    286             }
    287 
    288             pu=buffer+length;
    289             ucnv_toUnicode(
    290                 cnv, &pu, buffer+src.getCapacity(),
    291                 &pb, bytes+bytesLength,
    292                 NULL, FALSE, &errorCode);
    293             src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
    294             if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
    295                 errorCode=U_ZERO_ERROR;
    296                 capacity=(3*src.getCapacity())/2; // increase capacity by 50%
    297             } else {
    298                 break;
    299             }
    300         }
    301 
    302         if(U_FAILURE(errorCode)) {
    303             break; // conversion error
    304         }
    305 
    306         if(flush) {
    307             break; // completely converted the file
    308         }
    309 
    310         // read next block
    311         bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
    312         if(bytesLength==0) {
    313             // reached end of file, convert once more to flush the converter
    314             flush=TRUE;
    315         }
    316     };
    317 
    318 exit:
    319     ucnv_close(cnv);
    320     T_FileStream_close(f);
    321 
    322     if(U_SUCCESS(errorCode)) {
    323         return parse(src, errorCode);
    324     } else {
    325         return NULL;
    326     }
    327 }
    328 
    329 UXMLElement *
    330 UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
    331     if(U_FAILURE(status)) {
    332         return NULL;
    333     }
    334 
    335     UXMLElement   *root = NULL;
    336     fPos = 0; // TODO use just a local pos variable and pass it into functions
    337               // where necessary?
    338 
    339     // set all matchers to work on the input string
    340     mXMLDecl.reset(src);
    341     mXMLComment.reset(src);
    342     mXMLSP.reset(src);
    343     mXMLDoctype.reset(src);
    344     mXMLPI.reset(src);
    345     mXMLElemStart.reset(src);
    346     mXMLElemEnd.reset(src);
    347     mXMLElemEmpty.reset(src);
    348     mXMLCharData.reset(src);
    349     mAttrValue.reset(src);
    350     mAttrNormalizer.reset(src);
    351     mNewLineNormalizer.reset(src);
    352     mAmps.reset(src);
    353 
    354     // Consume the XML Declaration, if present.
    355     if (mXMLDecl.lookingAt(fPos, status)) {
    356         fPos = mXMLDecl.end(status);
    357     }
    358 
    359     // Consume "misc" [XML production 27] appearing before DocType
    360     parseMisc(status);
    361 
    362     // Consume a DocType declaration, if present.
    363     if (mXMLDoctype.lookingAt(fPos, status)) {
    364         fPos = mXMLDoctype.end(status);
    365     }
    366 
    367     // Consume additional "misc" [XML production 27] appearing after the DocType
    368     parseMisc(status);
    369 
    370     // Get the root element
    371     if (mXMLElemEmpty.lookingAt(fPos, status)) {
    372         // Root is an empty element (no nested elements or content)
    373         root = createElement(mXMLElemEmpty, status);
    374         fPos = mXMLElemEmpty.end(status);
    375     } else {
    376         if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
    377             error("Root Element expected", status);
    378             goto errorExit;
    379         }
    380         root = createElement(mXMLElemStart, status);
    381         UXMLElement  *el = root;
    382 
    383         //
    384         // This is the loop that consumes the root element of the document,
    385         //      including all nested content.   Nested elements are handled by
    386         //      explicit pushes/pops of the element stack; there is no recursion
    387         //      in the control flow of this code.
    388         //      "el" always refers to the current element, the one to which content
    389         //      is being added.  It is above the top of the element stack.
    390         for (;;) {
    391             // Nested Element Start
    392             if (mXMLElemStart.lookingAt(fPos, status)) {
    393                 UXMLElement *t = createElement(mXMLElemStart, status);
    394                 el->fChildren.addElement(t, status);
    395                 t->fParent = el;
    396                 fElementStack.push(el, status);
    397                 el = t;
    398                 continue;
    399             }
    400 
    401             // Text Content.  String is concatenated onto the current node's content,
    402             //                but only if it contains something other than spaces.
    403             UnicodeString s = scanContent(status);
    404             if (s.length() > 0) {
    405                 mXMLSP.reset(s);
    406                 if (mXMLSP.matches(status) == FALSE) {
    407                     // This chunk of text contains something other than just
    408                     //  white space. Make a child node for it.
    409                     replaceCharRefs(s, status);
    410                     el->fChildren.addElement(s.clone(), status);
    411                 }
    412                 mXMLSP.reset(src);    // The matchers need to stay set to the main input string.
    413                 continue;
    414             }
    415 
    416             // Comments.  Discard.
    417             if (mXMLComment.lookingAt(fPos, status)) {
    418                 fPos = mXMLComment.end(status);
    419                 continue;
    420             }
    421 
    422             // PIs.  Discard.
    423             if (mXMLPI.lookingAt(fPos, status)) {
    424                 fPos = mXMLPI.end(status);
    425                 continue;
    426             }
    427 
    428             // Element End
    429             if (mXMLElemEnd.lookingAt(fPos, status)) {
    430                 fPos = mXMLElemEnd.end(0, status);
    431                 const UnicodeString name = mXMLElemEnd.group(1, status);
    432                 if (name != *el->fName) {
    433                     error("Element start / end tag mismatch", status);
    434                     goto errorExit;
    435                 }
    436                 if (fElementStack.empty()) {
    437                     // Close of the root element.  We're done with the doc.
    438                     el = NULL;
    439                     break;
    440                 }
    441                 el = (UXMLElement *)fElementStack.pop();
    442                 continue;
    443             }
    444 
    445             // Empty Element.  Stored as a child of the current element, but not stacked.
    446             if (mXMLElemEmpty.lookingAt(fPos, status)) {
    447                 UXMLElement *t = createElement(mXMLElemEmpty, status);
    448                 el->fChildren.addElement(t, status);
    449                 continue;
    450             }
    451 
    452             // Hit something within the document that doesn't match anything.
    453             //   It's an error.
    454             error("Unrecognized markup", status);
    455             break;
    456         }
    457 
    458         if (el != NULL || !fElementStack.empty()) {
    459             // We bailed out early, for some reason.
    460             error("Root element not closed.", status);
    461             goto errorExit;
    462         }
    463     }
    464 
    465     // Root Element parse is complete.
    466     // Consume the annoying xml "Misc" that can appear at the end of the doc.
    467     parseMisc(status);
    468 
    469     // We should have reached the end of the input
    470     if (fPos != src.length()) {
    471         error("Extra content at the end of the document", status);
    472         goto errorExit;
    473     }
    474 
    475     // Success!
    476     return root;
    477 
    478 errorExit:
    479     delete root;
    480     return NULL;
    481 }
    482 
    483 //
    484 //  createElement
    485 //      We've just matched an element start tag.  Create and fill in a UXMLElement object
    486 //      for it.
    487 //
    488 UXMLElement *
    489 UXMLParser::createElement(RegexMatcher  &mEl, UErrorCode &status) {
    490     // First capture group is the element's name.
    491     UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);
    492 
    493     // Scan for attributes.
    494     int32_t   pos = mEl.end(1, status);  // The position after the end of the tag name
    495 
    496     while (mAttrValue.lookingAt(pos, status)) {  // loop runs once per attribute on this element.
    497         UnicodeString attName  = mAttrValue.group(1, status);
    498         UnicodeString attValue = mAttrValue.group(2, status);
    499 
    500         // Trim the quotes from the att value.  These are left over from the original regex
    501         //   that parsed the attribue, which couldn't conveniently strip them.
    502         attValue.remove(0,1);                    // one char from the beginning
    503         attValue.truncate(attValue.length()-1);  // and one from the end.
    504 
    505         // XML Attribue value normalization.
    506         // This is one of the really screwy parts of the XML spec.
    507         // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
    508         // Note that non-validating parsers must treat all entities as type CDATA
    509         //   which simplifies things some.
    510 
    511         // Att normalization step 1:  normalize any newlines in the attribute value
    512         mNewLineNormalizer.reset(attValue);
    513         attValue = mNewLineNormalizer.replaceAll(fOneLF, status);
    514 
    515         // Next change all xml white space chars to plain \u0020 spaces.
    516         mAttrNormalizer.reset(attValue);
    517         UnicodeString oneSpace((UChar)0x0020);
    518         attValue = mAttrNormalizer.replaceAll(oneSpace, status);
    519 
    520         // Replace character entities.
    521         replaceCharRefs(attValue, status);
    522 
    523         // Save the attribute name and value in our document structure.
    524         el->fAttNames.addElement((void *)intern(attName, status), status);
    525         el->fAttValues.addElement(attValue.clone(), status);
    526         pos = mAttrValue.end(2, status);
    527     }
    528     fPos = mEl.end(0, status);
    529     return el;
    530 }
    531 
    532 //
    533 //  parseMisc
    534 //     Consume XML "Misc" [production #27]
    535 //        which is any combination of space, PI and comments
    536 //      Need to watch end-of-input because xml MISC stuff is allowed after
    537 //        the document element, so we WILL scan off the end in this function
    538 //
    539 void
    540 UXMLParser::parseMisc(UErrorCode &status)  {
    541     for (;;) {
    542         if (fPos >= mXMLPI.input().length()) {
    543             break;
    544         }
    545         if (mXMLPI.lookingAt(fPos, status)) {
    546             fPos = mXMLPI.end(status);
    547             continue;
    548         }
    549         if (mXMLSP.lookingAt(fPos, status)) {
    550             fPos = mXMLSP.end(status);
    551             continue;
    552         }
    553         if (mXMLComment.lookingAt(fPos, status)) {
    554             fPos = mXMLComment.end(status);
    555             continue;
    556         }
    557         break;
    558     }
    559 }
    560 
    561 //
    562 //  Scan for document content.
    563 //
    564 UnicodeString
    565 UXMLParser::scanContent(UErrorCode &status) {
    566     UnicodeString  result;
    567     if (mXMLCharData.lookingAt(fPos, status)) {
    568         result = mXMLCharData.group((int32_t)0, status);
    569         // Normalize the new-lines.  (Before char ref substitution)
    570         mNewLineNormalizer.reset(result);
    571         result = mNewLineNormalizer.replaceAll(fOneLF, status);
    572 
    573         // TODO:  handle CDATA
    574         fPos = mXMLCharData.end(0, status);
    575     }
    576 
    577     return result;
    578 }
    579 
    580 //
    581 //   replaceCharRefs
    582 //
    583 //      replace the char entities &lt;  &amp; &#123; &#x12ab; etc. in a string
    584 //       with the corresponding actual character.
    585 //
    586 void
    587 UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
    588     UnicodeString result;
    589     UnicodeString replacement;
    590     int     i;
    591 
    592     mAmps.reset(s);
    593     // See the initialization for the regex matcher mAmps.
    594     //    Which entity we've matched is determined by which capture group has content,
    595     //      which is flaged by start() of that group not being -1.
    596     while (mAmps.find()) {
    597         if (mAmps.start(1, status) != -1) {
    598             replacement.setTo((UChar)x_AMP);
    599         } else if (mAmps.start(2, status) != -1) {
    600             replacement.setTo((UChar)x_LT);
    601         } else if (mAmps.start(3, status) != -1) {
    602             replacement.setTo((UChar)x_GT);
    603         } else if (mAmps.start(4, status) != -1) {
    604             replacement.setTo((UChar)x_APOS);
    605         } else if (mAmps.start(5, status) != -1) {
    606             replacement.setTo((UChar)x_QUOT);
    607         } else if (mAmps.start(6, status) != -1) {
    608             UnicodeString hexString = mAmps.group(6, status);
    609             UChar32 val = 0;
    610             for (i=0; i<hexString.length(); i++) {
    611                 val = (val << 4) + u_digit(hexString.charAt(i), 16);
    612             }
    613             // TODO:  some verification that the character is valid
    614             replacement.setTo(val);
    615         } else if (mAmps.start(7, status) != -1) {
    616             UnicodeString decimalString = mAmps.group(7, status);
    617             UChar32 val = 0;
    618             for (i=0; i<decimalString.length(); i++) {
    619                 val = val*10 + u_digit(decimalString.charAt(i), 10);
    620             }
    621             // TODO:  some verification that the character is valid
    622             replacement.setTo(val);
    623         } else {
    624             // An unrecognized &entity;  Leave it alone.
    625             //  TODO:  check that it really looks like an entity, and is not some
    626             //         random & in the text.
    627             replacement = mAmps.group((int32_t)0, status);
    628         }
    629         mAmps.appendReplacement(result, replacement, status);
    630     }
    631     mAmps.appendTail(result);
    632     s = result;
    633 }
    634 
    635 void
    636 UXMLParser::error(const char *message, UErrorCode &status) {
    637     // TODO:  something better here...
    638     const UnicodeString &src=mXMLDecl.input();
    639     int  line = 0;
    640     int  ci = 0;
    641     while (ci < fPos && ci>=0) {
    642         ci = src.indexOf((UChar)0x0a, ci+1);
    643         line++;
    644     }
    645     fprintf(stderr, "Error: %s at line %d\n", message, line);
    646     if (U_SUCCESS(status)) {
    647         status = U_PARSE_ERROR;
    648     }
    649 }
    650 
    651 // intern strings like in Java
    652 
    653 const UnicodeString *
    654 UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
    655     const UHashElement *he=fNames.find(s);
    656     if(he!=NULL) {
    657         // already a known name, return its hashed key pointer
    658         return (const UnicodeString *)he->key.pointer;
    659     } else {
    660         // add this new name and return its hashed key pointer
    661         fNames.puti(s, 0, errorCode);
    662         he=fNames.find(s);
    663         return (const UnicodeString *)he->key.pointer;
    664     }
    665 }
    666 
    667 const UnicodeString *
    668 UXMLParser::findName(const UnicodeString &s) const {
    669     const UHashElement *he=fNames.find(s);
    670     if(he!=NULL) {
    671         // a known name, return its hashed key pointer
    672         return (const UnicodeString *)he->key.pointer;
    673     } else {
    674         // unknown name
    675         return NULL;
    676     }
    677 }
    678 
    679 // UXMLElement ------------------------------------------------------------- ***
    680 
    681 UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
    682    fParser(parser),
    683    fName(name),
    684    fAttNames(errorCode),
    685    fAttValues(errorCode),
    686    fChildren(errorCode),
    687    fParent(NULL)
    688 {
    689 }
    690 
    691 UXMLElement::~UXMLElement() {
    692     int   i;
    693     // attribute names are owned by the UXMLParser, don't delete them here
    694     for (i=fAttValues.size()-1; i>=0; i--) {
    695         delete (UObject *)fAttValues.elementAt(i);
    696     }
    697     for (i=fChildren.size()-1; i>=0; i--) {
    698         delete (UObject *)fChildren.elementAt(i);
    699     }
    700 }
    701 
    702 const UnicodeString &
    703 UXMLElement::getTagName() const {
    704     return *fName;
    705 }
    706 
    707 UnicodeString
    708 UXMLElement::getText(UBool recurse) const {
    709     UnicodeString text;
    710     appendText(text, recurse);
    711     return text;
    712 }
    713 
    714 void
    715 UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
    716     const UObject *node;
    717     int32_t i, count=fChildren.size();
    718     for(i=0; i<count; ++i) {
    719         node=(const UObject *)fChildren.elementAt(i);
    720         const UnicodeString *s=dynamic_cast<const UnicodeString *>(node);
    721         if(s!=NULL) {
    722             text.append(*s);
    723         } else if(recurse) /* must be a UXMLElement */ {
    724             ((const UXMLElement *)node)->appendText(text, recurse);
    725         }
    726     }
    727 }
    728 
    729 int32_t
    730 UXMLElement::countAttributes() const {
    731     return fAttNames.size();
    732 }
    733 
    734 const UnicodeString *
    735 UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
    736     if(0<=i && i<fAttNames.size()) {
    737         name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));
    738         value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));
    739         return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
    740     } else {
    741         return NULL;
    742     }
    743 }
    744 
    745 const UnicodeString *
    746 UXMLElement::getAttribute(const UnicodeString &name) const {
    747     // search for the attribute name by comparing the interned pointer,
    748     // not the string contents
    749     const UnicodeString *p=fParser->findName(name);
    750     if(p==NULL) {
    751         return NULL; // no such attribute seen by the parser at all
    752     }
    753 
    754     int32_t i, count=fAttNames.size();
    755     for(i=0; i<count; ++i) {
    756         if(p==(const UnicodeString *)fAttNames.elementAt(i)) {
    757             return (const UnicodeString *)fAttValues.elementAt(i);
    758         }
    759     }
    760     return NULL;
    761 }
    762 
    763 int32_t
    764 UXMLElement::countChildren() const {
    765     return fChildren.size();
    766 }
    767 
    768 const UObject *
    769 UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
    770     if(0<=i && i<fChildren.size()) {
    771         const UObject *node=(const UObject *)fChildren.elementAt(i);
    772         if(dynamic_cast<const UXMLElement *>(node)!=NULL) {
    773             type=UXML_NODE_TYPE_ELEMENT;
    774         } else {
    775             type=UXML_NODE_TYPE_STRING;
    776         }
    777         return node;
    778     } else {
    779         return NULL;
    780     }
    781 }
    782 
    783 const UXMLElement *
    784 UXMLElement::nextChildElement(int32_t &i) const {
    785     if(i<0) {
    786         return NULL;
    787     }
    788 
    789     const UObject *node;
    790     int32_t count=fChildren.size();
    791     while(i<count) {
    792         node=(const UObject *)fChildren.elementAt(i++);
    793         const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
    794         if(elem!=NULL) {
    795             return elem;
    796         }
    797     }
    798     return NULL;
    799 }
    800 
    801 const UXMLElement *
    802 UXMLElement::getChildElement(const UnicodeString &name) const {
    803     // search for the element name by comparing the interned pointer,
    804     // not the string contents
    805     const UnicodeString *p=fParser->findName(name);
    806     if(p==NULL) {
    807         return NULL; // no such element seen by the parser at all
    808     }
    809 
    810     const UObject *node;
    811     int32_t i, count=fChildren.size();
    812     for(i=0; i<count; ++i) {
    813         node=(const UObject *)fChildren.elementAt(i);
    814         const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
    815         if(elem!=NULL) {
    816             if(p==elem->fName) {
    817                 return elem;
    818             }
    819         }
    820     }
    821     return NULL;
    822 }
    823 
    824 U_NAMESPACE_END
    825 
    826 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
    827 
    828