1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2004-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: xmlparser.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2004jul21 14 * created by: Andy Heninger 15 */ 16 17 #include <stdio.h> 18 #include "unicode/uchar.h" 19 #include "unicode/ucnv.h" 20 #include "unicode/regex.h" 21 #include "filestrm.h" 22 #include "xmlparser.h" 23 24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION 25 26 // character constants 27 enum { 28 x_QUOT=0x22, 29 x_AMP=0x26, 30 x_APOS=0x27, 31 x_LT=0x3c, 32 x_GT=0x3e, 33 x_l=0x6c 34 }; 35 36 #define XML_SPACES "[ \\u0009\\u000d\\u000a]" 37 38 // XML #4 39 #define XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \ 40 "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \ 41 "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \ 42 "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" 43 44 // XML #5 45 #define XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]" 46 47 // XML #6 48 #define XML_NAME XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*" 49 50 U_NAMESPACE_BEGIN 51 52 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser) 53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement) 54 55 // 56 // UXMLParser constructor. Mostly just initializes the ICU regexes that are 57 // used for parsing. 58 // 59 UXMLParser::UXMLParser(UErrorCode &status) : 60 // XML Declaration. XML Production #23. 61 // example: "<?xml version=1.0 encoding="utf-16" ?> 62 // This is a sloppy implementation - just look for the leading <?xml and the closing ?> 63 // allow for a possible leading BOM. 64 mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status), 65 66 // XML Comment production #15 67 // example: "<!-- whatever --> 68 // note, does not detect an illegal "--" within comments 69 mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status), 70 71 // XML Spaces 72 // production [3] 73 mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status), 74 75 // XML Doctype decl production #28 76 // example "<!DOCTYPE foo SYSTEM "somewhere" > 77 // or "<!DOCTYPE foo [internal dtd]> 78 // TODO: we don't actually parse the DOCTYPE or internal subsets. 79 // Some internal dtd subsets could confuse this simple-minded 80 // attempt at skipping over them, specifically, occcurences 81 // of closeing square brackets. These could appear in comments, 82 // or in parameter entity declarations, for example. 83 mXMLDoctype(UnicodeString( 84 "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV 85 ), 0, status), 86 87 // XML PI production #16 88 // example "<?target stuff?> 89 mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status), 90 91 // XML Element Start Productions #40, #41 92 // example <foo att1='abc' att2="d e f" > 93 // capture #1: the tag name 94 // 95 mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" 96 "(?:" 97 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " 98 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' 99 ")*" // * for zero or more attributes. 100 XML_SPACES "*?>", -1, US_INV), 0, status), // match " >" 101 102 // XML Element End production #42 103 // example </foo> 104 mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status), 105 106 // XML Element Empty production #44 107 // example <foo att1="abc" att2="d e f" /> 108 mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" 109 "(?:" 110 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " 111 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' 112 ")*" // * for zero or more attributes. 113 XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />" 114 115 116 // XMLCharData. Everything but '<'. Note that & will be dealt with later. 117 mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status), 118 119 // Attribute name = "value". XML Productions 10, 40/41 120 // Capture group 1 is name, 121 // 2 is the attribute value, including the quotes. 122 // 123 // Note that attributes are scanned twice. The first time is with 124 // the regex for an entire element start. There, the attributes 125 // are checked syntactically, but not separted out one by one. 126 // Here, we match a single attribute, and make its name and 127 // attribute value available to the parser code. 128 mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*" 129 "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status), 130 131 132 mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status), 133 134 // Match any of the new-line sequences in content. 135 // All are changed to \u000a. 136 mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status), 137 138 // & char references 139 // We will figure out what we've got based on which capture group has content. 140 // The last one is a catchall for unrecognized entity references.. 141 // 1 2 3 4 5 6 7 8 142 mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"), 143 0, status), 144 145 fNames(status), 146 fElementStack(status), 147 fOneLF((UChar)0x0a) // Plain new-line string, used in new line normalization. 148 { 149 } 150 151 UXMLParser * 152 UXMLParser::createParser(UErrorCode &errorCode) { 153 if (U_FAILURE(errorCode)) { 154 return NULL; 155 } else { 156 return new UXMLParser(errorCode); 157 } 158 } 159 160 UXMLParser::~UXMLParser() {} 161 162 UXMLElement * 163 UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) { 164 char bytes[4096], charsetBuffer[100]; 165 FileStream *f; 166 const char *charset, *pb; 167 UnicodeString src; 168 UConverter *cnv; 169 UChar *buffer, *pu; 170 int32_t fileLength, bytesLength, length, capacity; 171 UBool flush; 172 173 if(U_FAILURE(errorCode)) { 174 return NULL; 175 } 176 177 f=T_FileStream_open(filename, "rb"); 178 if(f==NULL) { 179 errorCode=U_FILE_ACCESS_ERROR; 180 return NULL; 181 } 182 183 bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); 184 if(bytesLength<(int32_t)sizeof(bytes)) { 185 // we have already read the entire file 186 fileLength=bytesLength; 187 } else { 188 // get the file length 189 fileLength=T_FileStream_size(f); 190 } 191 192 /* 193 * get the charset: 194 * 1. Unicode signature 195 * 2. treat as ISO-8859-1 and read XML encoding="charser" 196 * 3. default to UTF-8 197 */ 198 charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode); 199 if(U_SUCCESS(errorCode) && charset!=NULL) { 200 // open converter according to Unicode signature 201 cnv=ucnv_open(charset, &errorCode); 202 } else { 203 // read as Latin-1 and parse the XML declaration and encoding 204 cnv=ucnv_open("ISO-8859-1", &errorCode); 205 if(U_FAILURE(errorCode)) { 206 // unexpected error opening Latin-1 converter 207 goto exit; 208 } 209 210 buffer=src.getBuffer(bytesLength); 211 if(buffer==NULL) { 212 // unexpected failure to reserve some string capacity 213 errorCode=U_MEMORY_ALLOCATION_ERROR; 214 goto exit; 215 } 216 pb=bytes; 217 pu=buffer; 218 ucnv_toUnicode( 219 cnv, 220 &pu, buffer+src.getCapacity(), 221 &pb, bytes+bytesLength, 222 NULL, TRUE, &errorCode); 223 src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); 224 ucnv_close(cnv); 225 cnv=NULL; 226 if(U_FAILURE(errorCode)) { 227 // unexpected error in conversion from Latin-1 228 src.remove(); 229 goto exit; 230 } 231 232 // parse XML declaration 233 if(mXMLDecl.reset(src).lookingAt(0, errorCode)) { 234 int32_t declEnd=mXMLDecl.end(errorCode); 235 // go beyond <?xml 236 int32_t pos=src.indexOf((UChar)x_l)+1; 237 238 mAttrValue.reset(src); 239 while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) { // loop runs once per attribute on this element. 240 UnicodeString attName = mAttrValue.group(1, errorCode); 241 UnicodeString attValue = mAttrValue.group(2, errorCode); 242 243 // Trim the quotes from the att value. These are left over from the original regex 244 // that parsed the attribue, which couldn't conveniently strip them. 245 attValue.remove(0,1); // one char from the beginning 246 attValue.truncate(attValue.length()-1); // and one from the end. 247 248 if(attName==UNICODE_STRING("encoding", 8)) { 249 length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer)); 250 charset=charsetBuffer; 251 break; 252 } 253 pos = mAttrValue.end(2, errorCode); 254 } 255 256 if(charset==NULL) { 257 // default to UTF-8 258 charset="UTF-8"; 259 } 260 cnv=ucnv_open(charset, &errorCode); 261 } 262 } 263 264 if(U_FAILURE(errorCode)) { 265 // unable to open the converter 266 goto exit; 267 } 268 269 // convert the file contents 270 capacity=fileLength; // estimated capacity 271 src.getBuffer(capacity); 272 src.releaseBuffer(0); // zero length 273 flush=FALSE; 274 for(;;) { 275 // convert contents of bytes[bytesLength] 276 pb=bytes; 277 for(;;) { 278 length=src.length(); 279 buffer=src.getBuffer(capacity); 280 if(buffer==NULL) { 281 // unexpected failure to reserve some string capacity 282 errorCode=U_MEMORY_ALLOCATION_ERROR; 283 goto exit; 284 } 285 286 pu=buffer+length; 287 ucnv_toUnicode( 288 cnv, &pu, buffer+src.getCapacity(), 289 &pb, bytes+bytesLength, 290 NULL, FALSE, &errorCode); 291 src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); 292 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 293 errorCode=U_ZERO_ERROR; 294 capacity=(3*src.getCapacity())/2; // increase capacity by 50% 295 } else { 296 break; 297 } 298 } 299 300 if(U_FAILURE(errorCode)) { 301 break; // conversion error 302 } 303 304 if(flush) { 305 break; // completely converted the file 306 } 307 308 // read next block 309 bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); 310 if(bytesLength==0) { 311 // reached end of file, convert once more to flush the converter 312 flush=TRUE; 313 } 314 }; 315 316 exit: 317 ucnv_close(cnv); 318 T_FileStream_close(f); 319 320 if(U_SUCCESS(errorCode)) { 321 return parse(src, errorCode); 322 } else { 323 return NULL; 324 } 325 } 326 327 UXMLElement * 328 UXMLParser::parse(const UnicodeString &src, UErrorCode &status) { 329 if(U_FAILURE(status)) { 330 return NULL; 331 } 332 333 UXMLElement *root = NULL; 334 fPos = 0; // TODO use just a local pos variable and pass it into functions 335 // where necessary? 336 337 // set all matchers to work on the input string 338 mXMLDecl.reset(src); 339 mXMLComment.reset(src); 340 mXMLSP.reset(src); 341 mXMLDoctype.reset(src); 342 mXMLPI.reset(src); 343 mXMLElemStart.reset(src); 344 mXMLElemEnd.reset(src); 345 mXMLElemEmpty.reset(src); 346 mXMLCharData.reset(src); 347 mAttrValue.reset(src); 348 mAttrNormalizer.reset(src); 349 mNewLineNormalizer.reset(src); 350 mAmps.reset(src); 351 352 // Consume the XML Declaration, if present. 353 if (mXMLDecl.lookingAt(fPos, status)) { 354 fPos = mXMLDecl.end(status); 355 } 356 357 // Consume "misc" [XML production 27] appearing before DocType 358 parseMisc(status); 359 360 // Consume a DocType declaration, if present. 361 if (mXMLDoctype.lookingAt(fPos, status)) { 362 fPos = mXMLDoctype.end(status); 363 } 364 365 // Consume additional "misc" [XML production 27] appearing after the DocType 366 parseMisc(status); 367 368 // Get the root element 369 if (mXMLElemEmpty.lookingAt(fPos, status)) { 370 // Root is an empty element (no nested elements or content) 371 root = createElement(mXMLElemEmpty, status); 372 fPos = mXMLElemEmpty.end(status); 373 } else { 374 if (mXMLElemStart.lookingAt(fPos, status) == FALSE) { 375 error("Root Element expected", status); 376 goto errorExit; 377 } 378 root = createElement(mXMLElemStart, status); 379 UXMLElement *el = root; 380 381 // 382 // This is the loop that consumes the root element of the document, 383 // including all nested content. Nested elements are handled by 384 // explicit pushes/pops of the element stack; there is no recursion 385 // in the control flow of this code. 386 // "el" always refers to the current element, the one to which content 387 // is being added. It is above the top of the element stack. 388 for (;;) { 389 // Nested Element Start 390 if (mXMLElemStart.lookingAt(fPos, status)) { 391 UXMLElement *t = createElement(mXMLElemStart, status); 392 el->fChildren.addElement(t, status); 393 t->fParent = el; 394 fElementStack.push(el, status); 395 el = t; 396 continue; 397 } 398 399 // Text Content. String is concatenated onto the current node's content, 400 // but only if it contains something other than spaces. 401 UnicodeString s = scanContent(status); 402 if (s.length() > 0) { 403 mXMLSP.reset(s); 404 if (mXMLSP.matches(status) == FALSE) { 405 // This chunk of text contains something other than just 406 // white space. Make a child node for it. 407 replaceCharRefs(s, status); 408 el->fChildren.addElement(s.clone(), status); 409 } 410 mXMLSP.reset(src); // The matchers need to stay set to the main input string. 411 continue; 412 } 413 414 // Comments. Discard. 415 if (mXMLComment.lookingAt(fPos, status)) { 416 fPos = mXMLComment.end(status); 417 continue; 418 } 419 420 // PIs. Discard. 421 if (mXMLPI.lookingAt(fPos, status)) { 422 fPos = mXMLPI.end(status); 423 continue; 424 } 425 426 // Element End 427 if (mXMLElemEnd.lookingAt(fPos, status)) { 428 fPos = mXMLElemEnd.end(0, status); 429 const UnicodeString name = mXMLElemEnd.group(1, status); 430 if (name != *el->fName) { 431 error("Element start / end tag mismatch", status); 432 goto errorExit; 433 } 434 if (fElementStack.empty()) { 435 // Close of the root element. We're done with the doc. 436 el = NULL; 437 break; 438 } 439 el = (UXMLElement *)fElementStack.pop(); 440 continue; 441 } 442 443 // Empty Element. Stored as a child of the current element, but not stacked. 444 if (mXMLElemEmpty.lookingAt(fPos, status)) { 445 UXMLElement *t = createElement(mXMLElemEmpty, status); 446 el->fChildren.addElement(t, status); 447 continue; 448 } 449 450 // Hit something within the document that doesn't match anything. 451 // It's an error. 452 error("Unrecognized markup", status); 453 break; 454 } 455 456 if (el != NULL || !fElementStack.empty()) { 457 // We bailed out early, for some reason. 458 error("Root element not closed.", status); 459 goto errorExit; 460 } 461 } 462 463 // Root Element parse is complete. 464 // Consume the annoying xml "Misc" that can appear at the end of the doc. 465 parseMisc(status); 466 467 // We should have reached the end of the input 468 if (fPos != src.length()) { 469 error("Extra content at the end of the document", status); 470 goto errorExit; 471 } 472 473 // Success! 474 return root; 475 476 errorExit: 477 delete root; 478 return NULL; 479 } 480 481 // 482 // createElement 483 // We've just matched an element start tag. Create and fill in a UXMLElement object 484 // for it. 485 // 486 UXMLElement * 487 UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) { 488 // First capture group is the element's name. 489 UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status); 490 491 // Scan for attributes. 492 int32_t pos = mEl.end(1, status); // The position after the end of the tag name 493 494 while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element. 495 UnicodeString attName = mAttrValue.group(1, status); 496 UnicodeString attValue = mAttrValue.group(2, status); 497 498 // Trim the quotes from the att value. These are left over from the original regex 499 // that parsed the attribue, which couldn't conveniently strip them. 500 attValue.remove(0,1); // one char from the beginning 501 attValue.truncate(attValue.length()-1); // and one from the end. 502 503 // XML Attribue value normalization. 504 // This is one of the really screwy parts of the XML spec. 505 // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize 506 // Note that non-validating parsers must treat all entities as type CDATA 507 // which simplifies things some. 508 509 // Att normalization step 1: normalize any newlines in the attribute value 510 mNewLineNormalizer.reset(attValue); 511 attValue = mNewLineNormalizer.replaceAll(fOneLF, status); 512 513 // Next change all xml white space chars to plain \u0020 spaces. 514 mAttrNormalizer.reset(attValue); 515 UnicodeString oneSpace((UChar)0x0020); 516 attValue = mAttrNormalizer.replaceAll(oneSpace, status); 517 518 // Replace character entities. 519 replaceCharRefs(attValue, status); 520 521 // Save the attribute name and value in our document structure. 522 el->fAttNames.addElement((void *)intern(attName, status), status); 523 el->fAttValues.addElement(attValue.clone(), status); 524 pos = mAttrValue.end(2, status); 525 } 526 fPos = mEl.end(0, status); 527 return el; 528 } 529 530 // 531 // parseMisc 532 // Consume XML "Misc" [production #27] 533 // which is any combination of space, PI and comments 534 // Need to watch end-of-input because xml MISC stuff is allowed after 535 // the document element, so we WILL scan off the end in this function 536 // 537 void 538 UXMLParser::parseMisc(UErrorCode &status) { 539 for (;;) { 540 if (fPos >= mXMLPI.input().length()) { 541 break; 542 } 543 if (mXMLPI.lookingAt(fPos, status)) { 544 fPos = mXMLPI.end(status); 545 continue; 546 } 547 if (mXMLSP.lookingAt(fPos, status)) { 548 fPos = mXMLSP.end(status); 549 continue; 550 } 551 if (mXMLComment.lookingAt(fPos, status)) { 552 fPos = mXMLComment.end(status); 553 continue; 554 } 555 break; 556 } 557 } 558 559 // 560 // Scan for document content. 561 // 562 UnicodeString 563 UXMLParser::scanContent(UErrorCode &status) { 564 UnicodeString result; 565 if (mXMLCharData.lookingAt(fPos, status)) { 566 result = mXMLCharData.group((int32_t)0, status); 567 // Normalize the new-lines. (Before char ref substitution) 568 mNewLineNormalizer.reset(result); 569 result = mNewLineNormalizer.replaceAll(fOneLF, status); 570 571 // TODO: handle CDATA 572 fPos = mXMLCharData.end(0, status); 573 } 574 575 return result; 576 } 577 578 // 579 // replaceCharRefs 580 // 581 // replace the char entities < & { ካ etc. in a string 582 // with the corresponding actual character. 583 // 584 void 585 UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) { 586 UnicodeString result; 587 UnicodeString replacement; 588 int i; 589 590 mAmps.reset(s); 591 // See the initialization for the regex matcher mAmps. 592 // Which entity we've matched is determined by which capture group has content, 593 // which is flaged by start() of that group not being -1. 594 while (mAmps.find()) { 595 if (mAmps.start(1, status) != -1) { 596 replacement.setTo((UChar)x_AMP); 597 } else if (mAmps.start(2, status) != -1) { 598 replacement.setTo((UChar)x_LT); 599 } else if (mAmps.start(3, status) != -1) { 600 replacement.setTo((UChar)x_GT); 601 } else if (mAmps.start(4, status) != -1) { 602 replacement.setTo((UChar)x_APOS); 603 } else if (mAmps.start(5, status) != -1) { 604 replacement.setTo((UChar)x_QUOT); 605 } else if (mAmps.start(6, status) != -1) { 606 UnicodeString hexString = mAmps.group(6, status); 607 UChar32 val = 0; 608 for (i=0; i<hexString.length(); i++) { 609 val = (val << 4) + u_digit(hexString.charAt(i), 16); 610 } 611 // TODO: some verification that the character is valid 612 replacement.setTo(val); 613 } else if (mAmps.start(7, status) != -1) { 614 UnicodeString decimalString = mAmps.group(7, status); 615 UChar32 val = 0; 616 for (i=0; i<decimalString.length(); i++) { 617 val = val*10 + u_digit(decimalString.charAt(i), 10); 618 } 619 // TODO: some verification that the character is valid 620 replacement.setTo(val); 621 } else { 622 // An unrecognized &entity; Leave it alone. 623 // TODO: check that it really looks like an entity, and is not some 624 // random & in the text. 625 replacement = mAmps.group((int32_t)0, status); 626 } 627 mAmps.appendReplacement(result, replacement, status); 628 } 629 mAmps.appendTail(result); 630 s = result; 631 } 632 633 void 634 UXMLParser::error(const char *message, UErrorCode &status) { 635 // TODO: something better here... 636 const UnicodeString &src=mXMLDecl.input(); 637 int line = 0; 638 int ci = 0; 639 while (ci < fPos && ci>=0) { 640 ci = src.indexOf((UChar)0x0a, ci+1); 641 line++; 642 } 643 fprintf(stderr, "Error: %s at line %d\n", message, line); 644 if (U_SUCCESS(status)) { 645 status = U_PARSE_ERROR; 646 } 647 } 648 649 // intern strings like in Java 650 651 const UnicodeString * 652 UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) { 653 const UHashElement *he=fNames.find(s); 654 if(he!=NULL) { 655 // already a known name, return its hashed key pointer 656 return (const UnicodeString *)he->key.pointer; 657 } else { 658 // add this new name and return its hashed key pointer 659 fNames.puti(s, 0, errorCode); 660 he=fNames.find(s); 661 return (const UnicodeString *)he->key.pointer; 662 } 663 } 664 665 const UnicodeString * 666 UXMLParser::findName(const UnicodeString &s) const { 667 const UHashElement *he=fNames.find(s); 668 if(he!=NULL) { 669 // a known name, return its hashed key pointer 670 return (const UnicodeString *)he->key.pointer; 671 } else { 672 // unknown name 673 return NULL; 674 } 675 } 676 677 // UXMLElement ------------------------------------------------------------- *** 678 679 UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) : 680 fParser(parser), 681 fName(name), 682 fAttNames(errorCode), 683 fAttValues(errorCode), 684 fChildren(errorCode), 685 fParent(NULL) 686 { 687 } 688 689 UXMLElement::~UXMLElement() { 690 int i; 691 // attribute names are owned by the UXMLParser, don't delete them here 692 for (i=fAttValues.size()-1; i>=0; i--) { 693 delete (UObject *)fAttValues.elementAt(i); 694 } 695 for (i=fChildren.size()-1; i>=0; i--) { 696 delete (UObject *)fChildren.elementAt(i); 697 } 698 } 699 700 const UnicodeString & 701 UXMLElement::getTagName() const { 702 return *fName; 703 } 704 705 UnicodeString 706 UXMLElement::getText(UBool recurse) const { 707 UnicodeString text; 708 appendText(text, recurse); 709 return text; 710 } 711 712 void 713 UXMLElement::appendText(UnicodeString &text, UBool recurse) const { 714 const UObject *node; 715 int32_t i, count=fChildren.size(); 716 for(i=0; i<count; ++i) { 717 node=(const UObject *)fChildren.elementAt(i); 718 const UnicodeString *s=dynamic_cast<const UnicodeString *>(node); 719 if(s!=NULL) { 720 text.append(*s); 721 } else if(recurse) /* must be a UXMLElement */ { 722 ((const UXMLElement *)node)->appendText(text, recurse); 723 } 724 } 725 } 726 727 int32_t 728 UXMLElement::countAttributes() const { 729 return fAttNames.size(); 730 } 731 732 const UnicodeString * 733 UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const { 734 if(0<=i && i<fAttNames.size()) { 735 name.setTo(*(const UnicodeString *)fAttNames.elementAt(i)); 736 value.setTo(*(const UnicodeString *)fAttValues.elementAt(i)); 737 return &value; // or return (UnicodeString *)fAttValues.elementAt(i); 738 } else { 739 return NULL; 740 } 741 } 742 743 const UnicodeString * 744 UXMLElement::getAttribute(const UnicodeString &name) const { 745 // search for the attribute name by comparing the interned pointer, 746 // not the string contents 747 const UnicodeString *p=fParser->findName(name); 748 if(p==NULL) { 749 return NULL; // no such attribute seen by the parser at all 750 } 751 752 int32_t i, count=fAttNames.size(); 753 for(i=0; i<count; ++i) { 754 if(p==(const UnicodeString *)fAttNames.elementAt(i)) { 755 return (const UnicodeString *)fAttValues.elementAt(i); 756 } 757 } 758 return NULL; 759 } 760 761 int32_t 762 UXMLElement::countChildren() const { 763 return fChildren.size(); 764 } 765 766 const UObject * 767 UXMLElement::getChild(int32_t i, UXMLNodeType &type) const { 768 if(0<=i && i<fChildren.size()) { 769 const UObject *node=(const UObject *)fChildren.elementAt(i); 770 if(dynamic_cast<const UXMLElement *>(node)!=NULL) { 771 type=UXML_NODE_TYPE_ELEMENT; 772 } else { 773 type=UXML_NODE_TYPE_STRING; 774 } 775 return node; 776 } else { 777 return NULL; 778 } 779 } 780 781 const UXMLElement * 782 UXMLElement::nextChildElement(int32_t &i) const { 783 if(i<0) { 784 return NULL; 785 } 786 787 const UObject *node; 788 int32_t count=fChildren.size(); 789 while(i<count) { 790 node=(const UObject *)fChildren.elementAt(i++); 791 const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); 792 if(elem!=NULL) { 793 return elem; 794 } 795 } 796 return NULL; 797 } 798 799 const UXMLElement * 800 UXMLElement::getChildElement(const UnicodeString &name) const { 801 // search for the element name by comparing the interned pointer, 802 // not the string contents 803 const UnicodeString *p=fParser->findName(name); 804 if(p==NULL) { 805 return NULL; // no such element seen by the parser at all 806 } 807 808 const UObject *node; 809 int32_t i, count=fChildren.size(); 810 for(i=0; i<count; ++i) { 811 node=(const UObject *)fChildren.elementAt(i); 812 const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); 813 if(elem!=NULL) { 814 if(p==elem->fName) { 815 return elem; 816 } 817 } 818 } 819 return NULL; 820 } 821 822 U_NAMESPACE_END 823 824 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 825 826