1 /* 2 * HTMLtree.c : implementation of access function for an HTML tree. 3 * 4 * See Copyright for the status of this software. 5 * 6 * daniel (at) veillard.com 7 */ 8 9 10 #define IN_LIBXML 11 #include "libxml.h" 12 #ifdef LIBXML_HTML_ENABLED 13 14 #include <string.h> /* for memset() only ! */ 15 16 #ifdef HAVE_CTYPE_H 17 #include <ctype.h> 18 #endif 19 #ifdef HAVE_STDLIB_H 20 #include <stdlib.h> 21 #endif 22 23 #include <libxml/xmlmemory.h> 24 #include <libxml/HTMLparser.h> 25 #include <libxml/HTMLtree.h> 26 #include <libxml/entities.h> 27 #include <libxml/valid.h> 28 #include <libxml/xmlerror.h> 29 #include <libxml/parserInternals.h> 30 #include <libxml/globals.h> 31 #include <libxml/uri.h> 32 33 #include "buf.h" 34 35 /************************************************************************ 36 * * 37 * Getting/Setting encoding meta tags * 38 * * 39 ************************************************************************/ 40 41 /** 42 * htmlGetMetaEncoding: 43 * @doc: the document 44 * 45 * Encoding definition lookup in the Meta tags 46 * 47 * Returns the current encoding as flagged in the HTML source 48 */ 49 const xmlChar * 50 htmlGetMetaEncoding(htmlDocPtr doc) { 51 htmlNodePtr cur; 52 const xmlChar *content; 53 const xmlChar *encoding; 54 55 if (doc == NULL) 56 return(NULL); 57 cur = doc->children; 58 59 /* 60 * Search the html 61 */ 62 while (cur != NULL) { 63 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 64 if (xmlStrEqual(cur->name, BAD_CAST"html")) 65 break; 66 if (xmlStrEqual(cur->name, BAD_CAST"head")) 67 goto found_head; 68 if (xmlStrEqual(cur->name, BAD_CAST"meta")) 69 goto found_meta; 70 } 71 cur = cur->next; 72 } 73 if (cur == NULL) 74 return(NULL); 75 cur = cur->children; 76 77 /* 78 * Search the head 79 */ 80 while (cur != NULL) { 81 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 82 if (xmlStrEqual(cur->name, BAD_CAST"head")) 83 break; 84 if (xmlStrEqual(cur->name, BAD_CAST"meta")) 85 goto found_meta; 86 } 87 cur = cur->next; 88 } 89 if (cur == NULL) 90 return(NULL); 91 found_head: 92 cur = cur->children; 93 94 /* 95 * Search the meta elements 96 */ 97 found_meta: 98 while (cur != NULL) { 99 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 100 if (xmlStrEqual(cur->name, BAD_CAST"meta")) { 101 xmlAttrPtr attr = cur->properties; 102 int http; 103 const xmlChar *value; 104 105 content = NULL; 106 http = 0; 107 while (attr != NULL) { 108 if ((attr->children != NULL) && 109 (attr->children->type == XML_TEXT_NODE) && 110 (attr->children->next == NULL)) { 111 value = attr->children->content; 112 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) 113 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 114 http = 1; 115 else if ((value != NULL) 116 && (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) 117 content = value; 118 if ((http != 0) && (content != NULL)) 119 goto found_content; 120 } 121 attr = attr->next; 122 } 123 } 124 } 125 cur = cur->next; 126 } 127 return(NULL); 128 129 found_content: 130 encoding = xmlStrstr(content, BAD_CAST"charset="); 131 if (encoding == NULL) 132 encoding = xmlStrstr(content, BAD_CAST"Charset="); 133 if (encoding == NULL) 134 encoding = xmlStrstr(content, BAD_CAST"CHARSET="); 135 if (encoding != NULL) { 136 encoding += 8; 137 } else { 138 encoding = xmlStrstr(content, BAD_CAST"charset ="); 139 if (encoding == NULL) 140 encoding = xmlStrstr(content, BAD_CAST"Charset ="); 141 if (encoding == NULL) 142 encoding = xmlStrstr(content, BAD_CAST"CHARSET ="); 143 if (encoding != NULL) 144 encoding += 9; 145 } 146 if (encoding != NULL) { 147 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; 148 } 149 return(encoding); 150 } 151 152 /** 153 * htmlSetMetaEncoding: 154 * @doc: the document 155 * @encoding: the encoding string 156 * 157 * Sets the current encoding in the Meta tags 158 * NOTE: this will not change the document content encoding, just 159 * the META flag associated. 160 * 161 * Returns 0 in case of success and -1 in case of error 162 */ 163 int 164 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) { 165 htmlNodePtr cur, meta = NULL, head = NULL; 166 const xmlChar *content = NULL; 167 char newcontent[100]; 168 169 newcontent[0] = 0; 170 171 if (doc == NULL) 172 return(-1); 173 174 /* html isn't a real encoding it's just libxml2 way to get entities */ 175 if (!xmlStrcasecmp(encoding, BAD_CAST "html")) 176 return(-1); 177 178 if (encoding != NULL) { 179 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s", 180 (char *)encoding); 181 newcontent[sizeof(newcontent) - 1] = 0; 182 } 183 184 cur = doc->children; 185 186 /* 187 * Search the html 188 */ 189 while (cur != NULL) { 190 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 191 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0) 192 break; 193 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) 194 goto found_head; 195 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) 196 goto found_meta; 197 } 198 cur = cur->next; 199 } 200 if (cur == NULL) 201 return(-1); 202 cur = cur->children; 203 204 /* 205 * Search the head 206 */ 207 while (cur != NULL) { 208 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 209 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) 210 break; 211 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { 212 head = cur->parent; 213 goto found_meta; 214 } 215 } 216 cur = cur->next; 217 } 218 if (cur == NULL) 219 return(-1); 220 found_head: 221 head = cur; 222 if (cur->children == NULL) 223 goto create; 224 cur = cur->children; 225 226 found_meta: 227 /* 228 * Search and update all the remaining the meta elements carrying 229 * encoding informations 230 */ 231 while (cur != NULL) { 232 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 233 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { 234 xmlAttrPtr attr = cur->properties; 235 int http; 236 const xmlChar *value; 237 238 content = NULL; 239 http = 0; 240 while (attr != NULL) { 241 if ((attr->children != NULL) && 242 (attr->children->type == XML_TEXT_NODE) && 243 (attr->children->next == NULL)) { 244 value = attr->children->content; 245 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) 246 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 247 http = 1; 248 else 249 { 250 if ((value != NULL) && 251 (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) 252 content = value; 253 } 254 if ((http != 0) && (content != NULL)) 255 break; 256 } 257 attr = attr->next; 258 } 259 if ((http != 0) && (content != NULL)) { 260 meta = cur; 261 break; 262 } 263 264 } 265 } 266 cur = cur->next; 267 } 268 create: 269 if (meta == NULL) { 270 if ((encoding != NULL) && (head != NULL)) { 271 /* 272 * Create a new Meta element with the right attributes 273 */ 274 275 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); 276 if (head->children == NULL) 277 xmlAddChild(head, meta); 278 else 279 xmlAddPrevSibling(head->children, meta); 280 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); 281 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); 282 } 283 } else { 284 /* remove the meta tag if NULL is passed */ 285 if (encoding == NULL) { 286 xmlUnlinkNode(meta); 287 xmlFreeNode(meta); 288 } 289 /* change the document only if there is a real encoding change */ 290 else if (xmlStrcasestr(content, encoding) == NULL) { 291 xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent); 292 } 293 } 294 295 296 return(0); 297 } 298 299 /** 300 * booleanHTMLAttrs: 301 * 302 * These are the HTML attributes which will be output 303 * in minimized form, i.e. <option selected="selected"> will be 304 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method" 305 * 306 */ 307 static const char* htmlBooleanAttrs[] = { 308 "checked", "compact", "declare", "defer", "disabled", "ismap", 309 "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly", 310 "selected", NULL 311 }; 312 313 314 /** 315 * htmlIsBooleanAttr: 316 * @name: the name of the attribute to check 317 * 318 * Determine if a given attribute is a boolean attribute. 319 * 320 * returns: false if the attribute is not boolean, true otherwise. 321 */ 322 int 323 htmlIsBooleanAttr(const xmlChar *name) 324 { 325 int i = 0; 326 327 while (htmlBooleanAttrs[i] != NULL) { 328 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0) 329 return 1; 330 i++; 331 } 332 return 0; 333 } 334 335 #ifdef LIBXML_OUTPUT_ENABLED 336 /* 337 * private routine exported from xmlIO.c 338 */ 339 xmlOutputBufferPtr 340 xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder); 341 /************************************************************************ 342 * * 343 * Output error handlers * 344 * * 345 ************************************************************************/ 346 /** 347 * htmlSaveErrMemory: 348 * @extra: extra informations 349 * 350 * Handle an out of memory condition 351 */ 352 static void 353 htmlSaveErrMemory(const char *extra) 354 { 355 __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra); 356 } 357 358 /** 359 * htmlSaveErr: 360 * @code: the error number 361 * @node: the location of the error. 362 * @extra: extra informations 363 * 364 * Handle an out of memory condition 365 */ 366 static void 367 htmlSaveErr(int code, xmlNodePtr node, const char *extra) 368 { 369 const char *msg = NULL; 370 371 switch(code) { 372 case XML_SAVE_NOT_UTF8: 373 msg = "string is not in UTF-8\n"; 374 break; 375 case XML_SAVE_CHAR_INVALID: 376 msg = "invalid character value\n"; 377 break; 378 case XML_SAVE_UNKNOWN_ENCODING: 379 msg = "unknown encoding %s\n"; 380 break; 381 case XML_SAVE_NO_DOCTYPE: 382 msg = "HTML has no DOCTYPE\n"; 383 break; 384 default: 385 msg = "unexpected error number\n"; 386 } 387 __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra); 388 } 389 390 /************************************************************************ 391 * * 392 * Dumping HTML tree content to a simple buffer * 393 * * 394 ************************************************************************/ 395 396 /** 397 * htmlBufNodeDumpFormat: 398 * @buf: the xmlBufPtr output 399 * @doc: the document 400 * @cur: the current node 401 * @format: should formatting spaces been added 402 * 403 * Dump an HTML node, recursive behaviour,children are printed too. 404 * 405 * Returns the number of byte written or -1 in case of error 406 */ 407 static size_t 408 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur, 409 int format) { 410 size_t use; 411 int ret; 412 xmlOutputBufferPtr outbuf; 413 414 if (cur == NULL) { 415 return (-1); 416 } 417 if (buf == NULL) { 418 return (-1); 419 } 420 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer)); 421 if (outbuf == NULL) { 422 htmlSaveErrMemory("allocating HTML output buffer"); 423 return (-1); 424 } 425 memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer)); 426 outbuf->buffer = buf; 427 outbuf->encoder = NULL; 428 outbuf->writecallback = NULL; 429 outbuf->closecallback = NULL; 430 outbuf->context = NULL; 431 outbuf->written = 0; 432 433 use = xmlBufUse(buf); 434 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format); 435 xmlFree(outbuf); 436 ret = xmlBufUse(buf) - use; 437 return (ret); 438 } 439 440 /** 441 * htmlNodeDump: 442 * @buf: the HTML buffer output 443 * @doc: the document 444 * @cur: the current node 445 * 446 * Dump an HTML node, recursive behaviour,children are printed too, 447 * and formatting returns are added. 448 * 449 * Returns the number of byte written or -1 in case of error 450 */ 451 int 452 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) { 453 xmlBufPtr buffer; 454 size_t ret; 455 456 if ((buf == NULL) || (cur == NULL)) 457 return(-1); 458 459 xmlInitParser(); 460 buffer = xmlBufFromBuffer(buf); 461 if (buffer == NULL) 462 return(-1); 463 464 ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1); 465 466 xmlBufBackToBuffer(buffer); 467 468 if (ret > INT_MAX) 469 return(-1); 470 return((int) ret); 471 } 472 473 /** 474 * htmlNodeDumpFileFormat: 475 * @out: the FILE pointer 476 * @doc: the document 477 * @cur: the current node 478 * @encoding: the document encoding 479 * @format: should formatting spaces been added 480 * 481 * Dump an HTML node, recursive behaviour,children are printed too. 482 * 483 * TODO: if encoding == NULL try to save in the doc encoding 484 * 485 * returns: the number of byte written or -1 in case of failure. 486 */ 487 int 488 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc, 489 xmlNodePtr cur, const char *encoding, int format) { 490 xmlOutputBufferPtr buf; 491 xmlCharEncodingHandlerPtr handler = NULL; 492 int ret; 493 494 xmlInitParser(); 495 496 if (encoding != NULL) { 497 xmlCharEncoding enc; 498 499 enc = xmlParseCharEncoding(encoding); 500 if (enc != XML_CHAR_ENCODING_UTF8) { 501 handler = xmlFindCharEncodingHandler(encoding); 502 if (handler == NULL) 503 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 504 } 505 } 506 507 /* 508 * Fallback to HTML or ASCII when the encoding is unspecified 509 */ 510 if (handler == NULL) 511 handler = xmlFindCharEncodingHandler("HTML"); 512 if (handler == NULL) 513 handler = xmlFindCharEncodingHandler("ascii"); 514 515 /* 516 * save the content to a temp buffer. 517 */ 518 buf = xmlOutputBufferCreateFile(out, handler); 519 if (buf == NULL) return(0); 520 521 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); 522 523 ret = xmlOutputBufferClose(buf); 524 return(ret); 525 } 526 527 /** 528 * htmlNodeDumpFile: 529 * @out: the FILE pointer 530 * @doc: the document 531 * @cur: the current node 532 * 533 * Dump an HTML node, recursive behaviour,children are printed too, 534 * and formatting returns are added. 535 */ 536 void 537 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) { 538 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1); 539 } 540 541 /** 542 * htmlDocDumpMemoryFormat: 543 * @cur: the document 544 * @mem: OUT: the memory pointer 545 * @size: OUT: the memory length 546 * @format: should formatting spaces been added 547 * 548 * Dump an HTML document in memory and return the xmlChar * and it's size. 549 * It's up to the caller to free the memory. 550 */ 551 void 552 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) { 553 xmlOutputBufferPtr buf; 554 xmlCharEncodingHandlerPtr handler = NULL; 555 const char *encoding; 556 557 xmlInitParser(); 558 559 if ((mem == NULL) || (size == NULL)) 560 return; 561 if (cur == NULL) { 562 *mem = NULL; 563 *size = 0; 564 return; 565 } 566 567 encoding = (const char *) htmlGetMetaEncoding(cur); 568 569 if (encoding != NULL) { 570 xmlCharEncoding enc; 571 572 enc = xmlParseCharEncoding(encoding); 573 if (enc != cur->charset) { 574 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 575 /* 576 * Not supported yet 577 */ 578 *mem = NULL; 579 *size = 0; 580 return; 581 } 582 583 handler = xmlFindCharEncodingHandler(encoding); 584 if (handler == NULL) 585 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 586 587 } else { 588 handler = xmlFindCharEncodingHandler(encoding); 589 } 590 } 591 592 /* 593 * Fallback to HTML or ASCII when the encoding is unspecified 594 */ 595 if (handler == NULL) 596 handler = xmlFindCharEncodingHandler("HTML"); 597 if (handler == NULL) 598 handler = xmlFindCharEncodingHandler("ascii"); 599 600 buf = xmlAllocOutputBufferInternal(handler); 601 if (buf == NULL) { 602 *mem = NULL; 603 *size = 0; 604 return; 605 } 606 607 htmlDocContentDumpFormatOutput(buf, cur, NULL, format); 608 609 xmlOutputBufferFlush(buf); 610 if (buf->conv != NULL) { 611 *size = xmlBufUse(buf->conv); 612 *mem = xmlStrndup(xmlBufContent(buf->conv), *size); 613 } else { 614 *size = xmlBufUse(buf->buffer); 615 *mem = xmlStrndup(xmlBufContent(buf->buffer), *size); 616 } 617 (void)xmlOutputBufferClose(buf); 618 } 619 620 /** 621 * htmlDocDumpMemory: 622 * @cur: the document 623 * @mem: OUT: the memory pointer 624 * @size: OUT: the memory length 625 * 626 * Dump an HTML document in memory and return the xmlChar * and it's size. 627 * It's up to the caller to free the memory. 628 */ 629 void 630 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) { 631 htmlDocDumpMemoryFormat(cur, mem, size, 1); 632 } 633 634 635 /************************************************************************ 636 * * 637 * Dumping HTML tree content to an I/O output buffer * 638 * * 639 ************************************************************************/ 640 641 void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur); 642 643 /** 644 * htmlDtdDumpOutput: 645 * @buf: the HTML buffer output 646 * @doc: the document 647 * @encoding: the encoding string 648 * 649 * TODO: check whether encoding is needed 650 * 651 * Dump the HTML document DTD, if any. 652 */ 653 static void 654 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 655 const char *encoding ATTRIBUTE_UNUSED) { 656 xmlDtdPtr cur = doc->intSubset; 657 658 if (cur == NULL) { 659 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL); 660 return; 661 } 662 xmlOutputBufferWriteString(buf, "<!DOCTYPE "); 663 xmlOutputBufferWriteString(buf, (const char *)cur->name); 664 if (cur->ExternalID != NULL) { 665 xmlOutputBufferWriteString(buf, " PUBLIC "); 666 xmlBufWriteQuotedString(buf->buffer, cur->ExternalID); 667 if (cur->SystemID != NULL) { 668 xmlOutputBufferWriteString(buf, " "); 669 xmlBufWriteQuotedString(buf->buffer, cur->SystemID); 670 } 671 } else if (cur->SystemID != NULL) { 672 xmlOutputBufferWriteString(buf, " SYSTEM "); 673 xmlBufWriteQuotedString(buf->buffer, cur->SystemID); 674 } 675 xmlOutputBufferWriteString(buf, ">\n"); 676 } 677 678 /** 679 * htmlAttrDumpOutput: 680 * @buf: the HTML buffer output 681 * @doc: the document 682 * @cur: the attribute pointer 683 * @encoding: the encoding string 684 * 685 * Dump an HTML attribute 686 */ 687 static void 688 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, 689 const char *encoding ATTRIBUTE_UNUSED) { 690 xmlChar *value; 691 692 /* 693 * The html output method should not escape a & character 694 * occurring in an attribute value immediately followed by 695 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation). 696 * This is implemented in xmlEncodeEntitiesReentrant 697 */ 698 699 if (cur == NULL) { 700 return; 701 } 702 xmlOutputBufferWriteString(buf, " "); 703 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 704 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 705 xmlOutputBufferWriteString(buf, ":"); 706 } 707 xmlOutputBufferWriteString(buf, (const char *)cur->name); 708 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) { 709 value = xmlNodeListGetString(doc, cur->children, 0); 710 if (value) { 711 xmlOutputBufferWriteString(buf, "="); 712 if ((cur->ns == NULL) && (cur->parent != NULL) && 713 (cur->parent->ns == NULL) && 714 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) || 715 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) || 716 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) || 717 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) && 718 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) { 719 xmlChar *escaped; 720 xmlChar *tmp = value; 721 722 while (IS_BLANK_CH(*tmp)) tmp++; 723 724 /* 725 * the < and > have already been escaped at the entity level 726 * And doing so here breaks server side includes 727 */ 728 escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+<>"); 729 if (escaped != NULL) { 730 xmlBufWriteQuotedString(buf->buffer, escaped); 731 xmlFree(escaped); 732 } else { 733 xmlBufWriteQuotedString(buf->buffer, value); 734 } 735 } else { 736 xmlBufWriteQuotedString(buf->buffer, value); 737 } 738 xmlFree(value); 739 } else { 740 xmlOutputBufferWriteString(buf, "=\"\""); 741 } 742 } 743 } 744 745 /** 746 * htmlAttrListDumpOutput: 747 * @buf: the HTML buffer output 748 * @doc: the document 749 * @cur: the first attribute pointer 750 * @encoding: the encoding string 751 * 752 * Dump a list of HTML attributes 753 */ 754 static void 755 htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) { 756 if (cur == NULL) { 757 return; 758 } 759 while (cur != NULL) { 760 htmlAttrDumpOutput(buf, doc, cur, encoding); 761 cur = cur->next; 762 } 763 } 764 765 766 767 /** 768 * htmlNodeListDumpOutput: 769 * @buf: the HTML buffer output 770 * @doc: the document 771 * @cur: the first node 772 * @encoding: the encoding string 773 * @format: should formatting spaces been added 774 * 775 * Dump an HTML node list, recursive behaviour,children are printed too. 776 */ 777 static void 778 htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 779 xmlNodePtr cur, const char *encoding, int format) { 780 if (cur == NULL) { 781 return; 782 } 783 while (cur != NULL) { 784 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); 785 cur = cur->next; 786 } 787 } 788 789 /** 790 * htmlNodeDumpFormatOutput: 791 * @buf: the HTML buffer output 792 * @doc: the document 793 * @cur: the current node 794 * @encoding: the encoding string 795 * @format: should formatting spaces been added 796 * 797 * Dump an HTML node, recursive behaviour,children are printed too. 798 */ 799 void 800 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 801 xmlNodePtr cur, const char *encoding, int format) { 802 const htmlElemDesc * info; 803 804 xmlInitParser(); 805 806 if ((cur == NULL) || (buf == NULL)) { 807 return; 808 } 809 /* 810 * Special cases. 811 */ 812 if (cur->type == XML_DTD_NODE) 813 return; 814 if ((cur->type == XML_HTML_DOCUMENT_NODE) || 815 (cur->type == XML_DOCUMENT_NODE)){ 816 htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding); 817 return; 818 } 819 if (cur->type == XML_ATTRIBUTE_NODE) { 820 htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding); 821 return; 822 } 823 if (cur->type == HTML_TEXT_NODE) { 824 if (cur->content != NULL) { 825 if (((cur->name == (const xmlChar *)xmlStringText) || 826 (cur->name != (const xmlChar *)xmlStringTextNoenc)) && 827 ((cur->parent == NULL) || 828 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) && 829 (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) { 830 xmlChar *buffer; 831 832 buffer = xmlEncodeEntitiesReentrant(doc, cur->content); 833 if (buffer != NULL) { 834 xmlOutputBufferWriteString(buf, (const char *)buffer); 835 xmlFree(buffer); 836 } 837 } else { 838 xmlOutputBufferWriteString(buf, (const char *)cur->content); 839 } 840 } 841 return; 842 } 843 if (cur->type == HTML_COMMENT_NODE) { 844 if (cur->content != NULL) { 845 xmlOutputBufferWriteString(buf, "<!--"); 846 xmlOutputBufferWriteString(buf, (const char *)cur->content); 847 xmlOutputBufferWriteString(buf, "-->"); 848 } 849 return; 850 } 851 if (cur->type == HTML_PI_NODE) { 852 if (cur->name == NULL) 853 return; 854 xmlOutputBufferWriteString(buf, "<?"); 855 xmlOutputBufferWriteString(buf, (const char *)cur->name); 856 if (cur->content != NULL) { 857 xmlOutputBufferWriteString(buf, " "); 858 xmlOutputBufferWriteString(buf, (const char *)cur->content); 859 } 860 xmlOutputBufferWriteString(buf, ">"); 861 return; 862 } 863 if (cur->type == HTML_ENTITY_REF_NODE) { 864 xmlOutputBufferWriteString(buf, "&"); 865 xmlOutputBufferWriteString(buf, (const char *)cur->name); 866 xmlOutputBufferWriteString(buf, ";"); 867 return; 868 } 869 if (cur->type == HTML_PRESERVE_NODE) { 870 if (cur->content != NULL) { 871 xmlOutputBufferWriteString(buf, (const char *)cur->content); 872 } 873 return; 874 } 875 876 /* 877 * Get specific HTML info for that node. 878 */ 879 if (cur->ns == NULL) 880 info = htmlTagLookup(cur->name); 881 else 882 info = NULL; 883 884 xmlOutputBufferWriteString(buf, "<"); 885 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 886 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 887 xmlOutputBufferWriteString(buf, ":"); 888 } 889 xmlOutputBufferWriteString(buf, (const char *)cur->name); 890 if (cur->nsDef) 891 xmlNsListDumpOutput(buf, cur->nsDef); 892 if (cur->properties != NULL) 893 htmlAttrListDumpOutput(buf, doc, cur->properties, encoding); 894 895 if ((info != NULL) && (info->empty)) { 896 xmlOutputBufferWriteString(buf, ">"); 897 if ((format) && (!info->isinline) && (cur->next != NULL)) { 898 if ((cur->next->type != HTML_TEXT_NODE) && 899 (cur->next->type != HTML_ENTITY_REF_NODE) && 900 (cur->parent != NULL) && 901 (cur->parent->name != NULL) && 902 (cur->parent->name[0] != 'p')) /* p, pre, param */ 903 xmlOutputBufferWriteString(buf, "\n"); 904 } 905 return; 906 } 907 if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) && 908 (cur->children == NULL)) { 909 if ((info != NULL) && (info->saveEndTag != 0) && 910 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) && 911 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) { 912 xmlOutputBufferWriteString(buf, ">"); 913 } else { 914 xmlOutputBufferWriteString(buf, "></"); 915 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 916 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 917 xmlOutputBufferWriteString(buf, ":"); 918 } 919 xmlOutputBufferWriteString(buf, (const char *)cur->name); 920 xmlOutputBufferWriteString(buf, ">"); 921 } 922 if ((format) && (cur->next != NULL) && 923 (info != NULL) && (!info->isinline)) { 924 if ((cur->next->type != HTML_TEXT_NODE) && 925 (cur->next->type != HTML_ENTITY_REF_NODE) && 926 (cur->parent != NULL) && 927 (cur->parent->name != NULL) && 928 (cur->parent->name[0] != 'p')) /* p, pre, param */ 929 xmlOutputBufferWriteString(buf, "\n"); 930 } 931 return; 932 } 933 xmlOutputBufferWriteString(buf, ">"); 934 if ((cur->type != XML_ELEMENT_NODE) && 935 (cur->content != NULL)) { 936 /* 937 * Uses the OutputBuffer property to automatically convert 938 * invalids to charrefs 939 */ 940 941 xmlOutputBufferWriteString(buf, (const char *) cur->content); 942 } 943 if (cur->children != NULL) { 944 if ((format) && (info != NULL) && (!info->isinline) && 945 (cur->children->type != HTML_TEXT_NODE) && 946 (cur->children->type != HTML_ENTITY_REF_NODE) && 947 (cur->children != cur->last) && 948 (cur->name != NULL) && 949 (cur->name[0] != 'p')) /* p, pre, param */ 950 xmlOutputBufferWriteString(buf, "\n"); 951 htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format); 952 if ((format) && (info != NULL) && (!info->isinline) && 953 (cur->last->type != HTML_TEXT_NODE) && 954 (cur->last->type != HTML_ENTITY_REF_NODE) && 955 (cur->children != cur->last) && 956 (cur->name != NULL) && 957 (cur->name[0] != 'p')) /* p, pre, param */ 958 xmlOutputBufferWriteString(buf, "\n"); 959 } 960 xmlOutputBufferWriteString(buf, "</"); 961 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 962 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 963 xmlOutputBufferWriteString(buf, ":"); 964 } 965 xmlOutputBufferWriteString(buf, (const char *)cur->name); 966 xmlOutputBufferWriteString(buf, ">"); 967 if ((format) && (info != NULL) && (!info->isinline) && 968 (cur->next != NULL)) { 969 if ((cur->next->type != HTML_TEXT_NODE) && 970 (cur->next->type != HTML_ENTITY_REF_NODE) && 971 (cur->parent != NULL) && 972 (cur->parent->name != NULL) && 973 (cur->parent->name[0] != 'p')) /* p, pre, param */ 974 xmlOutputBufferWriteString(buf, "\n"); 975 } 976 } 977 978 /** 979 * htmlNodeDumpOutput: 980 * @buf: the HTML buffer output 981 * @doc: the document 982 * @cur: the current node 983 * @encoding: the encoding string 984 * 985 * Dump an HTML node, recursive behaviour,children are printed too, 986 * and formatting returns/spaces are added. 987 */ 988 void 989 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 990 xmlNodePtr cur, const char *encoding) { 991 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1); 992 } 993 994 /** 995 * htmlDocContentDumpFormatOutput: 996 * @buf: the HTML buffer output 997 * @cur: the document 998 * @encoding: the encoding string 999 * @format: should formatting spaces been added 1000 * 1001 * Dump an HTML document. 1002 */ 1003 void 1004 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, 1005 const char *encoding, int format) { 1006 int type; 1007 1008 xmlInitParser(); 1009 1010 if ((buf == NULL) || (cur == NULL)) 1011 return; 1012 1013 /* 1014 * force to output the stuff as HTML, especially for entities 1015 */ 1016 type = cur->type; 1017 cur->type = XML_HTML_DOCUMENT_NODE; 1018 if (cur->intSubset != NULL) { 1019 htmlDtdDumpOutput(buf, cur, NULL); 1020 } 1021 if (cur->children != NULL) { 1022 htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format); 1023 } 1024 xmlOutputBufferWriteString(buf, "\n"); 1025 cur->type = (xmlElementType) type; 1026 } 1027 1028 /** 1029 * htmlDocContentDumpOutput: 1030 * @buf: the HTML buffer output 1031 * @cur: the document 1032 * @encoding: the encoding string 1033 * 1034 * Dump an HTML document. Formating return/spaces are added. 1035 */ 1036 void 1037 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, 1038 const char *encoding) { 1039 htmlDocContentDumpFormatOutput(buf, cur, encoding, 1); 1040 } 1041 1042 /************************************************************************ 1043 * * 1044 * Saving functions front-ends * 1045 * * 1046 ************************************************************************/ 1047 1048 /** 1049 * htmlDocDump: 1050 * @f: the FILE* 1051 * @cur: the document 1052 * 1053 * Dump an HTML document to an open FILE. 1054 * 1055 * returns: the number of byte written or -1 in case of failure. 1056 */ 1057 int 1058 htmlDocDump(FILE *f, xmlDocPtr cur) { 1059 xmlOutputBufferPtr buf; 1060 xmlCharEncodingHandlerPtr handler = NULL; 1061 const char *encoding; 1062 int ret; 1063 1064 xmlInitParser(); 1065 1066 if ((cur == NULL) || (f == NULL)) { 1067 return(-1); 1068 } 1069 1070 encoding = (const char *) htmlGetMetaEncoding(cur); 1071 1072 if (encoding != NULL) { 1073 xmlCharEncoding enc; 1074 1075 enc = xmlParseCharEncoding(encoding); 1076 if (enc != cur->charset) { 1077 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 1078 /* 1079 * Not supported yet 1080 */ 1081 return(-1); 1082 } 1083 1084 handler = xmlFindCharEncodingHandler(encoding); 1085 if (handler == NULL) 1086 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1087 } else { 1088 handler = xmlFindCharEncodingHandler(encoding); 1089 } 1090 } 1091 1092 /* 1093 * Fallback to HTML or ASCII when the encoding is unspecified 1094 */ 1095 if (handler == NULL) 1096 handler = xmlFindCharEncodingHandler("HTML"); 1097 if (handler == NULL) 1098 handler = xmlFindCharEncodingHandler("ascii"); 1099 1100 buf = xmlOutputBufferCreateFile(f, handler); 1101 if (buf == NULL) return(-1); 1102 htmlDocContentDumpOutput(buf, cur, NULL); 1103 1104 ret = xmlOutputBufferClose(buf); 1105 return(ret); 1106 } 1107 1108 /** 1109 * htmlSaveFile: 1110 * @filename: the filename (or URL) 1111 * @cur: the document 1112 * 1113 * Dump an HTML document to a file. If @filename is "-" the stdout file is 1114 * used. 1115 * returns: the number of byte written or -1 in case of failure. 1116 */ 1117 int 1118 htmlSaveFile(const char *filename, xmlDocPtr cur) { 1119 xmlOutputBufferPtr buf; 1120 xmlCharEncodingHandlerPtr handler = NULL; 1121 const char *encoding; 1122 int ret; 1123 1124 if ((cur == NULL) || (filename == NULL)) 1125 return(-1); 1126 1127 xmlInitParser(); 1128 1129 encoding = (const char *) htmlGetMetaEncoding(cur); 1130 1131 if (encoding != NULL) { 1132 xmlCharEncoding enc; 1133 1134 enc = xmlParseCharEncoding(encoding); 1135 if (enc != cur->charset) { 1136 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 1137 /* 1138 * Not supported yet 1139 */ 1140 return(-1); 1141 } 1142 1143 handler = xmlFindCharEncodingHandler(encoding); 1144 if (handler == NULL) 1145 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1146 } 1147 } 1148 1149 /* 1150 * Fallback to HTML or ASCII when the encoding is unspecified 1151 */ 1152 if (handler == NULL) 1153 handler = xmlFindCharEncodingHandler("HTML"); 1154 if (handler == NULL) 1155 handler = xmlFindCharEncodingHandler("ascii"); 1156 1157 /* 1158 * save the content to a temp buffer. 1159 */ 1160 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression); 1161 if (buf == NULL) return(0); 1162 1163 htmlDocContentDumpOutput(buf, cur, NULL); 1164 1165 ret = xmlOutputBufferClose(buf); 1166 return(ret); 1167 } 1168 1169 /** 1170 * htmlSaveFileFormat: 1171 * @filename: the filename 1172 * @cur: the document 1173 * @format: should formatting spaces been added 1174 * @encoding: the document encoding 1175 * 1176 * Dump an HTML document to a file using a given encoding. 1177 * 1178 * returns: the number of byte written or -1 in case of failure. 1179 */ 1180 int 1181 htmlSaveFileFormat(const char *filename, xmlDocPtr cur, 1182 const char *encoding, int format) { 1183 xmlOutputBufferPtr buf; 1184 xmlCharEncodingHandlerPtr handler = NULL; 1185 int ret; 1186 1187 if ((cur == NULL) || (filename == NULL)) 1188 return(-1); 1189 1190 xmlInitParser(); 1191 1192 if (encoding != NULL) { 1193 xmlCharEncoding enc; 1194 1195 enc = xmlParseCharEncoding(encoding); 1196 if (enc != cur->charset) { 1197 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 1198 /* 1199 * Not supported yet 1200 */ 1201 return(-1); 1202 } 1203 1204 handler = xmlFindCharEncodingHandler(encoding); 1205 if (handler == NULL) 1206 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1207 } 1208 htmlSetMetaEncoding(cur, (const xmlChar *) encoding); 1209 } else { 1210 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8"); 1211 } 1212 1213 /* 1214 * Fallback to HTML or ASCII when the encoding is unspecified 1215 */ 1216 if (handler == NULL) 1217 handler = xmlFindCharEncodingHandler("HTML"); 1218 if (handler == NULL) 1219 handler = xmlFindCharEncodingHandler("ascii"); 1220 1221 /* 1222 * save the content to a temp buffer. 1223 */ 1224 buf = xmlOutputBufferCreateFilename(filename, handler, 0); 1225 if (buf == NULL) return(0); 1226 1227 htmlDocContentDumpFormatOutput(buf, cur, encoding, format); 1228 1229 ret = xmlOutputBufferClose(buf); 1230 return(ret); 1231 } 1232 1233 /** 1234 * htmlSaveFileEnc: 1235 * @filename: the filename 1236 * @cur: the document 1237 * @encoding: the document encoding 1238 * 1239 * Dump an HTML document to a file using a given encoding 1240 * and formatting returns/spaces are added. 1241 * 1242 * returns: the number of byte written or -1 in case of failure. 1243 */ 1244 int 1245 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) { 1246 return(htmlSaveFileFormat(filename, cur, encoding, 1)); 1247 } 1248 1249 #endif /* LIBXML_OUTPUT_ENABLED */ 1250 1251 #define bottom_HTMLtree 1252 #include "elfgcchack.h" 1253 #endif /* LIBXML_HTML_ENABLED */ 1254