1 /* 2 * HTMLtree.c : implementation of access function for an HTML tree. 3 * 4 * See Copyright for the status of this software. 5 * 6 * daniel (at) veillard.com 7 */ 8 9 10 #define IN_LIBXML 11 #include "libxml.h" 12 #ifdef LIBXML_HTML_ENABLED 13 14 #include <string.h> /* for memset() only ! */ 15 16 #ifdef HAVE_CTYPE_H 17 #include <ctype.h> 18 #endif 19 #ifdef HAVE_STDLIB_H 20 #include <stdlib.h> 21 #endif 22 23 #include <libxml/xmlmemory.h> 24 #include <libxml/HTMLparser.h> 25 #include <libxml/HTMLtree.h> 26 #include <libxml/entities.h> 27 #include <libxml/valid.h> 28 #include <libxml/xmlerror.h> 29 #include <libxml/parserInternals.h> 30 #include <libxml/globals.h> 31 #include <libxml/uri.h> 32 33 /************************************************************************ 34 * * 35 * Getting/Setting encoding meta tags * 36 * * 37 ************************************************************************/ 38 39 /** 40 * htmlGetMetaEncoding: 41 * @doc: the document 42 * 43 * Encoding definition lookup in the Meta tags 44 * 45 * Returns the current encoding as flagged in the HTML source 46 */ 47 const xmlChar * 48 htmlGetMetaEncoding(htmlDocPtr doc) { 49 htmlNodePtr cur; 50 const xmlChar *content; 51 const xmlChar *encoding; 52 53 if (doc == NULL) 54 return(NULL); 55 cur = doc->children; 56 57 /* 58 * Search the html 59 */ 60 while (cur != NULL) { 61 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 62 if (xmlStrEqual(cur->name, BAD_CAST"html")) 63 break; 64 if (xmlStrEqual(cur->name, BAD_CAST"head")) 65 goto found_head; 66 if (xmlStrEqual(cur->name, BAD_CAST"meta")) 67 goto found_meta; 68 } 69 cur = cur->next; 70 } 71 if (cur == NULL) 72 return(NULL); 73 cur = cur->children; 74 75 /* 76 * Search the head 77 */ 78 while (cur != NULL) { 79 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 80 if (xmlStrEqual(cur->name, BAD_CAST"head")) 81 break; 82 if (xmlStrEqual(cur->name, BAD_CAST"meta")) 83 goto found_meta; 84 } 85 cur = cur->next; 86 } 87 if (cur == NULL) 88 return(NULL); 89 found_head: 90 cur = cur->children; 91 92 /* 93 * Search the meta elements 94 */ 95 found_meta: 96 while (cur != NULL) { 97 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 98 if (xmlStrEqual(cur->name, BAD_CAST"meta")) { 99 xmlAttrPtr attr = cur->properties; 100 int http; 101 const xmlChar *value; 102 103 content = NULL; 104 http = 0; 105 while (attr != NULL) { 106 if ((attr->children != NULL) && 107 (attr->children->type == XML_TEXT_NODE) && 108 (attr->children->next == NULL)) { 109 value = attr->children->content; 110 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) 111 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 112 http = 1; 113 else if ((value != NULL) 114 && (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) 115 content = value; 116 if ((http != 0) && (content != NULL)) 117 goto found_content; 118 } 119 attr = attr->next; 120 } 121 } 122 } 123 cur = cur->next; 124 } 125 return(NULL); 126 127 found_content: 128 encoding = xmlStrstr(content, BAD_CAST"charset="); 129 if (encoding == NULL) 130 encoding = xmlStrstr(content, BAD_CAST"Charset="); 131 if (encoding == NULL) 132 encoding = xmlStrstr(content, BAD_CAST"CHARSET="); 133 if (encoding != NULL) { 134 encoding += 8; 135 } else { 136 encoding = xmlStrstr(content, BAD_CAST"charset ="); 137 if (encoding == NULL) 138 encoding = xmlStrstr(content, BAD_CAST"Charset ="); 139 if (encoding == NULL) 140 encoding = xmlStrstr(content, BAD_CAST"CHARSET ="); 141 if (encoding != NULL) 142 encoding += 9; 143 } 144 if (encoding != NULL) { 145 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; 146 } 147 return(encoding); 148 } 149 150 /** 151 * htmlSetMetaEncoding: 152 * @doc: the document 153 * @encoding: the encoding string 154 * 155 * Sets the current encoding in the Meta tags 156 * NOTE: this will not change the document content encoding, just 157 * the META flag associated. 158 * 159 * Returns 0 in case of success and -1 in case of error 160 */ 161 int 162 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) { 163 htmlNodePtr cur, meta; 164 const xmlChar *content; 165 char newcontent[100]; 166 167 168 if (doc == NULL) 169 return(-1); 170 171 if (encoding != NULL) { 172 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s", 173 (char *)encoding); 174 newcontent[sizeof(newcontent) - 1] = 0; 175 } 176 177 cur = doc->children; 178 179 /* 180 * Search the html 181 */ 182 while (cur != NULL) { 183 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 184 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0) 185 break; 186 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) 187 goto found_head; 188 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) 189 goto found_meta; 190 } 191 cur = cur->next; 192 } 193 if (cur == NULL) 194 return(-1); 195 cur = cur->children; 196 197 /* 198 * Search the head 199 */ 200 while (cur != NULL) { 201 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 202 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) 203 break; 204 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) 205 goto found_meta; 206 } 207 cur = cur->next; 208 } 209 if (cur == NULL) 210 return(-1); 211 found_head: 212 if (cur->children == NULL) { 213 if (encoding == NULL) 214 return(0); 215 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); 216 xmlAddChild(cur, meta); 217 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); 218 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); 219 return(0); 220 } 221 cur = cur->children; 222 223 found_meta: 224 if (encoding != NULL) { 225 /* 226 * Create a new Meta element with the right attributes 227 */ 228 229 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); 230 xmlAddPrevSibling(cur, meta); 231 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); 232 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); 233 } 234 235 /* 236 * Search and destroy all the remaining the meta elements carrying 237 * encoding informations 238 */ 239 while (cur != NULL) { 240 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 241 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { 242 xmlAttrPtr attr = cur->properties; 243 int http; 244 const xmlChar *value; 245 246 content = NULL; 247 http = 0; 248 while (attr != NULL) { 249 if ((attr->children != NULL) && 250 (attr->children->type == XML_TEXT_NODE) && 251 (attr->children->next == NULL)) { 252 value = attr->children->content; 253 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) 254 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 255 http = 1; 256 else 257 { 258 if ((value != NULL) && 259 (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) 260 content = value; 261 } 262 if ((http != 0) && (content != NULL)) 263 break; 264 } 265 attr = attr->next; 266 } 267 if ((http != 0) && (content != NULL)) { 268 meta = cur; 269 cur = cur->next; 270 xmlUnlinkNode(meta); 271 xmlFreeNode(meta); 272 continue; 273 } 274 275 } 276 } 277 cur = cur->next; 278 } 279 return(0); 280 } 281 282 /** 283 * booleanHTMLAttrs: 284 * 285 * These are the HTML attributes which will be output 286 * in minimized form, i.e. <option selected="selected"> will be 287 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method" 288 * 289 */ 290 static const char* htmlBooleanAttrs[] = { 291 "checked", "compact", "declare", "defer", "disabled", "ismap", 292 "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly", 293 "selected", NULL 294 }; 295 296 297 /** 298 * htmlIsBooleanAttr: 299 * @name: the name of the attribute to check 300 * 301 * Determine if a given attribute is a boolean attribute. 302 * 303 * returns: false if the attribute is not boolean, true otherwise. 304 */ 305 int 306 htmlIsBooleanAttr(const xmlChar *name) 307 { 308 int i = 0; 309 310 while (htmlBooleanAttrs[i] != NULL) { 311 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0) 312 return 1; 313 i++; 314 } 315 return 0; 316 } 317 318 #ifdef LIBXML_OUTPUT_ENABLED 319 /* 320 * private routine exported from xmlIO.c 321 */ 322 xmlOutputBufferPtr 323 xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder); 324 /************************************************************************ 325 * * 326 * Output error handlers * 327 * * 328 ************************************************************************/ 329 /** 330 * htmlSaveErrMemory: 331 * @extra: extra informations 332 * 333 * Handle an out of memory condition 334 */ 335 static void 336 htmlSaveErrMemory(const char *extra) 337 { 338 __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra); 339 } 340 341 /** 342 * htmlSaveErr: 343 * @code: the error number 344 * @node: the location of the error. 345 * @extra: extra informations 346 * 347 * Handle an out of memory condition 348 */ 349 static void 350 htmlSaveErr(int code, xmlNodePtr node, const char *extra) 351 { 352 const char *msg = NULL; 353 354 switch(code) { 355 case XML_SAVE_NOT_UTF8: 356 msg = "string is not in UTF-8\n"; 357 break; 358 case XML_SAVE_CHAR_INVALID: 359 msg = "invalid character value\n"; 360 break; 361 case XML_SAVE_UNKNOWN_ENCODING: 362 msg = "unknown encoding %s\n"; 363 break; 364 case XML_SAVE_NO_DOCTYPE: 365 msg = "HTML has no DOCTYPE\n"; 366 break; 367 default: 368 msg = "unexpected error number\n"; 369 } 370 __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra); 371 } 372 373 /************************************************************************ 374 * * 375 * Dumping HTML tree content to a simple buffer * 376 * * 377 ************************************************************************/ 378 379 static int 380 htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, 381 int format); 382 383 /** 384 * htmlNodeDumpFormat: 385 * @buf: the HTML buffer output 386 * @doc: the document 387 * @cur: the current node 388 * @format: should formatting spaces been added 389 * 390 * Dump an HTML node, recursive behaviour,children are printed too. 391 * 392 * Returns the number of byte written or -1 in case of error 393 */ 394 static int 395 htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, 396 int format) { 397 unsigned int use; 398 int ret; 399 xmlOutputBufferPtr outbuf; 400 401 if (cur == NULL) { 402 return (-1); 403 } 404 if (buf == NULL) { 405 return (-1); 406 } 407 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer)); 408 if (outbuf == NULL) { 409 htmlSaveErrMemory("allocating HTML output buffer"); 410 return (-1); 411 } 412 memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer)); 413 outbuf->buffer = buf; 414 outbuf->encoder = NULL; 415 outbuf->writecallback = NULL; 416 outbuf->closecallback = NULL; 417 outbuf->context = NULL; 418 outbuf->written = 0; 419 420 use = buf->use; 421 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format); 422 xmlFree(outbuf); 423 ret = buf->use - use; 424 return (ret); 425 } 426 427 /** 428 * htmlNodeDump: 429 * @buf: the HTML buffer output 430 * @doc: the document 431 * @cur: the current node 432 * 433 * Dump an HTML node, recursive behaviour,children are printed too, 434 * and formatting returns are added. 435 * 436 * Returns the number of byte written or -1 in case of error 437 */ 438 int 439 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) { 440 xmlInitParser(); 441 442 return(htmlNodeDumpFormat(buf, doc, cur, 1)); 443 } 444 445 /** 446 * htmlNodeDumpFileFormat: 447 * @out: the FILE pointer 448 * @doc: the document 449 * @cur: the current node 450 * @encoding: the document encoding 451 * @format: should formatting spaces been added 452 * 453 * Dump an HTML node, recursive behaviour,children are printed too. 454 * 455 * TODO: if encoding == NULL try to save in the doc encoding 456 * 457 * returns: the number of byte written or -1 in case of failure. 458 */ 459 int 460 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc, 461 xmlNodePtr cur, const char *encoding, int format) { 462 xmlOutputBufferPtr buf; 463 xmlCharEncodingHandlerPtr handler = NULL; 464 int ret; 465 466 xmlInitParser(); 467 468 if (encoding != NULL) { 469 xmlCharEncoding enc; 470 471 enc = xmlParseCharEncoding(encoding); 472 if (enc != XML_CHAR_ENCODING_UTF8) { 473 handler = xmlFindCharEncodingHandler(encoding); 474 if (handler == NULL) 475 return(-1); 476 } 477 } 478 479 /* 480 * Fallback to HTML or ASCII when the encoding is unspecified 481 */ 482 if (handler == NULL) 483 handler = xmlFindCharEncodingHandler("HTML"); 484 if (handler == NULL) 485 handler = xmlFindCharEncodingHandler("ascii"); 486 487 /* 488 * save the content to a temp buffer. 489 */ 490 buf = xmlOutputBufferCreateFile(out, handler); 491 if (buf == NULL) return(0); 492 493 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); 494 495 ret = xmlOutputBufferClose(buf); 496 return(ret); 497 } 498 499 /** 500 * htmlNodeDumpFile: 501 * @out: the FILE pointer 502 * @doc: the document 503 * @cur: the current node 504 * 505 * Dump an HTML node, recursive behaviour,children are printed too, 506 * and formatting returns are added. 507 */ 508 void 509 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) { 510 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1); 511 } 512 513 /** 514 * htmlDocDumpMemoryFormat: 515 * @cur: the document 516 * @mem: OUT: the memory pointer 517 * @size: OUT: the memory length 518 * @format: should formatting spaces been added 519 * 520 * Dump an HTML document in memory and return the xmlChar * and it's size. 521 * It's up to the caller to free the memory. 522 */ 523 void 524 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) { 525 xmlOutputBufferPtr buf; 526 xmlCharEncodingHandlerPtr handler = NULL; 527 const char *encoding; 528 529 xmlInitParser(); 530 531 if ((mem == NULL) || (size == NULL)) 532 return; 533 if (cur == NULL) { 534 *mem = NULL; 535 *size = 0; 536 return; 537 } 538 539 encoding = (const char *) htmlGetMetaEncoding(cur); 540 541 if (encoding != NULL) { 542 xmlCharEncoding enc; 543 544 enc = xmlParseCharEncoding(encoding); 545 if (enc != cur->charset) { 546 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 547 /* 548 * Not supported yet 549 */ 550 *mem = NULL; 551 *size = 0; 552 return; 553 } 554 555 handler = xmlFindCharEncodingHandler(encoding); 556 if (handler == NULL) { 557 *mem = NULL; 558 *size = 0; 559 return; 560 } 561 } else { 562 handler = xmlFindCharEncodingHandler(encoding); 563 } 564 } 565 566 /* 567 * Fallback to HTML or ASCII when the encoding is unspecified 568 */ 569 if (handler == NULL) 570 handler = xmlFindCharEncodingHandler("HTML"); 571 if (handler == NULL) 572 handler = xmlFindCharEncodingHandler("ascii"); 573 574 buf = xmlAllocOutputBufferInternal(handler); 575 if (buf == NULL) { 576 *mem = NULL; 577 *size = 0; 578 return; 579 } 580 581 htmlDocContentDumpFormatOutput(buf, cur, NULL, format); 582 583 xmlOutputBufferFlush(buf); 584 if (buf->conv != NULL) { 585 *size = buf->conv->use; 586 *mem = xmlStrndup(buf->conv->content, *size); 587 } else { 588 *size = buf->buffer->use; 589 *mem = xmlStrndup(buf->buffer->content, *size); 590 } 591 (void)xmlOutputBufferClose(buf); 592 } 593 594 /** 595 * htmlDocDumpMemory: 596 * @cur: the document 597 * @mem: OUT: the memory pointer 598 * @size: OUT: the memory length 599 * 600 * Dump an HTML document in memory and return the xmlChar * and it's size. 601 * It's up to the caller to free the memory. 602 */ 603 void 604 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) { 605 htmlDocDumpMemoryFormat(cur, mem, size, 1); 606 } 607 608 609 /************************************************************************ 610 * * 611 * Dumping HTML tree content to an I/O output buffer * 612 * * 613 ************************************************************************/ 614 615 void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur); 616 617 /** 618 * htmlDtdDumpOutput: 619 * @buf: the HTML buffer output 620 * @doc: the document 621 * @encoding: the encoding string 622 * 623 * TODO: check whether encoding is needed 624 * 625 * Dump the HTML document DTD, if any. 626 */ 627 static void 628 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 629 const char *encoding ATTRIBUTE_UNUSED) { 630 xmlDtdPtr cur = doc->intSubset; 631 632 if (cur == NULL) { 633 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL); 634 return; 635 } 636 xmlOutputBufferWriteString(buf, "<!DOCTYPE "); 637 xmlOutputBufferWriteString(buf, (const char *)cur->name); 638 if (cur->ExternalID != NULL) { 639 xmlOutputBufferWriteString(buf, " PUBLIC "); 640 xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID); 641 if (cur->SystemID != NULL) { 642 xmlOutputBufferWriteString(buf, " "); 643 xmlBufferWriteQuotedString(buf->buffer, cur->SystemID); 644 } 645 } else if (cur->SystemID != NULL) { 646 xmlOutputBufferWriteString(buf, " SYSTEM "); 647 xmlBufferWriteQuotedString(buf->buffer, cur->SystemID); 648 } 649 xmlOutputBufferWriteString(buf, ">\n"); 650 } 651 652 /** 653 * htmlAttrDumpOutput: 654 * @buf: the HTML buffer output 655 * @doc: the document 656 * @cur: the attribute pointer 657 * @encoding: the encoding string 658 * 659 * Dump an HTML attribute 660 */ 661 static void 662 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, 663 const char *encoding ATTRIBUTE_UNUSED) { 664 xmlChar *value; 665 666 /* 667 * TODO: The html output method should not escape a & character 668 * occurring in an attribute value immediately followed by 669 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation). 670 */ 671 672 if (cur == NULL) { 673 return; 674 } 675 xmlOutputBufferWriteString(buf, " "); 676 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 677 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 678 xmlOutputBufferWriteString(buf, ":"); 679 } 680 xmlOutputBufferWriteString(buf, (const char *)cur->name); 681 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) { 682 value = xmlNodeListGetString(doc, cur->children, 0); 683 if (value) { 684 xmlOutputBufferWriteString(buf, "="); 685 if ((cur->ns == NULL) && (cur->parent != NULL) && 686 (cur->parent->ns == NULL) && 687 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) || 688 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) || 689 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) || 690 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) && 691 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) { 692 xmlChar *escaped; 693 xmlChar *tmp = value; 694 695 while (IS_BLANK_CH(*tmp)) tmp++; 696 697 escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+"); 698 if (escaped != NULL) { 699 xmlBufferWriteQuotedString(buf->buffer, escaped); 700 xmlFree(escaped); 701 } else { 702 xmlBufferWriteQuotedString(buf->buffer, value); 703 } 704 } else { 705 xmlBufferWriteQuotedString(buf->buffer, value); 706 } 707 xmlFree(value); 708 } else { 709 xmlOutputBufferWriteString(buf, "=\"\""); 710 } 711 } 712 } 713 714 /** 715 * htmlAttrListDumpOutput: 716 * @buf: the HTML buffer output 717 * @doc: the document 718 * @cur: the first attribute pointer 719 * @encoding: the encoding string 720 * 721 * Dump a list of HTML attributes 722 */ 723 static void 724 htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) { 725 if (cur == NULL) { 726 return; 727 } 728 while (cur != NULL) { 729 htmlAttrDumpOutput(buf, doc, cur, encoding); 730 cur = cur->next; 731 } 732 } 733 734 735 736 /** 737 * htmlNodeListDumpOutput: 738 * @buf: the HTML buffer output 739 * @doc: the document 740 * @cur: the first node 741 * @encoding: the encoding string 742 * @format: should formatting spaces been added 743 * 744 * Dump an HTML node list, recursive behaviour,children are printed too. 745 */ 746 static void 747 htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 748 xmlNodePtr cur, const char *encoding, int format) { 749 if (cur == NULL) { 750 return; 751 } 752 while (cur != NULL) { 753 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); 754 cur = cur->next; 755 } 756 } 757 758 /** 759 * htmlNodeDumpFormatOutput: 760 * @buf: the HTML buffer output 761 * @doc: the document 762 * @cur: the current node 763 * @encoding: the encoding string 764 * @format: should formatting spaces been added 765 * 766 * Dump an HTML node, recursive behaviour,children are printed too. 767 */ 768 void 769 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 770 xmlNodePtr cur, const char *encoding, int format) { 771 const htmlElemDesc * info; 772 773 xmlInitParser(); 774 775 if ((cur == NULL) || (buf == NULL)) { 776 return; 777 } 778 /* 779 * Special cases. 780 */ 781 if (cur->type == XML_DTD_NODE) 782 return; 783 if ((cur->type == XML_HTML_DOCUMENT_NODE) || 784 (cur->type == XML_DOCUMENT_NODE)){ 785 htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding); 786 return; 787 } 788 if (cur->type == XML_ATTRIBUTE_NODE) { 789 htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding); 790 return; 791 } 792 if (cur->type == HTML_TEXT_NODE) { 793 if (cur->content != NULL) { 794 if (((cur->name == (const xmlChar *)xmlStringText) || 795 (cur->name != (const xmlChar *)xmlStringTextNoenc)) && 796 ((cur->parent == NULL) || 797 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) && 798 (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) { 799 xmlChar *buffer; 800 801 buffer = xmlEncodeEntitiesReentrant(doc, cur->content); 802 if (buffer != NULL) { 803 xmlOutputBufferWriteString(buf, (const char *)buffer); 804 xmlFree(buffer); 805 } 806 } else { 807 xmlOutputBufferWriteString(buf, (const char *)cur->content); 808 } 809 } 810 return; 811 } 812 if (cur->type == HTML_COMMENT_NODE) { 813 if (cur->content != NULL) { 814 xmlOutputBufferWriteString(buf, "<!--"); 815 xmlOutputBufferWriteString(buf, (const char *)cur->content); 816 xmlOutputBufferWriteString(buf, "-->"); 817 } 818 return; 819 } 820 if (cur->type == HTML_PI_NODE) { 821 if (cur->name == NULL) 822 return; 823 xmlOutputBufferWriteString(buf, "<?"); 824 xmlOutputBufferWriteString(buf, (const char *)cur->name); 825 if (cur->content != NULL) { 826 xmlOutputBufferWriteString(buf, " "); 827 xmlOutputBufferWriteString(buf, (const char *)cur->content); 828 } 829 xmlOutputBufferWriteString(buf, ">"); 830 return; 831 } 832 if (cur->type == HTML_ENTITY_REF_NODE) { 833 xmlOutputBufferWriteString(buf, "&"); 834 xmlOutputBufferWriteString(buf, (const char *)cur->name); 835 xmlOutputBufferWriteString(buf, ";"); 836 return; 837 } 838 if (cur->type == HTML_PRESERVE_NODE) { 839 if (cur->content != NULL) { 840 xmlOutputBufferWriteString(buf, (const char *)cur->content); 841 } 842 return; 843 } 844 845 /* 846 * Get specific HTML info for that node. 847 */ 848 if (cur->ns == NULL) 849 info = htmlTagLookup(cur->name); 850 else 851 info = NULL; 852 853 xmlOutputBufferWriteString(buf, "<"); 854 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 855 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 856 xmlOutputBufferWriteString(buf, ":"); 857 } 858 xmlOutputBufferWriteString(buf, (const char *)cur->name); 859 if (cur->nsDef) 860 xmlNsListDumpOutput(buf, cur->nsDef); 861 if (cur->properties != NULL) 862 htmlAttrListDumpOutput(buf, doc, cur->properties, encoding); 863 864 if ((info != NULL) && (info->empty)) { 865 xmlOutputBufferWriteString(buf, ">"); 866 if ((format) && (!info->isinline) && (cur->next != NULL)) { 867 if ((cur->next->type != HTML_TEXT_NODE) && 868 (cur->next->type != HTML_ENTITY_REF_NODE) && 869 (cur->parent != NULL) && 870 (cur->parent->name != NULL) && 871 (cur->parent->name[0] != 'p')) /* p, pre, param */ 872 xmlOutputBufferWriteString(buf, "\n"); 873 } 874 return; 875 } 876 if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) && 877 (cur->children == NULL)) { 878 if ((info != NULL) && (info->saveEndTag != 0) && 879 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) && 880 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) { 881 xmlOutputBufferWriteString(buf, ">"); 882 } else { 883 xmlOutputBufferWriteString(buf, "></"); 884 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 885 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 886 xmlOutputBufferWriteString(buf, ":"); 887 } 888 xmlOutputBufferWriteString(buf, (const char *)cur->name); 889 xmlOutputBufferWriteString(buf, ">"); 890 } 891 if ((format) && (cur->next != NULL) && 892 (info != NULL) && (!info->isinline)) { 893 if ((cur->next->type != HTML_TEXT_NODE) && 894 (cur->next->type != HTML_ENTITY_REF_NODE) && 895 (cur->parent != NULL) && 896 (cur->parent->name != NULL) && 897 (cur->parent->name[0] != 'p')) /* p, pre, param */ 898 xmlOutputBufferWriteString(buf, "\n"); 899 } 900 return; 901 } 902 xmlOutputBufferWriteString(buf, ">"); 903 if ((cur->type != XML_ELEMENT_NODE) && 904 (cur->content != NULL)) { 905 /* 906 * Uses the OutputBuffer property to automatically convert 907 * invalids to charrefs 908 */ 909 910 xmlOutputBufferWriteString(buf, (const char *) cur->content); 911 } 912 if (cur->children != NULL) { 913 if ((format) && (info != NULL) && (!info->isinline) && 914 (cur->children->type != HTML_TEXT_NODE) && 915 (cur->children->type != HTML_ENTITY_REF_NODE) && 916 (cur->children != cur->last) && 917 (cur->name != NULL) && 918 (cur->name[0] != 'p')) /* p, pre, param */ 919 xmlOutputBufferWriteString(buf, "\n"); 920 htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format); 921 if ((format) && (info != NULL) && (!info->isinline) && 922 (cur->last->type != HTML_TEXT_NODE) && 923 (cur->last->type != HTML_ENTITY_REF_NODE) && 924 (cur->children != cur->last) && 925 (cur->name != NULL) && 926 (cur->name[0] != 'p')) /* p, pre, param */ 927 xmlOutputBufferWriteString(buf, "\n"); 928 } 929 xmlOutputBufferWriteString(buf, "</"); 930 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 931 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 932 xmlOutputBufferWriteString(buf, ":"); 933 } 934 xmlOutputBufferWriteString(buf, (const char *)cur->name); 935 xmlOutputBufferWriteString(buf, ">"); 936 if ((format) && (info != NULL) && (!info->isinline) && 937 (cur->next != NULL)) { 938 if ((cur->next->type != HTML_TEXT_NODE) && 939 (cur->next->type != HTML_ENTITY_REF_NODE) && 940 (cur->parent != NULL) && 941 (cur->parent->name != NULL) && 942 (cur->parent->name[0] != 'p')) /* p, pre, param */ 943 xmlOutputBufferWriteString(buf, "\n"); 944 } 945 } 946 947 /** 948 * htmlNodeDumpOutput: 949 * @buf: the HTML buffer output 950 * @doc: the document 951 * @cur: the current node 952 * @encoding: the encoding string 953 * 954 * Dump an HTML node, recursive behaviour,children are printed too, 955 * and formatting returns/spaces are added. 956 */ 957 void 958 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 959 xmlNodePtr cur, const char *encoding) { 960 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1); 961 } 962 963 /** 964 * htmlDocContentDumpFormatOutput: 965 * @buf: the HTML buffer output 966 * @cur: the document 967 * @encoding: the encoding string 968 * @format: should formatting spaces been added 969 * 970 * Dump an HTML document. 971 */ 972 void 973 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, 974 const char *encoding, int format) { 975 int type; 976 977 xmlInitParser(); 978 979 if ((buf == NULL) || (cur == NULL)) 980 return; 981 982 /* 983 * force to output the stuff as HTML, especially for entities 984 */ 985 type = cur->type; 986 cur->type = XML_HTML_DOCUMENT_NODE; 987 if (cur->intSubset != NULL) { 988 htmlDtdDumpOutput(buf, cur, NULL); 989 } 990 if (cur->children != NULL) { 991 htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format); 992 } 993 xmlOutputBufferWriteString(buf, "\n"); 994 cur->type = (xmlElementType) type; 995 } 996 997 /** 998 * htmlDocContentDumpOutput: 999 * @buf: the HTML buffer output 1000 * @cur: the document 1001 * @encoding: the encoding string 1002 * 1003 * Dump an HTML document. Formating return/spaces are added. 1004 */ 1005 void 1006 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, 1007 const char *encoding) { 1008 htmlDocContentDumpFormatOutput(buf, cur, encoding, 1); 1009 } 1010 1011 /************************************************************************ 1012 * * 1013 * Saving functions front-ends * 1014 * * 1015 ************************************************************************/ 1016 1017 /** 1018 * htmlDocDump: 1019 * @f: the FILE* 1020 * @cur: the document 1021 * 1022 * Dump an HTML document to an open FILE. 1023 * 1024 * returns: the number of byte written or -1 in case of failure. 1025 */ 1026 int 1027 htmlDocDump(FILE *f, xmlDocPtr cur) { 1028 xmlOutputBufferPtr buf; 1029 xmlCharEncodingHandlerPtr handler = NULL; 1030 const char *encoding; 1031 int ret; 1032 1033 xmlInitParser(); 1034 1035 if ((cur == NULL) || (f == NULL)) { 1036 return(-1); 1037 } 1038 1039 encoding = (const char *) htmlGetMetaEncoding(cur); 1040 1041 if (encoding != NULL) { 1042 xmlCharEncoding enc; 1043 1044 enc = xmlParseCharEncoding(encoding); 1045 if (enc != cur->charset) { 1046 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 1047 /* 1048 * Not supported yet 1049 */ 1050 return(-1); 1051 } 1052 1053 handler = xmlFindCharEncodingHandler(encoding); 1054 if (handler == NULL) 1055 return(-1); 1056 } else { 1057 handler = xmlFindCharEncodingHandler(encoding); 1058 } 1059 } 1060 1061 /* 1062 * Fallback to HTML or ASCII when the encoding is unspecified 1063 */ 1064 if (handler == NULL) 1065 handler = xmlFindCharEncodingHandler("HTML"); 1066 if (handler == NULL) 1067 handler = xmlFindCharEncodingHandler("ascii"); 1068 1069 buf = xmlOutputBufferCreateFile(f, handler); 1070 if (buf == NULL) return(-1); 1071 htmlDocContentDumpOutput(buf, cur, NULL); 1072 1073 ret = xmlOutputBufferClose(buf); 1074 return(ret); 1075 } 1076 1077 /** 1078 * htmlSaveFile: 1079 * @filename: the filename (or URL) 1080 * @cur: the document 1081 * 1082 * Dump an HTML document to a file. If @filename is "-" the stdout file is 1083 * used. 1084 * returns: the number of byte written or -1 in case of failure. 1085 */ 1086 int 1087 htmlSaveFile(const char *filename, xmlDocPtr cur) { 1088 xmlOutputBufferPtr buf; 1089 xmlCharEncodingHandlerPtr handler = NULL; 1090 const char *encoding; 1091 int ret; 1092 1093 if ((cur == NULL) || (filename == NULL)) 1094 return(-1); 1095 1096 xmlInitParser(); 1097 1098 encoding = (const char *) htmlGetMetaEncoding(cur); 1099 1100 if (encoding != NULL) { 1101 xmlCharEncoding enc; 1102 1103 enc = xmlParseCharEncoding(encoding); 1104 if (enc != cur->charset) { 1105 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 1106 /* 1107 * Not supported yet 1108 */ 1109 return(-1); 1110 } 1111 1112 handler = xmlFindCharEncodingHandler(encoding); 1113 if (handler == NULL) 1114 return(-1); 1115 } 1116 } 1117 1118 /* 1119 * Fallback to HTML or ASCII when the encoding is unspecified 1120 */ 1121 if (handler == NULL) 1122 handler = xmlFindCharEncodingHandler("HTML"); 1123 if (handler == NULL) 1124 handler = xmlFindCharEncodingHandler("ascii"); 1125 1126 /* 1127 * save the content to a temp buffer. 1128 */ 1129 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression); 1130 if (buf == NULL) return(0); 1131 1132 htmlDocContentDumpOutput(buf, cur, NULL); 1133 1134 ret = xmlOutputBufferClose(buf); 1135 return(ret); 1136 } 1137 1138 /** 1139 * htmlSaveFileFormat: 1140 * @filename: the filename 1141 * @cur: the document 1142 * @format: should formatting spaces been added 1143 * @encoding: the document encoding 1144 * 1145 * Dump an HTML document to a file using a given encoding. 1146 * 1147 * returns: the number of byte written or -1 in case of failure. 1148 */ 1149 int 1150 htmlSaveFileFormat(const char *filename, xmlDocPtr cur, 1151 const char *encoding, int format) { 1152 xmlOutputBufferPtr buf; 1153 xmlCharEncodingHandlerPtr handler = NULL; 1154 int ret; 1155 1156 if ((cur == NULL) || (filename == NULL)) 1157 return(-1); 1158 1159 xmlInitParser(); 1160 1161 if (encoding != NULL) { 1162 xmlCharEncoding enc; 1163 1164 enc = xmlParseCharEncoding(encoding); 1165 if (enc != cur->charset) { 1166 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 1167 /* 1168 * Not supported yet 1169 */ 1170 return(-1); 1171 } 1172 1173 handler = xmlFindCharEncodingHandler(encoding); 1174 if (handler == NULL) 1175 return(-1); 1176 htmlSetMetaEncoding(cur, (const xmlChar *) encoding); 1177 } 1178 } else { 1179 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8"); 1180 } 1181 1182 /* 1183 * Fallback to HTML or ASCII when the encoding is unspecified 1184 */ 1185 if (handler == NULL) 1186 handler = xmlFindCharEncodingHandler("HTML"); 1187 if (handler == NULL) 1188 handler = xmlFindCharEncodingHandler("ascii"); 1189 1190 /* 1191 * save the content to a temp buffer. 1192 */ 1193 buf = xmlOutputBufferCreateFilename(filename, handler, 0); 1194 if (buf == NULL) return(0); 1195 1196 htmlDocContentDumpFormatOutput(buf, cur, encoding, format); 1197 1198 ret = xmlOutputBufferClose(buf); 1199 return(ret); 1200 } 1201 1202 /** 1203 * htmlSaveFileEnc: 1204 * @filename: the filename 1205 * @cur: the document 1206 * @encoding: the document encoding 1207 * 1208 * Dump an HTML document to a file using a given encoding 1209 * and formatting returns/spaces are added. 1210 * 1211 * returns: the number of byte written or -1 in case of failure. 1212 */ 1213 int 1214 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) { 1215 return(htmlSaveFileFormat(filename, cur, encoding, 1)); 1216 } 1217 1218 #endif /* LIBXML_OUTPUT_ENABLED */ 1219 1220 #define bottom_HTMLtree 1221 #include "elfgcchack.h" 1222 #endif /* LIBXML_HTML_ENABLED */ 1223