Home | History | Annotate | Download | only in libxml2
      1 /*
      2  * HTMLtree.c : implementation of access function for an HTML tree.
      3  *
      4  * See Copyright for the status of this software.
      5  *
      6  * daniel (at) veillard.com
      7  */
      8 
      9 
     10 #define IN_LIBXML
     11 #include "libxml.h"
     12 #ifdef LIBXML_HTML_ENABLED
     13 
     14 #include <string.h> /* for memset() only ! */
     15 
     16 #ifdef HAVE_CTYPE_H
     17 #include <ctype.h>
     18 #endif
     19 #ifdef HAVE_STDLIB_H
     20 #include <stdlib.h>
     21 #endif
     22 
     23 #include <libxml/xmlmemory.h>
     24 #include <libxml/HTMLparser.h>
     25 #include <libxml/HTMLtree.h>
     26 #include <libxml/entities.h>
     27 #include <libxml/valid.h>
     28 #include <libxml/xmlerror.h>
     29 #include <libxml/parserInternals.h>
     30 #include <libxml/globals.h>
     31 #include <libxml/uri.h>
     32 
     33 #include "buf.h"
     34 
     35 /************************************************************************
     36  *									*
     37  *		Getting/Setting encoding meta tags			*
     38  *									*
     39  ************************************************************************/
     40 
     41 /**
     42  * htmlGetMetaEncoding:
     43  * @doc:  the document
     44  *
     45  * Encoding definition lookup in the Meta tags
     46  *
     47  * Returns the current encoding as flagged in the HTML source
     48  */
     49 const xmlChar *
     50 htmlGetMetaEncoding(htmlDocPtr doc) {
     51     htmlNodePtr cur;
     52     const xmlChar *content;
     53     const xmlChar *encoding;
     54 
     55     if (doc == NULL)
     56 	return(NULL);
     57     cur = doc->children;
     58 
     59     /*
     60      * Search the html
     61      */
     62     while (cur != NULL) {
     63 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
     64 	    if (xmlStrEqual(cur->name, BAD_CAST"html"))
     65 		break;
     66 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
     67 		goto found_head;
     68 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
     69 		goto found_meta;
     70 	}
     71 	cur = cur->next;
     72     }
     73     if (cur == NULL)
     74 	return(NULL);
     75     cur = cur->children;
     76 
     77     /*
     78      * Search the head
     79      */
     80     while (cur != NULL) {
     81 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
     82 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
     83 		break;
     84 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
     85 		goto found_meta;
     86 	}
     87 	cur = cur->next;
     88     }
     89     if (cur == NULL)
     90 	return(NULL);
     91 found_head:
     92     cur = cur->children;
     93 
     94     /*
     95      * Search the meta elements
     96      */
     97 found_meta:
     98     while (cur != NULL) {
     99 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
    100 	    if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
    101 		xmlAttrPtr attr = cur->properties;
    102 		int http;
    103 		const xmlChar *value;
    104 
    105 		content = NULL;
    106 		http = 0;
    107 		while (attr != NULL) {
    108 		    if ((attr->children != NULL) &&
    109 		        (attr->children->type == XML_TEXT_NODE) &&
    110 		        (attr->children->next == NULL)) {
    111 			value = attr->children->content;
    112 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
    113 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
    114 			    http = 1;
    115 			else if ((value != NULL)
    116 			 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
    117 			    content = value;
    118 			if ((http != 0) && (content != NULL))
    119 			    goto found_content;
    120 		    }
    121 		    attr = attr->next;
    122 		}
    123 	    }
    124 	}
    125 	cur = cur->next;
    126     }
    127     return(NULL);
    128 
    129 found_content:
    130     encoding = xmlStrstr(content, BAD_CAST"charset=");
    131     if (encoding == NULL)
    132 	encoding = xmlStrstr(content, BAD_CAST"Charset=");
    133     if (encoding == NULL)
    134 	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
    135     if (encoding != NULL) {
    136 	encoding += 8;
    137     } else {
    138 	encoding = xmlStrstr(content, BAD_CAST"charset =");
    139 	if (encoding == NULL)
    140 	    encoding = xmlStrstr(content, BAD_CAST"Charset =");
    141 	if (encoding == NULL)
    142 	    encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
    143 	if (encoding != NULL)
    144 	    encoding += 9;
    145     }
    146     if (encoding != NULL) {
    147 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
    148     }
    149     return(encoding);
    150 }
    151 
    152 /**
    153  * htmlSetMetaEncoding:
    154  * @doc:  the document
    155  * @encoding:  the encoding string
    156  *
    157  * Sets the current encoding in the Meta tags
    158  * NOTE: this will not change the document content encoding, just
    159  * the META flag associated.
    160  *
    161  * Returns 0 in case of success and -1 in case of error
    162  */
    163 int
    164 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
    165     htmlNodePtr cur, meta = NULL, head = NULL;
    166     const xmlChar *content = NULL;
    167     char newcontent[100];
    168 
    169     newcontent[0] = 0;
    170 
    171     if (doc == NULL)
    172 	return(-1);
    173 
    174     /* html isn't a real encoding it's just libxml2 way to get entities */
    175     if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
    176         return(-1);
    177 
    178     if (encoding != NULL) {
    179 	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
    180                 (char *)encoding);
    181 	newcontent[sizeof(newcontent) - 1] = 0;
    182     }
    183 
    184     cur = doc->children;
    185 
    186     /*
    187      * Search the html
    188      */
    189     while (cur != NULL) {
    190 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
    191 	    if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
    192 		break;
    193 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
    194 		goto found_head;
    195 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
    196 		goto found_meta;
    197 	}
    198 	cur = cur->next;
    199     }
    200     if (cur == NULL)
    201 	return(-1);
    202     cur = cur->children;
    203 
    204     /*
    205      * Search the head
    206      */
    207     while (cur != NULL) {
    208 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
    209 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
    210 		break;
    211 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
    212                 head = cur->parent;
    213 		goto found_meta;
    214             }
    215 	}
    216 	cur = cur->next;
    217     }
    218     if (cur == NULL)
    219 	return(-1);
    220 found_head:
    221     head = cur;
    222     if (cur->children == NULL)
    223         goto create;
    224     cur = cur->children;
    225 
    226 found_meta:
    227     /*
    228      * Search and update all the remaining the meta elements carrying
    229      * encoding informations
    230      */
    231     while (cur != NULL) {
    232 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
    233 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
    234 		xmlAttrPtr attr = cur->properties;
    235 		int http;
    236 		const xmlChar *value;
    237 
    238 		content = NULL;
    239 		http = 0;
    240 		while (attr != NULL) {
    241 		    if ((attr->children != NULL) &&
    242 		        (attr->children->type == XML_TEXT_NODE) &&
    243 		        (attr->children->next == NULL)) {
    244 			value = attr->children->content;
    245 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
    246 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
    247 			    http = 1;
    248 			else
    249                         {
    250                            if ((value != NULL) &&
    251                                (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
    252 			       content = value;
    253                         }
    254 		        if ((http != 0) && (content != NULL))
    255 			    break;
    256 		    }
    257 		    attr = attr->next;
    258 		}
    259 		if ((http != 0) && (content != NULL)) {
    260 		    meta = cur;
    261 		    break;
    262 		}
    263 
    264 	    }
    265 	}
    266 	cur = cur->next;
    267     }
    268 create:
    269     if (meta == NULL) {
    270         if ((encoding != NULL) && (head != NULL)) {
    271             /*
    272              * Create a new Meta element with the right attributes
    273              */
    274 
    275             meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
    276             if (head->children == NULL)
    277                 xmlAddChild(head, meta);
    278             else
    279                 xmlAddPrevSibling(head->children, meta);
    280             xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
    281             xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
    282         }
    283     } else {
    284         /* remove the meta tag if NULL is passed */
    285         if (encoding == NULL) {
    286             xmlUnlinkNode(meta);
    287             xmlFreeNode(meta);
    288         }
    289         /* change the document only if there is a real encoding change */
    290         else if (xmlStrcasestr(content, encoding) == NULL) {
    291             xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
    292         }
    293     }
    294 
    295 
    296     return(0);
    297 }
    298 
    /**
     * htmlBooleanAttrs:
     *
     * These are the HTML attributes which will be output
     * in minimized form, i.e. <option selected="selected"> will be
     * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
     *
     * The table is terminated by a NULL entry. Both the strings and the
     * pointers are constant: the table is read-only data.
     */
    static const char * const htmlBooleanAttrs[] = {
        "checked", "compact", "declare", "defer", "disabled", "ismap",
        "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
        "selected", NULL
    };
    312 
    313 
    314 /**
    315  * htmlIsBooleanAttr:
    316  * @name:  the name of the attribute to check
    317  *
    318  * Determine if a given attribute is a boolean attribute.
    319  *
    320  * returns: false if the attribute is not boolean, true otherwise.
    321  */
    322 int
    323 htmlIsBooleanAttr(const xmlChar *name)
    324 {
    325     int i = 0;
    326 
    327     while (htmlBooleanAttrs[i] != NULL) {
    328         if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
    329             return 1;
    330         i++;
    331     }
    332     return 0;
    333 }
    334 
    335 #ifdef LIBXML_OUTPUT_ENABLED
    336 /*
    337  * private routine exported from xmlIO.c
    338  */
    339 xmlOutputBufferPtr
    340 xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
    341 /************************************************************************
    342  *									*
    343  *			Output error handlers				*
    344  *									*
    345  ************************************************************************/
    346 /**
    347  * htmlSaveErrMemory:
    348  * @extra:  extra informations
    349  *
    350  * Handle an out of memory condition
    351  */
    352 static void
    353 htmlSaveErrMemory(const char *extra)
    354 {
    355     __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
    356 }
    357 
    358 /**
    359  * htmlSaveErr:
    360  * @code:  the error number
    361  * @node:  the location of the error.
    362  * @extra:  extra informations
    363  *
    364  * Handle an out of memory condition
    365  */
    366 static void
    367 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
    368 {
    369     const char *msg = NULL;
    370 
    371     switch(code) {
    372         case XML_SAVE_NOT_UTF8:
    373 	    msg = "string is not in UTF-8\n";
    374 	    break;
    375 	case XML_SAVE_CHAR_INVALID:
    376 	    msg = "invalid character value\n";
    377 	    break;
    378 	case XML_SAVE_UNKNOWN_ENCODING:
    379 	    msg = "unknown encoding %s\n";
    380 	    break;
    381 	case XML_SAVE_NO_DOCTYPE:
    382 	    msg = "HTML has no DOCTYPE\n";
    383 	    break;
    384 	default:
    385 	    msg = "unexpected error number\n";
    386     }
    387     __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
    388 }
    389 
    390 /************************************************************************
    391  *									*
    392  *		Dumping HTML tree content to a simple buffer		*
    393  *									*
    394  ************************************************************************/
    395 
    396 /**
    397  * htmlBufNodeDumpFormat:
    398  * @buf:  the xmlBufPtr output
    399  * @doc:  the document
    400  * @cur:  the current node
    401  * @format:  should formatting spaces been added
    402  *
    403  * Dump an HTML node, recursive behaviour,children are printed too.
    404  *
    405  * Returns the number of byte written or -1 in case of error
    406  */
    407 static size_t
    408 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
    409 	           int format) {
    410     size_t use;
    411     int ret;
    412     xmlOutputBufferPtr outbuf;
    413 
    414     if (cur == NULL) {
    415 	return (-1);
    416     }
    417     if (buf == NULL) {
    418 	return (-1);
    419     }
    420     outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
    421     if (outbuf == NULL) {
    422         htmlSaveErrMemory("allocating HTML output buffer");
    423 	return (-1);
    424     }
    425     memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
    426     outbuf->buffer = buf;
    427     outbuf->encoder = NULL;
    428     outbuf->writecallback = NULL;
    429     outbuf->closecallback = NULL;
    430     outbuf->context = NULL;
    431     outbuf->written = 0;
    432 
    433     use = xmlBufUse(buf);
    434     htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
    435     xmlFree(outbuf);
    436     ret = xmlBufUse(buf) - use;
    437     return (ret);
    438 }
    439 
    440 /**
    441  * htmlNodeDump:
    442  * @buf:  the HTML buffer output
    443  * @doc:  the document
    444  * @cur:  the current node
    445  *
    446  * Dump an HTML node, recursive behaviour,children are printed too,
    447  * and formatting returns are added.
    448  *
    449  * Returns the number of byte written or -1 in case of error
    450  */
    451 int
    452 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
    453     xmlBufPtr buffer;
    454     size_t ret;
    455 
    456     if ((buf == NULL) || (cur == NULL))
    457         return(-1);
    458 
    459     xmlInitParser();
    460     buffer = xmlBufFromBuffer(buf);
    461     if (buffer == NULL)
    462         return(-1);
    463 
    464     ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
    465 
    466     xmlBufBackToBuffer(buffer);
    467 
    468     if (ret > INT_MAX)
    469         return(-1);
    470     return((int) ret);
    471 }
    472 
    473 /**
    474  * htmlNodeDumpFileFormat:
    475  * @out:  the FILE pointer
    476  * @doc:  the document
    477  * @cur:  the current node
    478  * @encoding: the document encoding
    479  * @format:  should formatting spaces been added
    480  *
    481  * Dump an HTML node, recursive behaviour,children are printed too.
    482  *
    483  * TODO: if encoding == NULL try to save in the doc encoding
    484  *
    485  * returns: the number of byte written or -1 in case of failure.
    486  */
    487 int
    488 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
    489 	               xmlNodePtr cur, const char *encoding, int format) {
    490     xmlOutputBufferPtr buf;
    491     xmlCharEncodingHandlerPtr handler = NULL;
    492     int ret;
    493 
    494     xmlInitParser();
    495 
    496     if (encoding != NULL) {
    497 	xmlCharEncoding enc;
    498 
    499 	enc = xmlParseCharEncoding(encoding);
    500 	if (enc != XML_CHAR_ENCODING_UTF8) {
    501 	    handler = xmlFindCharEncodingHandler(encoding);
    502 	    if (handler == NULL)
    503 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
    504 	}
    505     }
    506 
    507     /*
    508      * Fallback to HTML or ASCII when the encoding is unspecified
    509      */
    510     if (handler == NULL)
    511 	handler = xmlFindCharEncodingHandler("HTML");
    512     if (handler == NULL)
    513 	handler = xmlFindCharEncodingHandler("ascii");
    514 
    515     /*
    516      * save the content to a temp buffer.
    517      */
    518     buf = xmlOutputBufferCreateFile(out, handler);
    519     if (buf == NULL) return(0);
    520 
    521     htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
    522 
    523     ret = xmlOutputBufferClose(buf);
    524     return(ret);
    525 }
    526 
    527 /**
    528  * htmlNodeDumpFile:
    529  * @out:  the FILE pointer
    530  * @doc:  the document
    531  * @cur:  the current node
    532  *
    533  * Dump an HTML node, recursive behaviour,children are printed too,
    534  * and formatting returns are added.
    535  */
    536 void
    537 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
    538     htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
    539 }
    540 
    541 /**
    542  * htmlDocDumpMemoryFormat:
    543  * @cur:  the document
    544  * @mem:  OUT: the memory pointer
    545  * @size:  OUT: the memory length
    546  * @format:  should formatting spaces been added
    547  *
    548  * Dump an HTML document in memory and return the xmlChar * and it's size.
    549  * It's up to the caller to free the memory.
    550  */
    551 void
    552 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
    553     xmlOutputBufferPtr buf;
    554     xmlCharEncodingHandlerPtr handler = NULL;
    555     const char *encoding;
    556 
    557     xmlInitParser();
    558 
    559     if ((mem == NULL) || (size == NULL))
    560         return;
    561     if (cur == NULL) {
    562 	*mem = NULL;
    563 	*size = 0;
    564 	return;
    565     }
    566 
    567     encoding = (const char *) htmlGetMetaEncoding(cur);
    568 
    569     if (encoding != NULL) {
    570 	xmlCharEncoding enc;
    571 
    572 	enc = xmlParseCharEncoding(encoding);
    573 	if (enc != cur->charset) {
    574 	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
    575 		/*
    576 		 * Not supported yet
    577 		 */
    578 		*mem = NULL;
    579 		*size = 0;
    580 		return;
    581 	    }
    582 
    583 	    handler = xmlFindCharEncodingHandler(encoding);
    584 	    if (handler == NULL)
    585                 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
    586 
    587 	} else {
    588 	    handler = xmlFindCharEncodingHandler(encoding);
    589 	}
    590     }
    591 
    592     /*
    593      * Fallback to HTML or ASCII when the encoding is unspecified
    594      */
    595     if (handler == NULL)
    596 	handler = xmlFindCharEncodingHandler("HTML");
    597     if (handler == NULL)
    598 	handler = xmlFindCharEncodingHandler("ascii");
    599 
    600     buf = xmlAllocOutputBufferInternal(handler);
    601     if (buf == NULL) {
    602 	*mem = NULL;
    603 	*size = 0;
    604 	return;
    605     }
    606 
    607     htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
    608 
    609     xmlOutputBufferFlush(buf);
    610     if (buf->conv != NULL) {
    611 	*size = xmlBufUse(buf->conv);
    612 	*mem = xmlStrndup(xmlBufContent(buf->conv), *size);
    613     } else {
    614 	*size = xmlBufUse(buf->buffer);
    615 	*mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
    616     }
    617     (void)xmlOutputBufferClose(buf);
    618 }
    619 
    620 /**
    621  * htmlDocDumpMemory:
    622  * @cur:  the document
    623  * @mem:  OUT: the memory pointer
    624  * @size:  OUT: the memory length
    625  *
    626  * Dump an HTML document in memory and return the xmlChar * and it's size.
    627  * It's up to the caller to free the memory.
    628  */
    629 void
    630 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
    631 	htmlDocDumpMemoryFormat(cur, mem, size, 1);
    632 }
    633 
    634 
    635 /************************************************************************
    636  *									*
    637  *		Dumping HTML tree content to an I/O output buffer	*
    638  *									*
    639  ************************************************************************/
    640 
    641 void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
    642 
    643 /**
    644  * htmlDtdDumpOutput:
    645  * @buf:  the HTML buffer output
    646  * @doc:  the document
    647  * @encoding:  the encoding string
    648  *
    649  * TODO: check whether encoding is needed
    650  *
    651  * Dump the HTML document DTD, if any.
    652  */
    653 static void
    654 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
    655 	          const char *encoding ATTRIBUTE_UNUSED) {
    656     xmlDtdPtr cur = doc->intSubset;
    657 
    658     if (cur == NULL) {
    659 	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
    660 	return;
    661     }
    662     xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
    663     xmlOutputBufferWriteString(buf, (const char *)cur->name);
    664     if (cur->ExternalID != NULL) {
    665 	xmlOutputBufferWriteString(buf, " PUBLIC ");
    666 	xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
    667 	if (cur->SystemID != NULL) {
    668 	    xmlOutputBufferWriteString(buf, " ");
    669 	    xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
    670 	}
    671     }  else if (cur->SystemID != NULL) {
    672 	xmlOutputBufferWriteString(buf, " SYSTEM ");
    673 	xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
    674     }
    675     xmlOutputBufferWriteString(buf, ">\n");
    676 }
    677 
    678 /**
    679  * htmlAttrDumpOutput:
    680  * @buf:  the HTML buffer output
    681  * @doc:  the document
    682  * @cur:  the attribute pointer
    683  * @encoding:  the encoding string
    684  *
    685  * Dump an HTML attribute
    686  */
    687 static void
    688 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
    689 	           const char *encoding ATTRIBUTE_UNUSED) {
    690     xmlChar *value;
    691 
    692     /*
    693      * The html output method should not escape a & character
    694      * occurring in an attribute value immediately followed by
    695      * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
    696      * This is implemented in xmlEncodeEntitiesReentrant
    697      */
    698 
    699     if (cur == NULL) {
    700 	return;
    701     }
    702     xmlOutputBufferWriteString(buf, " ");
    703     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
    704         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
    705 	xmlOutputBufferWriteString(buf, ":");
    706     }
    707     xmlOutputBufferWriteString(buf, (const char *)cur->name);
    708     if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
    709 	value = xmlNodeListGetString(doc, cur->children, 0);
    710 	if (value) {
    711 	    xmlOutputBufferWriteString(buf, "=");
    712 	    if ((cur->ns == NULL) && (cur->parent != NULL) &&
    713 		(cur->parent->ns == NULL) &&
    714 		((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
    715 	         (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
    716 		 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
    717 		 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
    718 		  (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
    719 		xmlChar *escaped;
    720 		xmlChar *tmp = value;
    721 
    722 		while (IS_BLANK_CH(*tmp)) tmp++;
    723 
    724 		/*
    725 		 * the < and > have already been escaped at the entity level
    726 		 * And doing so here breaks server side includes
    727 		 */
    728 		escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+<>");
    729 		if (escaped != NULL) {
    730 		    xmlBufWriteQuotedString(buf->buffer, escaped);
    731 		    xmlFree(escaped);
    732 		} else {
    733 		    xmlBufWriteQuotedString(buf->buffer, value);
    734 		}
    735 	    } else {
    736 		xmlBufWriteQuotedString(buf->buffer, value);
    737 	    }
    738 	    xmlFree(value);
    739 	} else  {
    740 	    xmlOutputBufferWriteString(buf, "=\"\"");
    741 	}
    742     }
    743 }
    744 
    745 /**
    746  * htmlAttrListDumpOutput:
    747  * @buf:  the HTML buffer output
    748  * @doc:  the document
    749  * @cur:  the first attribute pointer
    750  * @encoding:  the encoding string
    751  *
    752  * Dump a list of HTML attributes
    753  */
    754 static void
    755 htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
    756     if (cur == NULL) {
    757 	return;
    758     }
    759     while (cur != NULL) {
    760         htmlAttrDumpOutput(buf, doc, cur, encoding);
    761 	cur = cur->next;
    762     }
    763 }
    764 
    765 
    766 
    767 /**
    768  * htmlNodeListDumpOutput:
    769  * @buf:  the HTML buffer output
    770  * @doc:  the document
    771  * @cur:  the first node
    772  * @encoding:  the encoding string
    773  * @format:  should formatting spaces been added
    774  *
    775  * Dump an HTML node list, recursive behaviour,children are printed too.
    776  */
    777 static void
    778 htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
    779 	               xmlNodePtr cur, const char *encoding, int format) {
    780     if (cur == NULL) {
    781 	return;
    782     }
    783     while (cur != NULL) {
    784         htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
    785 	cur = cur->next;
    786     }
    787 }
    788 
    789 /**
    790  * htmlNodeDumpFormatOutput:
    791  * @buf:  the HTML buffer output
    792  * @doc:  the document
    793  * @cur:  the current node
    794  * @encoding:  the encoding string
    795  * @format:  should formatting spaces been added
    796  *
    797  * Dump an HTML node, recursive behaviour,children are printed too.
    798  */
    799 void
    800 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
    801 	                 xmlNodePtr cur, const char *encoding, int format) {
    802     const htmlElemDesc * info;
    803 
    804     xmlInitParser();
    805 
    806     if ((cur == NULL) || (buf == NULL)) {
    807 	return;
    808     }
    809     /*
    810      * Special cases.
    811      */
    812     if (cur->type == XML_DTD_NODE)
    813 	return;
    814     if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
    815         (cur->type == XML_DOCUMENT_NODE)){
    816 	htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
    817 	return;
    818     }
    819     if (cur->type == XML_ATTRIBUTE_NODE) {
    820         htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding);
    821 	return;
    822     }
    823     if (cur->type == HTML_TEXT_NODE) {
    824 	if (cur->content != NULL) {
    825 	    if (((cur->name == (const xmlChar *)xmlStringText) ||
    826 		 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
    827 		((cur->parent == NULL) ||
    828 		 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
    829 		  (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) {
    830 		xmlChar *buffer;
    831 
    832 		buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
    833 		if (buffer != NULL) {
    834 		    xmlOutputBufferWriteString(buf, (const char *)buffer);
    835 		    xmlFree(buffer);
    836 		}
    837 	    } else {
    838 		xmlOutputBufferWriteString(buf, (const char *)cur->content);
    839 	    }
    840 	}
    841 	return;
    842     }
    843     if (cur->type == HTML_COMMENT_NODE) {
    844 	if (cur->content != NULL) {
    845 	    xmlOutputBufferWriteString(buf, "<!--");
    846 	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
    847 	    xmlOutputBufferWriteString(buf, "-->");
    848 	}
    849 	return;
    850     }
    851     if (cur->type == HTML_PI_NODE) {
    852 	if (cur->name == NULL)
    853 	    return;
    854 	xmlOutputBufferWriteString(buf, "<?");
    855 	xmlOutputBufferWriteString(buf, (const char *)cur->name);
    856 	if (cur->content != NULL) {
    857 	    xmlOutputBufferWriteString(buf, " ");
    858 	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
    859 	}
    860 	xmlOutputBufferWriteString(buf, ">");
    861 	return;
    862     }
    863     if (cur->type == HTML_ENTITY_REF_NODE) {
    864         xmlOutputBufferWriteString(buf, "&");
    865 	xmlOutputBufferWriteString(buf, (const char *)cur->name);
    866         xmlOutputBufferWriteString(buf, ";");
    867 	return;
    868     }
    869     if (cur->type == HTML_PRESERVE_NODE) {
    870 	if (cur->content != NULL) {
    871 	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
    872 	}
    873 	return;
    874     }
    875 
    876     /*
    877      * Get specific HTML info for that node.
    878      */
    879     if (cur->ns == NULL)
    880 	info = htmlTagLookup(cur->name);
    881     else
    882 	info = NULL;
    883 
    884     xmlOutputBufferWriteString(buf, "<");
    885     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
    886         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
    887 	xmlOutputBufferWriteString(buf, ":");
    888     }
    889     xmlOutputBufferWriteString(buf, (const char *)cur->name);
    890     if (cur->nsDef)
    891 	xmlNsListDumpOutput(buf, cur->nsDef);
    892     if (cur->properties != NULL)
    893         htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
    894 
    895     if ((info != NULL) && (info->empty)) {
    896         xmlOutputBufferWriteString(buf, ">");
    897 	if ((format) && (!info->isinline) && (cur->next != NULL)) {
    898 	    if ((cur->next->type != HTML_TEXT_NODE) &&
    899 		(cur->next->type != HTML_ENTITY_REF_NODE) &&
    900 		(cur->parent != NULL) &&
    901 		(cur->parent->name != NULL) &&
    902 		(cur->parent->name[0] != 'p')) /* p, pre, param */
    903 		xmlOutputBufferWriteString(buf, "\n");
    904 	}
    905 	return;
    906     }
    907     if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) &&
    908 	(cur->children == NULL)) {
    909         if ((info != NULL) && (info->saveEndTag != 0) &&
    910 	    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
    911 	    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
    912 	    xmlOutputBufferWriteString(buf, ">");
    913 	} else {
    914 	    xmlOutputBufferWriteString(buf, "></");
    915             if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
    916                 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
    917                 xmlOutputBufferWriteString(buf, ":");
    918             }
    919 	    xmlOutputBufferWriteString(buf, (const char *)cur->name);
    920 	    xmlOutputBufferWriteString(buf, ">");
    921 	}
    922 	if ((format) && (cur->next != NULL) &&
    923             (info != NULL) && (!info->isinline)) {
    924 	    if ((cur->next->type != HTML_TEXT_NODE) &&
    925 		(cur->next->type != HTML_ENTITY_REF_NODE) &&
    926 		(cur->parent != NULL) &&
    927 		(cur->parent->name != NULL) &&
    928 		(cur->parent->name[0] != 'p')) /* p, pre, param */
    929 		xmlOutputBufferWriteString(buf, "\n");
    930 	}
    931 	return;
    932     }
    933     xmlOutputBufferWriteString(buf, ">");
    934     if ((cur->type != XML_ELEMENT_NODE) &&
    935 	(cur->content != NULL)) {
    936 	    /*
    937 	     * Uses the OutputBuffer property to automatically convert
    938 	     * invalids to charrefs
    939 	     */
    940 
    941             xmlOutputBufferWriteString(buf, (const char *) cur->content);
    942     }
    943     if (cur->children != NULL) {
    944         if ((format) && (info != NULL) && (!info->isinline) &&
    945 	    (cur->children->type != HTML_TEXT_NODE) &&
    946 	    (cur->children->type != HTML_ENTITY_REF_NODE) &&
    947 	    (cur->children != cur->last) &&
    948 	    (cur->name != NULL) &&
    949 	    (cur->name[0] != 'p')) /* p, pre, param */
    950 	    xmlOutputBufferWriteString(buf, "\n");
    951 	htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format);
    952         if ((format) && (info != NULL) && (!info->isinline) &&
    953 	    (cur->last->type != HTML_TEXT_NODE) &&
    954 	    (cur->last->type != HTML_ENTITY_REF_NODE) &&
    955 	    (cur->children != cur->last) &&
    956 	    (cur->name != NULL) &&
    957 	    (cur->name[0] != 'p')) /* p, pre, param */
    958 	    xmlOutputBufferWriteString(buf, "\n");
    959     }
    960     xmlOutputBufferWriteString(buf, "</");
    961     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
    962         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
    963 	xmlOutputBufferWriteString(buf, ":");
    964     }
    965     xmlOutputBufferWriteString(buf, (const char *)cur->name);
    966     xmlOutputBufferWriteString(buf, ">");
    967     if ((format) && (info != NULL) && (!info->isinline) &&
    968 	(cur->next != NULL)) {
    969         if ((cur->next->type != HTML_TEXT_NODE) &&
    970 	    (cur->next->type != HTML_ENTITY_REF_NODE) &&
    971 	    (cur->parent != NULL) &&
    972 	    (cur->parent->name != NULL) &&
    973 	    (cur->parent->name[0] != 'p')) /* p, pre, param */
    974 	    xmlOutputBufferWriteString(buf, "\n");
    975     }
    976 }
    977 
    978 /**
    979  * htmlNodeDumpOutput:
    980  * @buf:  the HTML buffer output
    981  * @doc:  the document
    982  * @cur:  the current node
    983  * @encoding:  the encoding string
    984  *
    985  * Dump an HTML node, recursive behaviour,children are printed too,
    986  * and formatting returns/spaces are added.
    987  */
    988 void
    989 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
    990 	           xmlNodePtr cur, const char *encoding) {
    991     htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1);
    992 }
    993 
    994 /**
    995  * htmlDocContentDumpFormatOutput:
    996  * @buf:  the HTML buffer output
    997  * @cur:  the document
    998  * @encoding:  the encoding string
    999  * @format:  should formatting spaces been added
   1000  *
   1001  * Dump an HTML document.
   1002  */
   1003 void
   1004 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
   1005 	                       const char *encoding, int format) {
   1006     int type;
   1007 
   1008     xmlInitParser();
   1009 
   1010     if ((buf == NULL) || (cur == NULL))
   1011         return;
   1012 
   1013     /*
   1014      * force to output the stuff as HTML, especially for entities
   1015      */
   1016     type = cur->type;
   1017     cur->type = XML_HTML_DOCUMENT_NODE;
   1018     if (cur->intSubset != NULL) {
   1019         htmlDtdDumpOutput(buf, cur, NULL);
   1020     }
   1021     if (cur->children != NULL) {
   1022         htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format);
   1023     }
   1024     xmlOutputBufferWriteString(buf, "\n");
   1025     cur->type = (xmlElementType) type;
   1026 }
   1027 
   1028 /**
   1029  * htmlDocContentDumpOutput:
   1030  * @buf:  the HTML buffer output
   1031  * @cur:  the document
   1032  * @encoding:  the encoding string
   1033  *
   1034  * Dump an HTML document. Formating return/spaces are added.
   1035  */
   1036 void
   1037 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
   1038 	                 const char *encoding) {
   1039     htmlDocContentDumpFormatOutput(buf, cur, encoding, 1);
   1040 }
   1041 
   1042 /************************************************************************
   1043  *									*
   1044  *		Saving functions front-ends				*
   1045  *									*
   1046  ************************************************************************/
   1047 
   1048 /**
   1049  * htmlDocDump:
   1050  * @f:  the FILE*
   1051  * @cur:  the document
   1052  *
   1053  * Dump an HTML document to an open FILE.
   1054  *
   1055  * returns: the number of byte written or -1 in case of failure.
   1056  */
   1057 int
   1058 htmlDocDump(FILE *f, xmlDocPtr cur) {
   1059     xmlOutputBufferPtr buf;
   1060     xmlCharEncodingHandlerPtr handler = NULL;
   1061     const char *encoding;
   1062     int ret;
   1063 
   1064     xmlInitParser();
   1065 
   1066     if ((cur == NULL) || (f == NULL)) {
   1067 	return(-1);
   1068     }
   1069 
   1070     encoding = (const char *) htmlGetMetaEncoding(cur);
   1071 
   1072     if (encoding != NULL) {
   1073 	xmlCharEncoding enc;
   1074 
   1075 	enc = xmlParseCharEncoding(encoding);
   1076 	if (enc != cur->charset) {
   1077 	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
   1078 		/*
   1079 		 * Not supported yet
   1080 		 */
   1081 		return(-1);
   1082 	    }
   1083 
   1084 	    handler = xmlFindCharEncodingHandler(encoding);
   1085 	    if (handler == NULL)
   1086 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
   1087 	} else {
   1088 	    handler = xmlFindCharEncodingHandler(encoding);
   1089 	}
   1090     }
   1091 
   1092     /*
   1093      * Fallback to HTML or ASCII when the encoding is unspecified
   1094      */
   1095     if (handler == NULL)
   1096 	handler = xmlFindCharEncodingHandler("HTML");
   1097     if (handler == NULL)
   1098 	handler = xmlFindCharEncodingHandler("ascii");
   1099 
   1100     buf = xmlOutputBufferCreateFile(f, handler);
   1101     if (buf == NULL) return(-1);
   1102     htmlDocContentDumpOutput(buf, cur, NULL);
   1103 
   1104     ret = xmlOutputBufferClose(buf);
   1105     return(ret);
   1106 }
   1107 
   1108 /**
   1109  * htmlSaveFile:
   1110  * @filename:  the filename (or URL)
   1111  * @cur:  the document
   1112  *
   1113  * Dump an HTML document to a file. If @filename is "-" the stdout file is
   1114  * used.
   1115  * returns: the number of byte written or -1 in case of failure.
   1116  */
   1117 int
   1118 htmlSaveFile(const char *filename, xmlDocPtr cur) {
   1119     xmlOutputBufferPtr buf;
   1120     xmlCharEncodingHandlerPtr handler = NULL;
   1121     const char *encoding;
   1122     int ret;
   1123 
   1124     if ((cur == NULL) || (filename == NULL))
   1125         return(-1);
   1126 
   1127     xmlInitParser();
   1128 
   1129     encoding = (const char *) htmlGetMetaEncoding(cur);
   1130 
   1131     if (encoding != NULL) {
   1132 	xmlCharEncoding enc;
   1133 
   1134 	enc = xmlParseCharEncoding(encoding);
   1135 	if (enc != cur->charset) {
   1136 	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
   1137 		/*
   1138 		 * Not supported yet
   1139 		 */
   1140 		return(-1);
   1141 	    }
   1142 
   1143 	    handler = xmlFindCharEncodingHandler(encoding);
   1144 	    if (handler == NULL)
   1145 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
   1146 	}
   1147     }
   1148 
   1149     /*
   1150      * Fallback to HTML or ASCII when the encoding is unspecified
   1151      */
   1152     if (handler == NULL)
   1153 	handler = xmlFindCharEncodingHandler("HTML");
   1154     if (handler == NULL)
   1155 	handler = xmlFindCharEncodingHandler("ascii");
   1156 
   1157     /*
   1158      * save the content to a temp buffer.
   1159      */
   1160     buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
   1161     if (buf == NULL) return(0);
   1162 
   1163     htmlDocContentDumpOutput(buf, cur, NULL);
   1164 
   1165     ret = xmlOutputBufferClose(buf);
   1166     return(ret);
   1167 }
   1168 
   1169 /**
   1170  * htmlSaveFileFormat:
   1171  * @filename:  the filename
   1172  * @cur:  the document
   1173  * @format:  should formatting spaces been added
   1174  * @encoding: the document encoding
   1175  *
   1176  * Dump an HTML document to a file using a given encoding.
   1177  *
   1178  * returns: the number of byte written or -1 in case of failure.
   1179  */
   1180 int
   1181 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
   1182 	           const char *encoding, int format) {
   1183     xmlOutputBufferPtr buf;
   1184     xmlCharEncodingHandlerPtr handler = NULL;
   1185     int ret;
   1186 
   1187     if ((cur == NULL) || (filename == NULL))
   1188         return(-1);
   1189 
   1190     xmlInitParser();
   1191 
   1192     if (encoding != NULL) {
   1193 	xmlCharEncoding enc;
   1194 
   1195 	enc = xmlParseCharEncoding(encoding);
   1196 	if (enc != cur->charset) {
   1197 	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
   1198 		/*
   1199 		 * Not supported yet
   1200 		 */
   1201 		return(-1);
   1202 	    }
   1203 
   1204 	    handler = xmlFindCharEncodingHandler(encoding);
   1205 	    if (handler == NULL)
   1206 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
   1207 	}
   1208         htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
   1209     } else {
   1210 	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
   1211     }
   1212 
   1213     /*
   1214      * Fallback to HTML or ASCII when the encoding is unspecified
   1215      */
   1216     if (handler == NULL)
   1217 	handler = xmlFindCharEncodingHandler("HTML");
   1218     if (handler == NULL)
   1219 	handler = xmlFindCharEncodingHandler("ascii");
   1220 
   1221     /*
   1222      * save the content to a temp buffer.
   1223      */
   1224     buf = xmlOutputBufferCreateFilename(filename, handler, 0);
   1225     if (buf == NULL) return(0);
   1226 
   1227     htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
   1228 
   1229     ret = xmlOutputBufferClose(buf);
   1230     return(ret);
   1231 }
   1232 
   1233 /**
   1234  * htmlSaveFileEnc:
   1235  * @filename:  the filename
   1236  * @cur:  the document
   1237  * @encoding: the document encoding
   1238  *
   1239  * Dump an HTML document to a file using a given encoding
   1240  * and formatting returns/spaces are added.
   1241  *
   1242  * returns: the number of byte written or -1 in case of failure.
   1243  */
   1244 int
   1245 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
   1246     return(htmlSaveFileFormat(filename, cur, encoding, 1));
   1247 }
   1248 
   1249 #endif /* LIBXML_OUTPUT_ENABLED */
   1250 
   1251 #define bottom_HTMLtree
   1252 #include "elfgcchack.h"
   1253 #endif /* LIBXML_HTML_ENABLED */
   1254