Home | History | Annotate | Download | only in libxml2
      1 /*
      2  * HTMLparser.c : an HTML 4.0 non-verifying parser
      3  *
      4  * See Copyright for the status of this software.
      5  *
      6  * daniel (at) veillard.com
      7  */
      8 
      9 #define IN_LIBXML
     10 #include "libxml.h"
     11 #ifdef LIBXML_HTML_ENABLED
     12 
     13 #include <string.h>
     14 #ifdef HAVE_CTYPE_H
     15 #include <ctype.h>
     16 #endif
     17 #ifdef HAVE_STDLIB_H
     18 #include <stdlib.h>
     19 #endif
     20 #ifdef HAVE_SYS_STAT_H
     21 #include <sys/stat.h>
     22 #endif
     23 #ifdef HAVE_FCNTL_H
     24 #include <fcntl.h>
     25 #endif
     26 #ifdef HAVE_UNISTD_H
     27 #include <unistd.h>
     28 #endif
     29 #ifdef HAVE_ZLIB_H
     30 #include <zlib.h>
     31 #endif
     32 
     33 #include <libxml/xmlmemory.h>
     34 #include <libxml/tree.h>
     35 #include <libxml/parser.h>
     36 #include <libxml/parserInternals.h>
     37 #include <libxml/xmlerror.h>
     38 #include <libxml/HTMLparser.h>
     39 #include <libxml/HTMLtree.h>
     40 #include <libxml/entities.h>
     41 #include <libxml/encoding.h>
     42 #include <libxml/valid.h>
     43 #include <libxml/xmlIO.h>
     44 #include <libxml/globals.h>
     45 #include <libxml/uri.h>
     46 
/*
 * Size limits used by this file.
 * NOTE(review): presumably the maximum element-name length and the sizes of
 * the scratch buffers used while parsing; the users are outside this
 * section - confirm against the callers.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* Uncomment to enable debugging code (both are disabled by default). */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/*
 * File-local flag, initialised to 1.
 * NOTE(review): its consumers are not visible in this section.
 */
static int htmlOmittedDefaultValue = 1;

/* Forward declarations of routines defined later in this file. */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
			     xmlChar end, xmlChar  end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);
     59 
     60 /************************************************************************
     61  *									*
     62  * 		Some factorized error routines				*
     63  *									*
     64  ************************************************************************/
     65 
     66 /**
     67  * htmlErrMemory:
     68  * @ctxt:  an HTML parser context
     69  * @extra:  extra informations
     70  *
     71  * Handle a redefinition of attribute error
     72  */
     73 static void
     74 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
     75 {
     76     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
     77         (ctxt->instate == XML_PARSER_EOF))
     78 	return;
     79     if (ctxt != NULL) {
     80         ctxt->errNo = XML_ERR_NO_MEMORY;
     81         ctxt->instate = XML_PARSER_EOF;
     82         ctxt->disableSAX = 1;
     83     }
     84     if (extra)
     85         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
     86                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
     87                         NULL, NULL, 0, 0,
     88                         "Memory allocation failed : %s\n", extra);
     89     else
     90         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
     91                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
     92                         NULL, NULL, 0, 0, "Memory allocation failed\n");
     93 }
     94 
     95 /**
     96  * htmlParseErr:
     97  * @ctxt:  an HTML parser context
     98  * @error:  the error number
     99  * @msg:  the error message
    100  * @str1:  string infor
    101  * @str2:  string infor
    102  *
    103  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
    104  */
    105 static void
    106 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
    107              const char *msg, const xmlChar *str1, const xmlChar *str2)
    108 {
    109     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
    110         (ctxt->instate == XML_PARSER_EOF))
    111 	return;
    112     if (ctxt != NULL)
    113 	ctxt->errNo = error;
    114     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
    115                     XML_ERR_ERROR, NULL, 0,
    116 		    (const char *) str1, (const char *) str2,
    117 		    NULL, 0, 0,
    118 		    msg, str1, str2);
    119     if (ctxt != NULL)
    120 	ctxt->wellFormed = 0;
    121 }
    122 
    123 /**
    124  * htmlParseErrInt:
    125  * @ctxt:  an HTML parser context
    126  * @error:  the error number
    127  * @msg:  the error message
    128  * @val:  integer info
    129  *
    130  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
    131  */
    132 static void
    133 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
    134              const char *msg, int val)
    135 {
    136     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
    137         (ctxt->instate == XML_PARSER_EOF))
    138 	return;
    139     if (ctxt != NULL)
    140 	ctxt->errNo = error;
    141     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
    142                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
    143 		    NULL, val, 0, msg, val);
    144     if (ctxt != NULL)
    145 	ctxt->wellFormed = 0;
    146 }
    147 
    148 /************************************************************************
    149  *									*
    150  * 		Parser stacks related functions and macros		*
    151  *									*
    152  ************************************************************************/
    153 
    154 /**
    155  * htmlnamePush:
    156  * @ctxt:  an HTML parser context
    157  * @value:  the element name
    158  *
    159  * Pushes a new element name on top of the name stack
    160  *
    161  * Returns 0 in case of error, the index in the stack otherwise
    162  */
    163 static int
    164 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
    165 {
    166     if (ctxt->nameNr >= ctxt->nameMax) {
    167         ctxt->nameMax *= 2;
    168         ctxt->nameTab = (const xmlChar * *)
    169                          xmlRealloc((xmlChar * *)ctxt->nameTab,
    170                                     ctxt->nameMax *
    171                                     sizeof(ctxt->nameTab[0]));
    172         if (ctxt->nameTab == NULL) {
    173             htmlErrMemory(ctxt, NULL);
    174             return (0);
    175         }
    176     }
    177     ctxt->nameTab[ctxt->nameNr] = value;
    178     ctxt->name = value;
    179     return (ctxt->nameNr++);
    180 }
    181 /**
    182  * htmlnamePop:
    183  * @ctxt: an HTML parser context
    184  *
    185  * Pops the top element name from the name stack
    186  *
    187  * Returns the name just removed
    188  */
    189 static const xmlChar *
    190 htmlnamePop(htmlParserCtxtPtr ctxt)
    191 {
    192     const xmlChar *ret;
    193 
    194     if (ctxt->nameNr <= 0)
    195         return (NULL);
    196     ctxt->nameNr--;
    197     if (ctxt->nameNr < 0)
    198         return (NULL);
    199     if (ctxt->nameNr > 0)
    200         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
    201     else
    202         ctxt->name = NULL;
    203     ret = ctxt->nameTab[ctxt->nameNr];
    204     ctxt->nameTab[ctxt->nameNr] = NULL;
    205     return (ret);
    206 }
    207 
    208 /*
    209  * Macros for accessing the content. Those should be used only by the parser,
    210  * and not exported.
    211  *
    212  * Dirty macros, i.e. one need to make assumption on the context to use them
    213  *
    214  *   CUR_PTR return the current pointer to the xmlChar to be parsed.
    215  *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
    216  *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
    217  *           in UNICODE mode. This should be used internally by the parser
    218  *           only to compare to ASCII values otherwise it would break when
    219  *           running with UTF-8 encoding.
    220  *   NXT(n)  returns the n'th next xmlChar. Same as CUR is should be used only
    221  *           to compare on ASCII based substring.
    222  *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
    223  *           it should be used only to compare on ASCII based substring.
    224  *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
    225  *           strings without newlines within the parser.
    226  *
    227  * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
    228  *
    229  *   CURRENT Returns the current char value, with the full decoding of
    230  *           UTF-8 if we are using this mode. It returns an int.
    231  *   NEXT    Skip to the next character, this does the proper decoding
    232  *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
    233  *   NEXTL(l) Skip the current unicode character of l xmlChars long.
    234  *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
    235  */
    236 
    237 #define UPPER (toupper(*ctxt->input->cur))
    238 
    239 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
    240 
    241 #define NXT(val) ctxt->input->cur[(val)]
    242 
    243 #define UPP(val) (toupper(ctxt->input->cur[(val)]))
    244 
    245 #define CUR_PTR ctxt->input->cur
    246 
    247 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
    248 		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
    249 	xmlParserInputShrink(ctxt->input)
    250 
    251 #define GROW if ((ctxt->progressive == 0) &&				\
    252 		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
    253 	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
    254 
    255 #define CURRENT ((int) (*ctxt->input->cur))
    256 
    257 #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
    258 
    259 /* Inported from XML */
    260 
    261 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
    262 #define CUR ((int) (*ctxt->input->cur))
    263 #define NEXT xmlNextChar(ctxt)
    264 
    265 #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
    266 #define NXT(val) ctxt->input->cur[(val)]
    267 #define CUR_PTR ctxt->input->cur
    268 
    269 
    270 #define NEXTL(l) do {							\
    271     if (*(ctxt->input->cur) == '\n') {					\
    272 	ctxt->input->line++; ctxt->input->col = 1;			\
    273     } else ctxt->input->col++;						\
    274     ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;		\
    275   } while (0)
    276 
    277 /************
    278     \
    279     if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    280     if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
    281  ************/
    282 
    283 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
    284 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
    285 
    286 #define COPY_BUF(l,b,i,v)						\
    287     if (l == 1) b[i++] = (xmlChar) v;					\
    288     else i += xmlCopyChar(l,&b[i],v)
    289 
    290 /**
    291  * htmlCurrentChar:
    292  * @ctxt:  the HTML parser context
    293  * @len:  pointer to the length of the char read
    294  *
    295  * The current char value, if using UTF-8 this may actually span multiple
    296  * bytes in the input buffer. Implement the end of line normalization:
    297  * 2.11 End-of-Line Handling
    298  * If the encoding is unspecified, in the case we find an ISO-Latin-1
    299  * char, then the encoding converter is plugged in automatically.
    300  *
    301  * Returns the current char value and its length
    302  */
    303 
    304 static int
    305 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
    306     if (ctxt->instate == XML_PARSER_EOF)
    307 	return(0);
    308 
    309     if (ctxt->token != 0) {
    310 	*len = 0;
    311 	return(ctxt->token);
    312     }
    313     if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
    314 	/*
    315 	 * We are supposed to handle UTF8, check it's valid
    316 	 * From rfc2044: encoding of the Unicode values on UTF-8:
    317 	 *
    318 	 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
    319 	 * 0000 0000-0000 007F   0xxxxxxx
    320 	 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
    321 	 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
    322 	 *
    323 	 * Check for the 0x110000 limit too
    324 	 */
    325 	const unsigned char *cur = ctxt->input->cur;
    326 	unsigned char c;
    327 	unsigned int val;
    328 
    329 	c = *cur;
    330 	if (c & 0x80) {
    331 	    if (cur[1] == 0)
    332 		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
    333 	    if ((cur[1] & 0xc0) != 0x80)
    334 		goto encoding_error;
    335 	    if ((c & 0xe0) == 0xe0) {
    336 
    337 		if (cur[2] == 0)
    338 		    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
    339 		if ((cur[2] & 0xc0) != 0x80)
    340 		    goto encoding_error;
    341 		if ((c & 0xf0) == 0xf0) {
    342 		    if (cur[3] == 0)
    343 			xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
    344 		    if (((c & 0xf8) != 0xf0) ||
    345 			((cur[3] & 0xc0) != 0x80))
    346 			goto encoding_error;
    347 		    /* 4-byte code */
    348 		    *len = 4;
    349 		    val = (cur[0] & 0x7) << 18;
    350 		    val |= (cur[1] & 0x3f) << 12;
    351 		    val |= (cur[2] & 0x3f) << 6;
    352 		    val |= cur[3] & 0x3f;
    353 		} else {
    354 		  /* 3-byte code */
    355 		    *len = 3;
    356 		    val = (cur[0] & 0xf) << 12;
    357 		    val |= (cur[1] & 0x3f) << 6;
    358 		    val |= cur[2] & 0x3f;
    359 		}
    360 	    } else {
    361 	      /* 2-byte code */
    362 		*len = 2;
    363 		val = (cur[0] & 0x1f) << 6;
    364 		val |= cur[1] & 0x3f;
    365 	    }
    366 	    if (!IS_CHAR(val)) {
    367 	        htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
    368 				"Char 0x%X out of allowed range\n", val);
    369 	    }
    370 	    return(val);
    371 	} else {
    372 	    /* 1-byte code */
    373 	    *len = 1;
    374 	    return((int) *ctxt->input->cur);
    375 	}
    376     }
    377     /*
    378      * Assume it's a fixed length encoding (1) with
    379      * a compatible encoding for the ASCII set, since
    380      * XML constructs only use < 128 chars
    381      */
    382     *len = 1;
    383     if ((int) *ctxt->input->cur < 0x80)
    384 	return((int) *ctxt->input->cur);
    385 
    386     /*
    387      * Humm this is bad, do an automatic flow conversion
    388      */
    389     xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
    390     ctxt->charset = XML_CHAR_ENCODING_UTF8;
    391     return(xmlCurrentChar(ctxt, len));
    392 
    393 encoding_error:
    394     /*
    395      * If we detect an UTF8 error that probably mean that the
    396      * input encoding didn't get properly advertized in the
    397      * declaration header. Report the error and switch the encoding
    398      * to ISO-Latin-1 (if you don't like this policy, just declare the
    399      * encoding !)
    400      */
    401     {
    402         char buffer[150];
    403 
    404 	if (ctxt->input->end - ctxt->input->cur >= 4) {
    405 	    snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
    406 			    ctxt->input->cur[0], ctxt->input->cur[1],
    407 			    ctxt->input->cur[2], ctxt->input->cur[3]);
    408 	} else {
    409 	    snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
    410 	}
    411 	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
    412 		     "Input is not proper UTF-8, indicate encoding !\n",
    413 		     BAD_CAST buffer, NULL);
    414     }
    415 
    416     ctxt->charset = XML_CHAR_ENCODING_8859_1;
    417     *len = 1;
    418     return((int) *ctxt->input->cur);
    419 }
    420 
    421 /**
    422  * htmlSkipBlankChars:
    423  * @ctxt:  the HTML parser context
    424  *
    425  * skip all blanks character found at that point in the input streams.
    426  *
    427  * Returns the number of space chars skipped
    428  */
    429 
    430 static int
    431 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
    432     int res = 0;
    433 
    434     while (IS_BLANK_CH(*(ctxt->input->cur))) {
    435 	if ((*ctxt->input->cur == 0) &&
    436 	    (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
    437 		xmlPopInput(ctxt);
    438 	} else {
    439 	    if (*(ctxt->input->cur) == '\n') {
    440 		ctxt->input->line++; ctxt->input->col = 1;
    441 	    } else ctxt->input->col++;
    442 	    ctxt->input->cur++;
    443 	    ctxt->nbChars++;
    444 	    if (*ctxt->input->cur == 0)
    445 		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
    446 	}
    447 	res++;
    448     }
    449     return(res);
    450 }
    451 
    452 
    453 
    454 /************************************************************************
    455  *									*
    456  * 		The list of HTML elements and their properties		*
    457  *									*
    458  ************************************************************************/
    459 
    460 /*
    461  *  Start Tag: 1 means the start tag can be omitted
    462  *  End Tag:   1 means the end tag can be omitted
    463  *             2 means it's forbidden (empty elements)
    464  *             3 means the tag is stylistic and should be closed easily
    465  *  Depr:      this element is deprecated
    466  *  DTD:       1 means that this element is valid only in the Loose DTD
    467  *             2 means that this element is valid only in the Frameset DTD
    468  *
    469  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
    470 	, subElements , impliedsubelt , Attributes, userdata
    471  */
    472 
    473 /* Definitions and a couple of vars for HTML Elements */
    474 
    475 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
    476 #define NB_FONTSTYLE 8
    477 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
    478 #define NB_PHRASE 10
    479 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
    480 #define NB_SPECIAL 16
    481 #define INLINE PCDATA FONTSTYLE PHRASE SPECIAL FORMCTRL
    482 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
    483 #define BLOCK HEADING, LIST "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
    484 #define NB_BLOCK NB_HEADING + NB_LIST + 14
    485 #define FORMCTRL "input", "select", "textarea", "label", "button"
    486 #define NB_FORMCTRL 5
    487 #define PCDATA
    488 #define NB_PCDATA 0
    489 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
    490 #define NB_HEADING 6
    491 #define LIST "ul", "ol", "dir", "menu"
    492 #define NB_LIST 4
    493 #define MODIFIER
    494 #define NB_MODIFIER 0
    495 #define FLOW BLOCK,INLINE
    496 #define NB_FLOW NB_BLOCK + NB_INLINE
    497 #define EMPTY NULL
    498 
    499 
    500 static const char* const html_flow[] = { FLOW, NULL } ;
    501 static const char* const html_inline[] = { INLINE, NULL } ;
    502 
    503 /* placeholders: elts with content but no subelements */
    504 static const char* const html_pcdata[] = { NULL } ;
    505 #define html_cdata html_pcdata
    506 
    507 
    508 /* ... and for HTML Attributes */
    509 
    510 #define COREATTRS "id", "class", "style", "title"
    511 #define NB_COREATTRS 4
    512 #define I18N "lang", "dir"
    513 #define NB_I18N 2
    514 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
    515 #define NB_EVENTS 9
    516 #define ATTRS COREATTRS,I18N,EVENTS
    517 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
    518 #define CELLHALIGN "align", "char", "charoff"
    519 #define NB_CELLHALIGN 3
    520 #define CELLVALIGN "valign"
    521 #define NB_CELLVALIGN 1
    522 
    523 static const char* const html_attrs[] = { ATTRS, NULL } ;
    524 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
    525 static const char* const core_attrs[] = { COREATTRS, NULL } ;
    526 static const char* const i18n_attrs[] = { I18N, NULL } ;
    527 
    528 
    529 /* Other declarations that should go inline ... */
    530 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
    531 	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
    532 	"tabindex", "onfocus", "onblur", NULL } ;
    533 static const char* const target_attr[] = { "target", NULL } ;
    534 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
    535 static const char* const alt_attr[] = { "alt", NULL } ;
    536 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
    537 static const char* const href_attrs[] = { "href", NULL } ;
    538 static const char* const clear_attrs[] = { "clear", NULL } ;
    539 static const char* const inline_p[] = { INLINE, "p", NULL } ;
    540 
    541 static const char* const flow_param[] = { FLOW, "param", NULL } ;
    542 static const char* const applet_attrs[] = { COREATTRS , "codebase",
    543 		"archive", "alt", "name", "height", "width", "align",
    544 		"hspace", "vspace", NULL } ;
    545 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
    546 	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
    547 static const char* const basefont_attrs[] =
    548 	{ "id", "size", "color", "face", NULL } ;
    549 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
    550 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
    551 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
    552 static const char* const body_depr[] = { "background", "bgcolor", "text",
    553 	"link", "vlink", "alink", NULL } ;
    554 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
    555 	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
    556 
    557 
    558 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
    559 static const char* const col_elt[] = { "col", NULL } ;
    560 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
    561 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
    562 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
    563 static const char* const compact_attr[] = { "compact", NULL } ;
    564 static const char* const label_attr[] = { "label", NULL } ;
    565 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
    566 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
    567 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
    568 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
    569 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
    570 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
    571 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
    572 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
    573 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
    574 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
    575 static const char* const version_attr[] = { "version", NULL } ;
    576 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
    577 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
    578 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
    579 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
    580 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
    581 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
    582 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
    583 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
    584 static const char* const align_attr[] = { "align", NULL } ;
    585 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
    586 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
    587 static const char* const name_attr[] = { "name", NULL } ;
    588 static const char* const action_attr[] = { "action", NULL } ;
    589 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
    590 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
    591 static const char* const content_attr[] = { "content", NULL } ;
    592 static const char* const type_attr[] = { "type", NULL } ;
    593 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
    594 static const char* const object_contents[] = { FLOW, "param", NULL } ;
    595 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
    596 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
    597 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
    598 static const char* const option_elt[] = { "option", NULL } ;
    599 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
    600 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
    601 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
    602 static const char* const width_attr[] = { "width", NULL } ;
    603 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
    604 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
    605 static const char* const language_attr[] = { "language", NULL } ;
    606 static const char* const select_content[] = { "optgroup", "option", NULL } ;
    607 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
    608 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
    609 static const char* const table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
    610 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
    611 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
    612 static const char* const tr_elt[] = { "tr", NULL } ;
    613 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
    614 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
    615 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
    616 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
    617 static const char* const tr_contents[] = { "th", "td", NULL } ;
    618 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
    619 static const char* const li_elt[] = { "li", NULL } ;
    620 static const char* const ul_depr[] = { "type", "compact", NULL} ;
    621 static const char* const dir_attr[] = { "dir", NULL} ;
    622 
    623 #define DECL (const char**)
    624 
    625 static const htmlElemDesc
    626 html40ElementTable[] = {
    627 { "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
    628 	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
    629 },
    630 { "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
    631 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    632 },
    633 { "acronym",	0, 0, 0, 0, 0, 0, 1, "",
    634 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    635 },
    636 { "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
    637 	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
    638 },
    639 { "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
    640 	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
    641 },
    642 { "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
    643 	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
    644 },
    645 { "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
    646 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    647 },
    648 { "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
    649 	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
    650 },
    651 { "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
    652 	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
    653 },
    654 { "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
    655 	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
    656 },
    657 { "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
    658 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    659 },
    660 { "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
    661 	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
    662 },
    663 { "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
    664 	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
    665 },
    666 { "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
    667 	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
    668 },
    669 { "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
    670 	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
    671 },
    672 { "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
    673 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    674 },
    675 { "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
    676 	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
    677 },
    678 { "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
    679 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    680 },
    681 { "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
    682 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    683 },
    684 { "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
    685 	EMPTY , NULL , DECL col_attrs , NULL, NULL
    686 },
    687 { "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
    688 	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
    689 },
    690 { "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
    691 	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
    692 },
    693 { "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
    694 	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
    695 },
    696 { "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
    697 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    698 },
    699 { "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
    700 	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
    701 },
    702 { "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
    703 	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
    704 },
    705 { "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
    706 	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
    707 },
    708 { "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
    709 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    710 },
    711 { "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
    712 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    713 },
    714 { "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
    715 	EMPTY, NULL, DECL embed_attrs, NULL, NULL
    716 },
    717 { "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
    718 	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
    719 },
    720 { "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
    721 	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
    722 },
    723 { "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
    724 	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
    725 },
    726 { "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
    727 	EMPTY, NULL, NULL, DECL frame_attrs, NULL
    728 },
    729 { "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
    730 	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
    731 },
    732 { "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
    733 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    734 },
    735 { "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
    736 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    737 },
    738 { "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
    739 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    740 },
    741 { "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
    742 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    743 },
    744 { "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
    745 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    746 },
    747 { "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
    748 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    749 },
    750 { "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
    751 	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
    752 },
    753 { "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
    754 	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
    755 },
    756 { "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
    757 	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
    758 },
    759 { "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
    760 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    761 },
    762 { "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
    763 	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
    764 },
    765 { "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
    766 	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
    767 },
    768 { "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
    769 	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
    770 },
    771 { "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
    772 	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
    773 },
    774 { "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
    775 	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
    776 },
    777 { "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
    778 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    779 },
    780 { "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
    781 	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
    782 },
    783 { "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
    784 	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
    785 },
    786 { "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
    787 	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
    788 },
    789 { "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
    790 	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
    791 },
    792 { "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
    793 	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
    794 },
    795 { "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
    796 	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
    797 },
    798 { "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
    799 	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
    800 },
    801 { "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
    802 	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
    803 },
    804 { "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
    805 	DECL html_flow, "div", DECL html_attrs, NULL, NULL
    806 },
    807 { "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
    808 	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
    809 },
    810 { "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
    811 	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
    812 },
    813 { "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
    814 	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
    815 },
    816 { "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
    817 	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
    818 },
    819 { "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
    820 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    821 },
    822 { "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
    823 	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
    824 },
    825 { "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
    826 	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
    827 },
    828 { "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
    829 	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
    830 },
    831 { "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
    832 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
    833 },
    834 { "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
    835 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    836 },
    837 { "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
    838 	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
    839 },
    840 { "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
    841 	DECL select_content, NULL, DECL select_attrs, NULL, NULL
    842 },
    843 { "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
    844 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    845 },
    846 { "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
    847 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    848 },
    849 { "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
    850 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
    851 },
    852 { "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
    853 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    854 },
    855 { "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
    856 	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
    857 },
    858 { "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
    859 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    860 },
    861 { "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
    862 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    863 },
    864 { "table",	0, 0, 0, 0, 0, 0, 0, "",
    865 	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
    866 },
    867 { "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
    868 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
    869 },
    870 { "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
    871 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
    872 },
    873 { "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
    874 	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
    875 },
    876 { "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
    877 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
    878 },
    879 { "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
    880 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
    881 },
    882 { "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
    883 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
    884 },
    885 { "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
    886 	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
    887 },
    888 { "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
    889 	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
    890 },
    891 { "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
    892 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    893 },
    894 { "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
    895 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
    896 },
    897 { "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
    898 	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
    899 },
    900 { "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
    901 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    902 }
    903 };
    904 
    905 /*
    906  * start tags that imply the end of current element
    907  */
    908 static const char * const htmlStartClose[] = {
    909 "form",		"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
    910 		"dl", "ul", "ol", "menu", "dir", "address", "pre",
    911 		"listing", "xmp", "head", NULL,
    912 "head",		"p", NULL,
    913 "title",	"p", NULL,
    914 "body",		"head", "style", "link", "title", "p", NULL,
    915 "frameset",	"head", "style", "link", "title", "p", NULL,
    916 "li",		"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
    917 		"pre", "listing", "xmp", "head", "li", NULL,
    918 "hr",		"p", "head", NULL,
    919 "h1",		"p", "head", NULL,
    920 "h2",		"p", "head", NULL,
    921 "h3",		"p", "head", NULL,
    922 "h4",		"p", "head", NULL,
    923 "h5",		"p", "head", NULL,
    924 "h6",		"p", "head", NULL,
    925 "dir",		"p", "head", NULL,
    926 "address",	"p", "head", "ul", NULL,
    927 "pre",		"p", "head", "ul", NULL,
    928 "listing",	"p", "head", NULL,
    929 "xmp",		"p", "head", NULL,
    930 "blockquote",	"p", "head", NULL,
    931 "dl",		"p", "dt", "menu", "dir", "address", "pre", "listing",
    932 		"xmp", "head", NULL,
    933 "dt",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
    934                 "head", "dd", NULL,
    935 "dd",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
    936                 "head", "dt", NULL,
    937 "ul",		"p", "head", "ol", "menu", "dir", "address", "pre",
    938 		"listing", "xmp", NULL,
    939 "ol",		"p", "head", "ul", NULL,
    940 "menu",		"p", "head", "ul", NULL,
    941 "p",		"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    942 "div",		"p", "head", NULL,
    943 "noscript",	"p", "head", NULL,
    944 "center",	"font", "b", "i", "p", "head", NULL,
    945 "a",		"a", NULL,
    946 "caption",	"p", NULL,
    947 "colgroup",	"caption", "colgroup", "col", "p", NULL,
    948 "col",		"caption", "col", "p", NULL,
    949 "table",	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
    950 		"listing", "xmp", "a", NULL,
    951 "th",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    952 "td",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    953 "tr",		"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    954 "thead",	"caption", "col", "colgroup", NULL,
    955 "tfoot",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
    956 		"tbody", "p", NULL,
    957 "tbody",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
    958 		"tfoot", "tbody", "p", NULL,
    959 "optgroup",	"option", NULL,
    960 "option",	"option", NULL,
    961 "fieldset",	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
    962 		"pre", "listing", "xmp", "a", NULL,
    963 NULL
    964 };
    965 
    966 /*
    967  * The list of HTML elements which are supposed not to have
    968  * CDATA content and where a p element will be implied
    969  *
    970  * TODO: extend that list by reading the HTML SGML DTD on
    971  *       implied paragraph
    972  */
    973 static const char *const htmlNoContentElements[] = {
    974     "html",
    975     "head",
    976     NULL
    977 };
    978 
    979 /*
    980  * The list of HTML attributes which are of content %Script;
    981  * NOTE: when adding ones, check htmlIsScriptAttribute() since
    982  *       it assumes the name starts with 'on'
    983  */
    984 static const char *const htmlScriptAttributes[] = {
    985     "onclick",
    986     "ondblclick",
    987     "onmousedown",
    988     "onmouseup",
    989     "onmouseover",
    990     "onmousemove",
    991     "onmouseout",
    992     "onkeypress",
    993     "onkeydown",
    994     "onkeyup",
    995     "onload",
    996     "onunload",
    997     "onfocus",
    998     "onblur",
    999     "onsubmit",
   1000     "onrest",
   1001     "onchange",
   1002     "onselect"
   1003 };
   1004 
   1005 /*
   1006  * This table is used by the htmlparser to know what to do with
   1007  * broken html pages. By assigning different priorities to different
   1008  * elements the parser can decide how to handle extra endtags.
   1009  * Endtags are only allowed to close elements with lower or equal
   1010  * priority.
   1011  */
   1012 
   1013 typedef struct {
   1014     const char *name;
   1015     int priority;
   1016 } elementPriority;
   1017 
   1018 static const elementPriority htmlEndPriority[] = {
   1019     {"div",   150},
   1020     {"td",    160},
   1021     {"th",    160},
   1022     {"tr",    170},
   1023     {"thead", 180},
   1024     {"tbody", 180},
   1025     {"tfoot", 180},
   1026     {"table", 190},
   1027     {"head",  200},
   1028     {"body",  200},
   1029     {"html",  220},
   1030     {NULL,    100} /* Default priority */
   1031 };
   1032 
   1033 static const char** htmlStartCloseIndex[100];
   1034 static int htmlStartCloseIndexinitialized = 0;
   1035 
   1036 /************************************************************************
   1037  *									*
   1038  * 		functions to handle HTML specific data			*
   1039  *									*
   1040  ************************************************************************/
   1041 
   1042 /**
   1043  * htmlInitAutoClose:
   1044  *
   1045  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
   1046  * This is not reentrant. Call xmlInitParser() once before processing in
   1047  * case of use in multithreaded programs.
   1048  */
   1049 void
   1050 htmlInitAutoClose(void) {
   1051     int indx, i = 0;
   1052 
   1053     if (htmlStartCloseIndexinitialized) return;
   1054 
   1055     for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
   1056     indx = 0;
   1057     while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
   1058         htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
   1059 	while (htmlStartClose[i] != NULL) i++;
   1060 	i++;
   1061     }
   1062     htmlStartCloseIndexinitialized = 1;
   1063 }
   1064 
   1065 /**
   1066  * htmlTagLookup:
   1067  * @tag:  The tag name in lowercase
   1068  *
   1069  * Lookup the HTML tag in the ElementTable
   1070  *
   1071  * Returns the related htmlElemDescPtr or NULL if not found.
   1072  */
   1073 const htmlElemDesc *
   1074 htmlTagLookup(const xmlChar *tag) {
   1075     unsigned int i;
   1076 
   1077     for (i = 0; i < (sizeof(html40ElementTable) /
   1078                      sizeof(html40ElementTable[0]));i++) {
   1079         if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
   1080 	    return((htmlElemDescPtr) &html40ElementTable[i]);
   1081     }
   1082     return(NULL);
   1083 }
   1084 
   1085 /**
   1086  * htmlGetEndPriority:
   1087  * @name: The name of the element to look up the priority for.
   1088  *
   1089  * Return value: The "endtag" priority.
   1090  **/
   1091 static int
   1092 htmlGetEndPriority (const xmlChar *name) {
   1093     int i = 0;
   1094 
   1095     while ((htmlEndPriority[i].name != NULL) &&
   1096 	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
   1097 	i++;
   1098 
   1099     return(htmlEndPriority[i].priority);
   1100 }
   1101 
   1102 
   1103 /**
   1104  * htmlCheckAutoClose:
   1105  * @newtag:  The new tag name
   1106  * @oldtag:  The old tag name
   1107  *
   1108  * Checks whether the new tag is one of the registered valid tags for
   1109  * closing old.
   1110  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
   1111  *
   1112  * Returns 0 if no, 1 if yes.
   1113  */
   1114 static int
   1115 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
   1116 {
   1117     int i, indx;
   1118     const char **closed = NULL;
   1119 
   1120     if (htmlStartCloseIndexinitialized == 0)
   1121         htmlInitAutoClose();
   1122 
   1123     /* inefficient, but not a big deal */
   1124     for (indx = 0; indx < 100; indx++) {
   1125         closed = htmlStartCloseIndex[indx];
   1126         if (closed == NULL)
   1127             return (0);
   1128         if (xmlStrEqual(BAD_CAST * closed, newtag))
   1129             break;
   1130     }
   1131 
   1132     i = closed - htmlStartClose;
   1133     i++;
   1134     while (htmlStartClose[i] != NULL) {
   1135         if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
   1136             return (1);
   1137         }
   1138         i++;
   1139     }
   1140     return (0);
   1141 }
   1142 
   1143 /**
   1144  * htmlAutoCloseOnClose:
   1145  * @ctxt:  an HTML parser context
   1146  * @newtag:  The new tag name
   1147  * @force:  force the tag closure
   1148  *
   1149  * The HTML DTD allows an ending tag to implicitly close other tags.
   1150  */
   1151 static void
   1152 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
   1153 {
   1154     const htmlElemDesc *info;
   1155     int i, priority;
   1156 
   1157     priority = htmlGetEndPriority(newtag);
   1158 
   1159     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
   1160 
   1161         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
   1162             break;
   1163         /*
   1164          * A missplaced endtag can only close elements with lower
   1165          * or equal priority, so if we find an element with higher
   1166          * priority before we find an element with
   1167          * matching name, we just ignore this endtag
   1168          */
   1169         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
   1170             return;
   1171     }
   1172     if (i < 0)
   1173         return;
   1174 
   1175     while (!xmlStrEqual(newtag, ctxt->name)) {
   1176         info = htmlTagLookup(ctxt->name);
   1177         if ((info != NULL) && (info->endTag == 3)) {
   1178             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
   1179 	                 "Opening and ending tag mismatch: %s and %s\n",
   1180 			 newtag, ctxt->name);
   1181         }
   1182         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   1183             ctxt->sax->endElement(ctxt->userData, ctxt->name);
   1184 	htmlnamePop(ctxt);
   1185     }
   1186 }
   1187 
   1188 /**
   1189  * htmlAutoCloseOnEnd:
   1190  * @ctxt:  an HTML parser context
   1191  *
   1192  * Close all remaining tags at the end of the stream
   1193  */
   1194 static void
   1195 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
   1196 {
   1197     int i;
   1198 
   1199     if (ctxt->nameNr == 0)
   1200         return;
   1201     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
   1202         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   1203             ctxt->sax->endElement(ctxt->userData, ctxt->name);
   1204 	htmlnamePop(ctxt);
   1205     }
   1206 }
   1207 
   1208 /**
   1209  * htmlAutoClose:
   1210  * @ctxt:  an HTML parser context
   1211  * @newtag:  The new tag name or NULL
   1212  *
   1213  * The HTML DTD allows a tag to implicitly close other tags.
   1214  * The list is kept in htmlStartClose array. This function is
   1215  * called when a new tag has been detected and generates the
   1216  * appropriates closes if possible/needed.
   1217  * If newtag is NULL this mean we are at the end of the resource
   1218  * and we should check
   1219  */
   1220 static void
   1221 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
   1222 {
   1223     while ((newtag != NULL) && (ctxt->name != NULL) &&
   1224            (htmlCheckAutoClose(newtag, ctxt->name))) {
   1225         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   1226             ctxt->sax->endElement(ctxt->userData, ctxt->name);
   1227 	htmlnamePop(ctxt);
   1228     }
   1229     if (newtag == NULL) {
   1230         htmlAutoCloseOnEnd(ctxt);
   1231         return;
   1232     }
   1233     while ((newtag == NULL) && (ctxt->name != NULL) &&
   1234            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
   1235             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
   1236             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
   1237         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   1238             ctxt->sax->endElement(ctxt->userData, ctxt->name);
   1239 	htmlnamePop(ctxt);
   1240     }
   1241 }
   1242 
   1243 /**
   1244  * htmlAutoCloseTag:
   1245  * @doc:  the HTML document
   1246  * @name:  The tag name
   1247  * @elem:  the HTML element
   1248  *
   1249  * The HTML DTD allows a tag to implicitly close other tags.
   1250  * The list is kept in htmlStartClose array. This function checks
   1251  * if the element or one of it's children would autoclose the
   1252  * given tag.
   1253  *
   1254  * Returns 1 if autoclose, 0 otherwise
   1255  */
   1256 int
   1257 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
   1258     htmlNodePtr child;
   1259 
   1260     if (elem == NULL) return(1);
   1261     if (xmlStrEqual(name, elem->name)) return(0);
   1262     if (htmlCheckAutoClose(elem->name, name)) return(1);
   1263     child = elem->children;
   1264     while (child != NULL) {
   1265         if (htmlAutoCloseTag(doc, name, child)) return(1);
   1266 	child = child->next;
   1267     }
   1268     return(0);
   1269 }
   1270 
   1271 /**
   1272  * htmlIsAutoClosed:
   1273  * @doc:  the HTML document
   1274  * @elem:  the HTML element
   1275  *
   1276  * The HTML DTD allows a tag to implicitly close other tags.
   1277  * The list is kept in htmlStartClose array. This function checks
   1278  * if a tag is autoclosed by one of it's child
   1279  *
   1280  * Returns 1 if autoclosed, 0 otherwise
   1281  */
   1282 int
   1283 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
   1284     htmlNodePtr child;
   1285 
   1286     if (elem == NULL) return(1);
   1287     child = elem->children;
   1288     while (child != NULL) {
   1289 	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
   1290 	child = child->next;
   1291     }
   1292     return(0);
   1293 }
   1294 
   1295 /**
   1296  * htmlCheckImplied:
   1297  * @ctxt:  an HTML parser context
   1298  * @newtag:  The new tag name
   1299  *
   1300  * The HTML DTD allows a tag to exists only implicitly
   1301  * called when a new tag has been detected and generates the
   1302  * appropriates implicit tags if missing
   1303  */
   1304 static void
   1305 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
   1306     if (!htmlOmittedDefaultValue)
   1307 	return;
   1308     if (xmlStrEqual(newtag, BAD_CAST"html"))
   1309 	return;
   1310     if (ctxt->nameNr <= 0) {
   1311 	htmlnamePush(ctxt, BAD_CAST"html");
   1312 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1313 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
   1314     }
   1315     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
   1316         return;
   1317     if ((ctxt->nameNr <= 1) &&
   1318         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
   1319 	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
   1320 	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
   1321 	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
   1322 	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
   1323 	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
   1324 	    /*
   1325 	     * dropped OBJECT ... i you put it first BODY will be
   1326 	     * assumed !
   1327 	     */
   1328 	    htmlnamePush(ctxt, BAD_CAST"head");
   1329 	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1330 		ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
   1331     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
   1332 	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
   1333 	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
   1334 	int i;
   1335 	for (i = 0;i < ctxt->nameNr;i++) {
   1336 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
   1337 		return;
   1338 	    }
   1339 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
   1340 		return;
   1341 	    }
   1342 	}
   1343 
   1344 	htmlnamePush(ctxt, BAD_CAST"body");
   1345 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1346 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
   1347     }
   1348 }
   1349 
   1350 /**
   1351  * htmlCheckParagraph
   1352  * @ctxt:  an HTML parser context
   1353  *
   1354  * Check whether a p element need to be implied before inserting
   1355  * characters in the current element.
   1356  *
   1357  * Returns 1 if a paragraph has been inserted, 0 if not and -1
   1358  *         in case of error.
   1359  */
   1360 
   1361 static int
   1362 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
   1363     const xmlChar *tag;
   1364     int i;
   1365 
   1366     if (ctxt == NULL)
   1367 	return(-1);
   1368     tag = ctxt->name;
   1369     if (tag == NULL) {
   1370 	htmlAutoClose(ctxt, BAD_CAST"p");
   1371 	htmlCheckImplied(ctxt, BAD_CAST"p");
   1372 	htmlnamePush(ctxt, BAD_CAST"p");
   1373 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1374 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
   1375 	return(1);
   1376     }
   1377     if (!htmlOmittedDefaultValue)
   1378 	return(0);
   1379     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
   1380 	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
   1381 	    htmlAutoClose(ctxt, BAD_CAST"p");
   1382 	    htmlCheckImplied(ctxt, BAD_CAST"p");
   1383 	    htmlnamePush(ctxt, BAD_CAST"p");
   1384 	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1385 		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
   1386 	    return(1);
   1387 	}
   1388     }
   1389     return(0);
   1390 }
   1391 
   1392 /**
   1393  * htmlIsScriptAttribute:
   1394  * @name:  an attribute name
   1395  *
   1396  * Check if an attribute is of content type Script
   1397  *
   1398  * Returns 1 is the attribute is a script 0 otherwise
   1399  */
   1400 int
   1401 htmlIsScriptAttribute(const xmlChar *name) {
   1402     unsigned int i;
   1403 
   1404     if (name == NULL)
   1405        	return(0);
   1406     /*
   1407      * all script attributes start with 'on'
   1408      */
   1409     if ((name[0] != 'o') || (name[1] != 'n'))
   1410        	return(0);
   1411     for (i = 0;
   1412 	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
   1413 	 i++) {
   1414 	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
   1415 	    return(1);
   1416     }
   1417     return(0);
   1418 }
   1419 
   1420 /************************************************************************
   1421  *									*
   1422  * 		The list of HTML predefined entities			*
   1423  *									*
   1424  ************************************************************************/
   1425 
   1426 
   1427 static const htmlEntityDesc  html40EntitiesTable[] = {
   1428 /*
   1429  * the 4 absolute ones, plus apostrophe.
   1430  */
   1431 { 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
   1432 { 38,	"amp",	"ampersand, U+0026 ISOnum" },
   1433 { 39,	"apos",	"single quote" },
   1434 { 60,	"lt",	"less-than sign, U+003C ISOnum" },
   1435 { 62,	"gt",	"greater-than sign, U+003E ISOnum" },
   1436 
   1437 /*
   1438  * A bunch still in the 128-255 range
   1439  * Replacing them depend really on the charset used.
   1440  */
   1441 { 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
   1442 { 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
   1443 { 162,	"cent",	"cent sign, U+00A2 ISOnum" },
   1444 { 163,	"pound","pound sign, U+00A3 ISOnum" },
   1445 { 164,	"curren","currency sign, U+00A4 ISOnum" },
   1446 { 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
   1447 { 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
   1448 { 167,	"sect",	"section sign, U+00A7 ISOnum" },
   1449 { 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
   1450 { 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
   1451 { 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
   1452 { 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
   1453 { 172,	"not",	"not sign, U+00AC ISOnum" },
   1454 { 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
   1455 { 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
   1456 { 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
   1457 { 176,	"deg",	"degree sign, U+00B0 ISOnum" },
   1458 { 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
   1459 { 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
   1460 { 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
   1461 { 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
   1462 { 181,	"micro","micro sign, U+00B5 ISOnum" },
   1463 { 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
   1464 { 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
   1465 { 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
   1466 { 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
   1467 { 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
   1468 { 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
   1469 { 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
   1470 { 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
   1471 { 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
   1472 { 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
   1473 { 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
   1474 { 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
   1475 { 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
   1476 { 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
   1477 { 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
   1478 { 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
   1479 { 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
   1480 { 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
   1481 { 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
   1482 { 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
   1483 { 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
   1484 { 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
   1485 { 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
   1486 { 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
   1487 { 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
   1488 { 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
   1489 { 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
   1490 { 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
   1491 { 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
   1492 { 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
   1493 { 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
   1494 { 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
   1495 { 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
   1496 { 215,	"times","multiplication sign, U+00D7 ISOnum" },
   1497 { 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
   1498 { 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
   1499 { 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
   1500 { 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
   1501 { 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
   1502 { 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
   1503 { 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
   1504 { 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
   1505 { 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
   1506 { 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
   1507 { 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
   1508 { 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
   1509 { 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
   1510 { 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
   1511 { 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
   1512 { 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
   1513 { 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
   1514 { 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
   1515 { 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
   1516 { 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
   1517 { 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
   1518 { 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
   1519 { 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
   1520 { 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
   1521 { 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
   1522 { 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
   1523 { 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
   1524 { 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
   1525 { 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
   1526 { 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
   1527 { 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
   1528 { 247,	"divide","division sign, U+00F7 ISOnum" },
   1529 { 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
   1530 { 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
   1531 { 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
   1532 { 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
   1533 { 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
   1534 { 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
   1535 { 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
   1536 { 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },
   1537 
   1538 { 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
   1539 { 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
   1540 { 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
   1541 { 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
   1542 { 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },
   1543 
   1544 /*
   1545  * Anything below should really be kept as entities references
   1546  */
   1547 { 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },
   1548 
   1549 { 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
   1550 { 732,	"tilde","small tilde, U+02DC ISOdia" },
   1551 
   1552 { 913,	"Alpha","greek capital letter alpha, U+0391" },
   1553 { 914,	"Beta",	"greek capital letter beta, U+0392" },
   1554 { 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
   1555 { 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
   1556 { 917,	"Epsilon","greek capital letter epsilon, U+0395" },
   1557 { 918,	"Zeta",	"greek capital letter zeta, U+0396" },
   1558 { 919,	"Eta",	"greek capital letter eta, U+0397" },
   1559 { 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
   1560 { 921,	"Iota",	"greek capital letter iota, U+0399" },
   1561 { 922,	"Kappa","greek capital letter kappa, U+039A" },
   1562 { 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
   1563 { 924,	"Mu",	"greek capital letter mu, U+039C" },
   1564 { 925,	"Nu",	"greek capital letter nu, U+039D" },
   1565 { 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
   1566 { 927,	"Omicron","greek capital letter omicron, U+039F" },
   1567 { 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
   1568 { 929,	"Rho",	"greek capital letter rho, U+03A1" },
   1569 { 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
   1570 { 932,	"Tau",	"greek capital letter tau, U+03A4" },
   1571 { 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
   1572 { 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
   1573 { 935,	"Chi",	"greek capital letter chi, U+03A7" },
   1574 { 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
   1575 { 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },
   1576 
   1577 { 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
   1578 { 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
   1579 { 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
   1580 { 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
   1581 { 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
   1582 { 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
   1583 { 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
   1584 { 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
   1585 { 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
   1586 { 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
   1587 { 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
   1588 { 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
   1589 { 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
   1590 { 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
   1591 { 959,	"omicron","greek small letter omicron, U+03BF NEW" },
   1592 { 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
   1593 { 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
   1594 { 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
   1595 { 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
   1596 { 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
   1597 { 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
   1598 { 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
   1599 { 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
   1600 { 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
   1601 { 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
   1602 { 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
   1603 { 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
   1604 { 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },
   1605 
   1606 { 8194,	"ensp",	"en space, U+2002 ISOpub" },
   1607 { 8195,	"emsp",	"em space, U+2003 ISOpub" },
   1608 { 8201,	"thinsp","thin space, U+2009 ISOpub" },
   1609 { 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
   1610 { 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
   1611 { 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
   1612 { 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
   1613 { 8211,	"ndash","en dash, U+2013 ISOpub" },
   1614 { 8212,	"mdash","em dash, U+2014 ISOpub" },
   1615 { 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
   1616 { 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
   1617 { 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
   1618 { 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
   1619 { 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
   1620 { 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
   1621 { 8224,	"dagger","dagger, U+2020 ISOpub" },
   1622 { 8225,	"Dagger","double dagger, U+2021 ISOpub" },
   1623 
   1624 { 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
   1625 { 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
   1626 
   1627 { 8240,	"permil","per mille sign, U+2030 ISOtech" },
   1628 
   1629 { 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
   1630 { 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },
   1631 
   1632 { 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
   1633 { 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
   1634 
   1635 { 8254,	"oline","overline = spacing overscore, U+203E NEW" },
   1636 { 8260,	"frasl","fraction slash, U+2044 NEW" },
   1637 
   1638 { 8364,	"euro",	"euro sign, U+20AC NEW" },
   1639 
   1640 { 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
   1641 { 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
   1642 { 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
   1643 { 8482,	"trade","trade mark sign, U+2122 ISOnum" },
   1644 { 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
   1645 { 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
   1646 { 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
   1647 { 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
   1648 { 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
   1649 { 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
   1650 { 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
   1651 { 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
   1652 { 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
   1653 { 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
   1654 { 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
   1655 { 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },
   1656 
   1657 { 8704,	"forall","for all, U+2200 ISOtech" },
   1658 { 8706,	"part",	"partial differential, U+2202 ISOtech" },
   1659 { 8707,	"exist","there exists, U+2203 ISOtech" },
   1660 { 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
   1661 { 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
   1662 { 8712,	"isin",	"element of, U+2208 ISOtech" },
   1663 { 8713,	"notin","not an element of, U+2209 ISOtech" },
   1664 { 8715,	"ni",	"contains as member, U+220B ISOtech" },
   1665 { 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
   1666 { 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
   1667 { 8722,	"minus","minus sign, U+2212 ISOtech" },
   1668 { 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
   1669 { 8730,	"radic","square root = radical sign, U+221A ISOtech" },
   1670 { 8733,	"prop",	"proportional to, U+221D ISOtech" },
   1671 { 8734,	"infin","infinity, U+221E ISOtech" },
   1672 { 8736,	"ang",	"angle, U+2220 ISOamso" },
   1673 { 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
   1674 { 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
   1675 { 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
   1676 { 8746,	"cup",	"union = cup, U+222A ISOtech" },
   1677 { 8747,	"int",	"integral, U+222B ISOtech" },
   1678 { 8756,	"there4","therefore, U+2234 ISOtech" },
   1679 { 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
   1680 { 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
   1681 { 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
   1682 { 8800,	"ne",	"not equal to, U+2260 ISOtech" },
   1683 { 8801,	"equiv","identical to, U+2261 ISOtech" },
   1684 { 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
   1685 { 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
   1686 { 8834,	"sub",	"subset of, U+2282 ISOtech" },
   1687 { 8835,	"sup",	"superset of, U+2283 ISOtech" },
   1688 { 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
   1689 { 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
   1690 { 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
   1691 { 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
   1692 { 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
   1693 { 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
   1694 { 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
   1695 { 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
   1696 { 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
   1697 { 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
   1698 { 8971,	"rfloor","right floor, U+230B ISOamsc" },
   1699 { 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
   1700 { 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
   1701 { 9674,	"loz",	"lozenge, U+25CA ISOpub" },
   1702 
   1703 { 9824,	"spades","black spade suit, U+2660 ISOpub" },
   1704 { 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
   1705 { 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
   1706 { 9830,	"diams","black diamond suit, U+2666 ISOpub" },
   1707 
   1708 };
   1709 
   1710 /************************************************************************
   1711  *									*
   1712  *		Commodity functions to handle entities			*
   1713  *									*
   1714  ************************************************************************/
   1715 
   1716 /*
   1717  * Macro used to grow the current buffer.
   1718  */
   1719 #define growBuffer(buffer) {						\
   1720     xmlChar *tmp;							\
   1721     buffer##_size *= 2;							\
   1722     tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
   1723     if (tmp == NULL) {						\
   1724 	htmlErrMemory(ctxt, "growing buffer\n");			\
   1725 	xmlFree(buffer);						\
   1726 	return(NULL);							\
   1727     }									\
   1728     buffer = tmp;							\
   1729 }
   1730 
   1731 /**
   1732  * htmlEntityLookup:
   1733  * @name: the entity name
   1734  *
   1735  * Lookup the given entity in EntitiesTable
   1736  *
   1737  * TODO: the linear scan is really ugly, an hash table is really needed.
   1738  *
   1739  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
   1740  */
   1741 const htmlEntityDesc *
   1742 htmlEntityLookup(const xmlChar *name) {
   1743     unsigned int i;
   1744 
   1745     for (i = 0;i < (sizeof(html40EntitiesTable)/
   1746                     sizeof(html40EntitiesTable[0]));i++) {
   1747         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
   1748             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
   1749 	}
   1750     }
   1751     return(NULL);
   1752 }
   1753 
   1754 /**
   1755  * htmlEntityValueLookup:
   1756  * @value: the entity's unicode value
   1757  *
   1758  * Lookup the given entity in EntitiesTable
   1759  *
   1760  * TODO: the linear scan is really ugly, an hash table is really needed.
   1761  *
   1762  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
   1763  */
   1764 const htmlEntityDesc *
   1765 htmlEntityValueLookup(unsigned int value) {
   1766     unsigned int i;
   1767 
   1768     for (i = 0;i < (sizeof(html40EntitiesTable)/
   1769                     sizeof(html40EntitiesTable[0]));i++) {
   1770         if (html40EntitiesTable[i].value >= value) {
   1771 	    if (html40EntitiesTable[i].value > value)
   1772 		break;
   1773             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
   1774 	}
   1775     }
   1776     return(NULL);
   1777 }
   1778 
   1779 /**
   1780  * UTF8ToHtml:
   1781  * @out:  a pointer to an array of bytes to store the result
   1782  * @outlen:  the length of @out
   1783  * @in:  a pointer to an array of UTF-8 chars
   1784  * @inlen:  the length of @in
   1785  *
   1786  * Take a block of UTF-8 chars in and try to convert it to an ASCII
   1787  * plus HTML entities block of chars out.
   1788  *
   1789  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
   1790  * The value of @inlen after return is the number of octets consumed
   1791  *     as the return value is positive, else unpredictable.
   1792  * The value of @outlen after return is the number of octets consumed.
   1793  */
   1794 int
   1795 UTF8ToHtml(unsigned char* out, int *outlen,
   1796               const unsigned char* in, int *inlen) {
   1797     const unsigned char* processed = in;
   1798     const unsigned char* outend;
   1799     const unsigned char* outstart = out;
   1800     const unsigned char* instart = in;
   1801     const unsigned char* inend;
   1802     unsigned int c, d;
   1803     int trailing;
   1804 
   1805     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
   1806     if (in == NULL) {
   1807         /*
   1808 	 * initialization nothing to do
   1809 	 */
   1810 	*outlen = 0;
   1811 	*inlen = 0;
   1812 	return(0);
   1813     }
   1814     inend = in + (*inlen);
   1815     outend = out + (*outlen);
   1816     while (in < inend) {
   1817 	d = *in++;
   1818 	if      (d < 0x80)  { c= d; trailing= 0; }
   1819 	else if (d < 0xC0) {
   1820 	    /* trailing byte in leading position */
   1821 	    *outlen = out - outstart;
   1822 	    *inlen = processed - instart;
   1823 	    return(-2);
   1824         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
   1825         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
   1826         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
   1827 	else {
   1828 	    /* no chance for this in Ascii */
   1829 	    *outlen = out - outstart;
   1830 	    *inlen = processed - instart;
   1831 	    return(-2);
   1832 	}
   1833 
   1834 	if (inend - in < trailing) {
   1835 	    break;
   1836 	}
   1837 
   1838 	for ( ; trailing; trailing--) {
   1839 	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
   1840 		break;
   1841 	    c <<= 6;
   1842 	    c |= d & 0x3F;
   1843 	}
   1844 
   1845 	/* assertion: c is a single UTF-4 value */
   1846 	if (c < 0x80) {
   1847 	    if (out + 1 >= outend)
   1848 		break;
   1849 	    *out++ = c;
   1850 	} else {
   1851 	    int len;
   1852 	    const htmlEntityDesc * ent;
   1853 	    const char *cp;
   1854 	    char nbuf[16];
   1855 
   1856 	    /*
   1857 	     * Try to lookup a predefined HTML entity for it
   1858 	     */
   1859 
   1860 	    ent = htmlEntityValueLookup(c);
   1861 	    if (ent == NULL) {
   1862 	      snprintf(nbuf, sizeof(nbuf), "#%u", c);
   1863 	      cp = nbuf;
   1864 	    }
   1865 	    else
   1866 	      cp = ent->name;
   1867 	    len = strlen(cp);
   1868 	    if (out + 2 + len >= outend)
   1869 		break;
   1870 	    *out++ = '&';
   1871 	    memcpy(out, cp, len);
   1872 	    out += len;
   1873 	    *out++ = ';';
   1874 	}
   1875 	processed = in;
   1876     }
   1877     *outlen = out - outstart;
   1878     *inlen = processed - instart;
   1879     return(0);
   1880 }
   1881 
   1882 /**
   1883  * htmlEncodeEntities:
   1884  * @out:  a pointer to an array of bytes to store the result
   1885  * @outlen:  the length of @out
   1886  * @in:  a pointer to an array of UTF-8 chars
   1887  * @inlen:  the length of @in
   1888  * @quoteChar: the quote character to escape (' or ") or zero.
   1889  *
   1890  * Take a block of UTF-8 chars in and try to convert it to an ASCII
   1891  * plus HTML entities block of chars out.
   1892  *
   1893  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
   1894  * The value of @inlen after return is the number of octets consumed
   1895  *     as the return value is positive, else unpredictable.
   1896  * The value of @outlen after return is the number of octets consumed.
   1897  */
   1898 int
   1899 htmlEncodeEntities(unsigned char* out, int *outlen,
   1900 		   const unsigned char* in, int *inlen, int quoteChar) {
   1901     const unsigned char* processed = in;
   1902     const unsigned char* outend;
   1903     const unsigned char* outstart = out;
   1904     const unsigned char* instart = in;
   1905     const unsigned char* inend;
   1906     unsigned int c, d;
   1907     int trailing;
   1908 
   1909     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
   1910         return(-1);
   1911     outend = out + (*outlen);
   1912     inend = in + (*inlen);
   1913     while (in < inend) {
   1914 	d = *in++;
   1915 	if      (d < 0x80)  { c= d; trailing= 0; }
   1916 	else if (d < 0xC0) {
   1917 	    /* trailing byte in leading position */
   1918 	    *outlen = out - outstart;
   1919 	    *inlen = processed - instart;
   1920 	    return(-2);
   1921         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
   1922         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
   1923         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
   1924 	else {
   1925 	    /* no chance for this in Ascii */
   1926 	    *outlen = out - outstart;
   1927 	    *inlen = processed - instart;
   1928 	    return(-2);
   1929 	}
   1930 
   1931 	if (inend - in < trailing)
   1932 	    break;
   1933 
   1934 	while (trailing--) {
   1935 	    if (((d= *in++) & 0xC0) != 0x80) {
   1936 		*outlen = out - outstart;
   1937 		*inlen = processed - instart;
   1938 		return(-2);
   1939 	    }
   1940 	    c <<= 6;
   1941 	    c |= d & 0x3F;
   1942 	}
   1943 
   1944 	/* assertion: c is a single UTF-4 value */
   1945 	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
   1946 	    (c != '&') && (c != '<') && (c != '>')) {
   1947 	    if (out >= outend)
   1948 		break;
   1949 	    *out++ = c;
   1950 	} else {
   1951 	    const htmlEntityDesc * ent;
   1952 	    const char *cp;
   1953 	    char nbuf[16];
   1954 	    int len;
   1955 
   1956 	    /*
   1957 	     * Try to lookup a predefined HTML entity for it
   1958 	     */
   1959 	    ent = htmlEntityValueLookup(c);
   1960 	    if (ent == NULL) {
   1961 		snprintf(nbuf, sizeof(nbuf), "#%u", c);
   1962 		cp = nbuf;
   1963 	    }
   1964 	    else
   1965 		cp = ent->name;
   1966 	    len = strlen(cp);
   1967 	    if (out + 2 + len > outend)
   1968 		break;
   1969 	    *out++ = '&';
   1970 	    memcpy(out, cp, len);
   1971 	    out += len;
   1972 	    *out++ = ';';
   1973 	}
   1974 	processed = in;
   1975     }
   1976     *outlen = out - outstart;
   1977     *inlen = processed - instart;
   1978     return(0);
   1979 }
   1980 
   1981 /************************************************************************
   1982  *									*
   1983  *		Commodity functions to handle streams			*
   1984  *									*
   1985  ************************************************************************/
   1986 
   1987 /**
   1988  * htmlNewInputStream:
   1989  * @ctxt:  an HTML parser context
   1990  *
   1991  * Create a new input stream structure
   1992  * Returns the new input stream or NULL
   1993  */
   1994 static htmlParserInputPtr
   1995 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
   1996     htmlParserInputPtr input;
   1997 
   1998     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
   1999     if (input == NULL) {
   2000         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
   2001 	return(NULL);
   2002     }
   2003     memset(input, 0, sizeof(htmlParserInput));
   2004     input->filename = NULL;
   2005     input->directory = NULL;
   2006     input->base = NULL;
   2007     input->cur = NULL;
   2008     input->buf = NULL;
   2009     input->line = 1;
   2010     input->col = 1;
   2011     input->buf = NULL;
   2012     input->free = NULL;
   2013     input->version = NULL;
   2014     input->consumed = 0;
   2015     input->length = 0;
   2016     return(input);
   2017 }
   2018 
   2019 
   2020 /************************************************************************
   2021  *									*
   2022  *		Commodity functions, cleanup needed ?			*
   2023  *									*
   2024  ************************************************************************/
   2025 /*
   2026  * all tags allowing pc data from the html 4.01 loose dtd
   2027  * NOTE: it might be more apropriate to integrate this information
   2028  * into the html40ElementTable array but I don't want to risk any
   2029  * binary incomptibility
   2030  */
   2031 static const char *allowPCData[] = {
   2032     "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
   2033     "blockquote", "body", "button", "caption", "center", "cite", "code",
   2034     "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
   2035     "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
   2036     "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
   2037     "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
   2038 };
   2039 
   2040 /**
   2041  * areBlanks:
   2042  * @ctxt:  an HTML parser context
   2043  * @str:  a xmlChar *
   2044  * @len:  the size of @str
   2045  *
   2046  * Is this a sequence of blank chars that one can ignore ?
   2047  *
   2048  * Returns 1 if ignorable 0 otherwise.
   2049  */
   2050 
   2051 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
   2052     unsigned int i;
   2053     int j;
   2054     xmlNodePtr lastChild;
   2055     xmlDtdPtr dtd;
   2056 
   2057     for (j = 0;j < len;j++)
   2058         if (!(IS_BLANK_CH(str[j]))) return(0);
   2059 
   2060     if (CUR == 0) return(1);
   2061     if (CUR != '<') return(0);
   2062     if (ctxt->name == NULL)
   2063 	return(1);
   2064     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
   2065 	return(1);
   2066     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
   2067 	return(1);
   2068 
   2069     /* Only strip CDATA children of the body tag for strict HTML DTDs */
   2070     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
   2071         dtd = xmlGetIntSubset(ctxt->myDoc);
   2072         if (dtd != NULL && dtd->ExternalID != NULL) {
   2073             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
   2074                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
   2075                 return(1);
   2076         }
   2077     }
   2078 
   2079     if (ctxt->node == NULL) return(0);
   2080     lastChild = xmlGetLastChild(ctxt->node);
   2081     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
   2082 	lastChild = lastChild->prev;
   2083     if (lastChild == NULL) {
   2084         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
   2085             (ctxt->node->content != NULL)) return(0);
   2086 	/* keep ws in constructs like ...<b> </b>...
   2087 	   for all tags "b" allowing PCDATA */
   2088 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
   2089 	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
   2090 		return(0);
   2091 	    }
   2092 	}
   2093     } else if (xmlNodeIsText(lastChild)) {
   2094         return(0);
   2095     } else {
   2096 	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
   2097 	   for all tags "p" allowing PCDATA */
   2098 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
   2099 	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
   2100 		return(0);
   2101 	    }
   2102 	}
   2103     }
   2104     return(1);
   2105 }
   2106 
   2107 /**
   2108  * htmlNewDocNoDtD:
   2109  * @URI:  URI for the dtd, or NULL
   2110  * @ExternalID:  the external ID of the DTD, or NULL
   2111  *
   2112  * Creates a new HTML document without a DTD node if @URI and @ExternalID
   2113  * are NULL
   2114  *
   2115  * Returns a new document, do not initialize the DTD if not provided
   2116  */
   2117 htmlDocPtr
   2118 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
   2119     xmlDocPtr cur;
   2120 
   2121     /*
   2122      * Allocate a new document and fill the fields.
   2123      */
   2124     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
   2125     if (cur == NULL) {
   2126 	htmlErrMemory(NULL, "HTML document creation failed\n");
   2127 	return(NULL);
   2128     }
   2129     memset(cur, 0, sizeof(xmlDoc));
   2130 
   2131     cur->type = XML_HTML_DOCUMENT_NODE;
   2132     cur->version = NULL;
   2133     cur->intSubset = NULL;
   2134     cur->doc = cur;
   2135     cur->name = NULL;
   2136     cur->children = NULL;
   2137     cur->extSubset = NULL;
   2138     cur->oldNs = NULL;
   2139     cur->encoding = NULL;
   2140     cur->standalone = 1;
   2141     cur->compression = 0;
   2142     cur->ids = NULL;
   2143     cur->refs = NULL;
   2144     cur->_private = NULL;
   2145     cur->charset = XML_CHAR_ENCODING_UTF8;
   2146     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
   2147     if ((ExternalID != NULL) ||
   2148 	(URI != NULL))
   2149 	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
   2150     return(cur);
   2151 }
   2152 
   2153 /**
   2154  * htmlNewDoc:
   2155  * @URI:  URI for the dtd, or NULL
   2156  * @ExternalID:  the external ID of the DTD, or NULL
   2157  *
   2158  * Creates a new HTML document
   2159  *
   2160  * Returns a new document
   2161  */
   2162 htmlDocPtr
   2163 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
   2164     if ((URI == NULL) && (ExternalID == NULL))
   2165 	return(htmlNewDocNoDtD(
   2166 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
   2167 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
   2168 
   2169     return(htmlNewDocNoDtD(URI, ExternalID));
   2170 }
   2171 
   2172 
   2173 /************************************************************************
   2174  *									*
   2175  *			The parser itself				*
   2176  *	Relates to http://www.w3.org/TR/html40				*
   2177  *									*
   2178  ************************************************************************/
   2179 
   2180 /************************************************************************
   2181  *									*
   2182  *			The parser itself				*
   2183  *									*
   2184  ************************************************************************/
   2185 
/* Forward declaration: fallback used by htmlParseName(), defined after it. */
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
   2187 
   2188 /**
   2189  * htmlParseHTMLName:
   2190  * @ctxt:  an HTML parser context
   2191  *
   2192  * parse an HTML tag or attribute name, note that we convert it to lowercase
   2193  * since HTML names are not case-sensitive.
   2194  *
   2195  * Returns the Tag Name parsed or NULL
   2196  */
   2197 
   2198 static const xmlChar *
   2199 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
   2200     int i = 0;
   2201     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
   2202 
   2203     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
   2204         (CUR != ':')) return(NULL);
   2205 
   2206     while ((i < HTML_PARSER_BUFFER_SIZE) &&
   2207            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
   2208 	   (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
   2209 	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
   2210         else loc[i] = CUR;
   2211 	i++;
   2212 
   2213 	NEXT;
   2214     }
   2215 
   2216     return(xmlDictLookup(ctxt->dict, loc, i));
   2217 }
   2218 
   2219 
   2220 /**
   2221  * htmlParseHTMLName_nonInvasive:
   2222  * @ctxt:  an HTML parser context
   2223  *
   2224  * parse an HTML tag or attribute name, note that we convert it to lowercase
   2225  * since HTML names are not case-sensitive, this doesn't consume the data
   2226  * from the stream, it's a look-ahead
   2227  *
   2228  * Returns the Tag Name parsed or NULL
   2229  */
   2230 
   2231 static const xmlChar *
   2232 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
   2233     int i = 0;
   2234     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
   2235 
   2236     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
   2237         (NXT(1) != ':')) return(NULL);
   2238 
   2239     while ((i < HTML_PARSER_BUFFER_SIZE) &&
   2240            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
   2241 	   (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
   2242 	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
   2243         else loc[i] = NXT(1+i);
   2244 	i++;
   2245     }
   2246 
   2247     return(xmlDictLookup(ctxt->dict, loc, i));
   2248 }
   2249 
   2250 
   2251 /**
   2252  * htmlParseName:
   2253  * @ctxt:  an HTML parser context
   2254  *
   2255  * parse an HTML name, this routine is case sensitive.
   2256  *
   2257  * Returns the Name parsed or NULL
   2258  */
   2259 
   2260 static const xmlChar *
   2261 htmlParseName(htmlParserCtxtPtr ctxt) {
   2262     const xmlChar *in;
   2263     const xmlChar *ret;
   2264     int count = 0;
   2265 
   2266     GROW;
   2267 
   2268     /*
   2269      * Accelerator for simple ASCII names
   2270      */
   2271     in = ctxt->input->cur;
   2272     if (((*in >= 0x61) && (*in <= 0x7A)) ||
   2273 	((*in >= 0x41) && (*in <= 0x5A)) ||
   2274 	(*in == '_') || (*in == ':')) {
   2275 	in++;
   2276 	while (((*in >= 0x61) && (*in <= 0x7A)) ||
   2277 	       ((*in >= 0x41) && (*in <= 0x5A)) ||
   2278 	       ((*in >= 0x30) && (*in <= 0x39)) ||
   2279 	       (*in == '_') || (*in == '-') ||
   2280 	       (*in == ':') || (*in == '.'))
   2281 	    in++;
   2282 	if ((*in > 0) && (*in < 0x80)) {
   2283 	    count = in - ctxt->input->cur;
   2284 	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
   2285 	    ctxt->input->cur = in;
   2286 	    ctxt->nbChars += count;
   2287 	    ctxt->input->col += count;
   2288 	    return(ret);
   2289 	}
   2290     }
   2291     return(htmlParseNameComplex(ctxt));
   2292 }
   2293 
   2294 static const xmlChar *
   2295 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
   2296     int len = 0, l;
   2297     int c;
   2298     int count = 0;
   2299 
   2300     /*
   2301      * Handler for more complex cases
   2302      */
   2303     GROW;
   2304     c = CUR_CHAR(l);
   2305     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
   2306 	(!IS_LETTER(c) && (c != '_') &&
   2307          (c != ':'))) {
   2308 	return(NULL);
   2309     }
   2310 
   2311     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
   2312 	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
   2313             (c == '.') || (c == '-') ||
   2314 	    (c == '_') || (c == ':') ||
   2315 	    (IS_COMBINING(c)) ||
   2316 	    (IS_EXTENDER(c)))) {
   2317 	if (count++ > 100) {
   2318 	    count = 0;
   2319 	    GROW;
   2320 	}
   2321 	len += l;
   2322 	NEXTL(l);
   2323 	c = CUR_CHAR(l);
   2324     }
   2325     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
   2326 }
   2327 
   2328 
/**
 * htmlParseHTMLAttribute:
 * @ctxt:  an HTML parser context
 * @stop:  a char stop value
 *
 * parse an HTML attribute value till the stop (quote), if
 * stop is 0 then it stops at the first blank or at '>'.
 * The stop character itself is not consumed.
 *
 * Character references (&#...;) and known entity references are
 * replaced by their UTF-8 encoding; an '&' that does not introduce a
 * name, or a name that is not a known entity, is copied literally.
 *
 * Returns the attribute parsed in a newly allocated, NUL-terminated
 * buffer, or NULL if the initial allocation fails.
 * NOTE(review): the buffer comes from xmlMallocAtomic(), so presumably
 * the caller releases it with xmlFree() - confirm against callers.
 */

static xmlChar *
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
    xmlChar *buffer = NULL;
    int buffer_size = 0;
    xmlChar *out = NULL;	/* write cursor inside buffer */
    const xmlChar *name = NULL;
    const xmlChar *cur = NULL;
    const htmlEntityDesc * ent;

    /*
     * allocate a translation buffer.
     */
    buffer_size = HTML_PARSER_BUFFER_SIZE;
    buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
    if (buffer == NULL) {
	htmlErrMemory(ctxt, "buffer allocation failed\n");
	return(NULL);
    }
    out = buffer;

    /*
     * Ok loop until we reach one of the ending chars
     *
     * Every branch below keeps at least 100 bytes of slack in the
     * buffer (see the "buffer_size - 100" tests), which is why a few
     * bytes can be written before or after the size check.
     * growBuffer() is a macro defined outside this function;
     * presumably it enlarges `buffer` and updates `buffer_size`, hence
     * `out` is recomputed from its index after each call - confirm
     * against the macro definition.
     */
    while ((CUR != 0) && (CUR != stop)) {
	/* Unquoted value: '>' or any blank terminates it. */
	if ((stop == 0) && (CUR == '>')) break;
	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
        if (CUR == '&') {
	    if (NXT(1) == '#') {
		/* Numeric character reference: emit it as UTF-8. */
		unsigned int c;
		int bits;

		/*
		 * NOTE(review): htmlParseCharRef() returns 0 for an
		 * invalid reference, in which case a NUL byte is stored
		 * in the middle of the value.
		 */
		c = htmlParseCharRef(ctxt);
		/* Lead byte; `bits` is the shift of the first trail byte. */
		if      (c <    0x80)
		        { *out++  = c;                bits= -6; }
		else if (c <   0x800)
		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
		else if (c < 0x10000)
		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
		else
		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }

		/* Continuation bytes, six payload bits each. */
		for ( ; bits >= 0; bits-= 6) {
		    *out++  = ((c >> bits) & 0x3F) | 0x80;
		}

		if (out - buffer > buffer_size - 100) {
			int indx = out - buffer;

			growBuffer(buffer);
			out = &buffer[indx];
		}
	    } else {
		ent = htmlParseEntityRef(ctxt, &name);
		if (name == NULL) {
		    /* '&' not followed by a name: keep it literally. */
		    *out++ = '&';
		    if (out - buffer > buffer_size - 100) {
			int indx = out - buffer;

			growBuffer(buffer);
			out = &buffer[indx];
		    }
		} else if (ent == NULL) {
		    /* No entity returned for this name: copy "&name" verbatim. */
		    *out++ = '&';
		    cur = name;
		    while (*cur != 0) {
			if (out - buffer > buffer_size - 100) {
			    int indx = out - buffer;

			    growBuffer(buffer);
			    out = &buffer[indx];
			}
			*out++ = *cur++;
		    }
		} else {
		    /* Known entity: emit its code point as UTF-8. */
		    unsigned int c;
		    int bits;

		    if (out - buffer > buffer_size - 100) {
			int indx = out - buffer;

			growBuffer(buffer);
			out = &buffer[indx];
		    }
		    c = ent->value;
		    if      (c <    0x80)
			{ *out++  = c;                bits= -6; }
		    else if (c <   0x800)
			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
		    else if (c < 0x10000)
			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
		    else
			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }

		    for ( ; bits >= 0; bits-= 6) {
			*out++  = ((c >> bits) & 0x3F) | 0x80;
		    }
		}
	    }
	} else {
	    /* Ordinary character: decode it and re-encode as UTF-8. */
	    unsigned int c;
	    int bits, l;

	    if (out - buffer > buffer_size - 100) {
		int indx = out - buffer;

		growBuffer(buffer);
		out = &buffer[indx];
	    }
	    c = CUR_CHAR(l);
	    if      (c <    0x80)
		    { *out++  = c;                bits= -6; }
	    else if (c <   0x800)
		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
	    else if (c < 0x10000)
		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
	    else
		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }

	    for ( ; bits >= 0; bits-= 6) {
		*out++  = ((c >> bits) & 0x3F) | 0x80;
	    }
	    NEXT;
	}
    }
    /* Terminate the value; the slack guarantees room for this byte. */
    *out++ = 0;
    return(buffer);
}
   2467 
   2468 /**
   2469  * htmlParseEntityRef:
   2470  * @ctxt:  an HTML parser context
   2471  * @str:  location to store the entity name
   2472  *
   2473  * parse an HTML ENTITY references
   2474  *
   2475  * [68] EntityRef ::= '&' Name ';'
   2476  *
   2477  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
   2478  *         if non-NULL *str will have to be freed by the caller.
   2479  */
   2480 const htmlEntityDesc *
   2481 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
   2482     const xmlChar *name;
   2483     const htmlEntityDesc * ent = NULL;
   2484 
   2485     if (str != NULL) *str = NULL;
   2486     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
   2487 
   2488     if (CUR == '&') {
   2489         NEXT;
   2490         name = htmlParseName(ctxt);
   2491 	if (name == NULL) {
   2492 	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   2493 	                 "htmlParseEntityRef: no name\n", NULL, NULL);
   2494 	} else {
   2495 	    GROW;
   2496 	    if (CUR == ';') {
   2497 	        if (str != NULL)
   2498 		    *str = name;
   2499 
   2500 		/*
   2501 		 * Lookup the entity in the table.
   2502 		 */
   2503 		ent = htmlEntityLookup(name);
   2504 		if (ent != NULL) /* OK that's ugly !!! */
   2505 		    NEXT;
   2506 	    } else {
   2507 		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
   2508 		             "htmlParseEntityRef: expecting ';'\n",
   2509 			     NULL, NULL);
   2510 	        if (str != NULL)
   2511 		    *str = name;
   2512 	    }
   2513 	}
   2514     }
   2515     return(ent);
   2516 }
   2517 
   2518 /**
   2519  * htmlParseAttValue:
   2520  * @ctxt:  an HTML parser context
   2521  *
   2522  * parse a value for an attribute
   2523  * Note: the parser won't do substitution of entities here, this
   2524  * will be handled later in xmlStringGetNodeList, unless it was
   2525  * asked for ctxt->replaceEntities != 0
   2526  *
   2527  * Returns the AttValue parsed or NULL.
   2528  */
   2529 
   2530 static xmlChar *
   2531 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
   2532     xmlChar *ret = NULL;
   2533 
   2534     if (CUR == '"') {
   2535         NEXT;
   2536 	ret = htmlParseHTMLAttribute(ctxt, '"');
   2537         if (CUR != '"') {
   2538 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
   2539 	                 "AttValue: \" expected\n", NULL, NULL);
   2540 	} else
   2541 	    NEXT;
   2542     } else if (CUR == '\'') {
   2543         NEXT;
   2544 	ret = htmlParseHTMLAttribute(ctxt, '\'');
   2545         if (CUR != '\'') {
   2546 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
   2547 	                 "AttValue: ' expected\n", NULL, NULL);
   2548 	} else
   2549 	    NEXT;
   2550     } else {
   2551         /*
   2552 	 * That's an HTMLism, the attribute value may not be quoted
   2553 	 */
   2554 	ret = htmlParseHTMLAttribute(ctxt, 0);
   2555 	if (ret == NULL) {
   2556 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
   2557 	                 "AttValue: no value found\n", NULL, NULL);
   2558 	}
   2559     }
   2560     return(ret);
   2561 }
   2562 
   2563 /**
   2564  * htmlParseSystemLiteral:
   2565  * @ctxt:  an HTML parser context
   2566  *
   2567  * parse an HTML Literal
   2568  *
   2569  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
   2570  *
   2571  * Returns the SystemLiteral parsed or NULL
   2572  */
   2573 
   2574 static xmlChar *
   2575 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
   2576     const xmlChar *q;
   2577     xmlChar *ret = NULL;
   2578 
   2579     if (CUR == '"') {
   2580         NEXT;
   2581 	q = CUR_PTR;
   2582 	while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
   2583 	    NEXT;
   2584 	if (!IS_CHAR_CH(CUR)) {
   2585 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
   2586 			 "Unfinished SystemLiteral\n", NULL, NULL);
   2587 	} else {
   2588 	    ret = xmlStrndup(q, CUR_PTR - q);
   2589 	    NEXT;
   2590         }
   2591     } else if (CUR == '\'') {
   2592         NEXT;
   2593 	q = CUR_PTR;
   2594 	while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
   2595 	    NEXT;
   2596 	if (!IS_CHAR_CH(CUR)) {
   2597 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
   2598 			 "Unfinished SystemLiteral\n", NULL, NULL);
   2599 	} else {
   2600 	    ret = xmlStrndup(q, CUR_PTR - q);
   2601 	    NEXT;
   2602         }
   2603     } else {
   2604 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
   2605 	             " or ' expected\n", NULL, NULL);
   2606     }
   2607 
   2608     return(ret);
   2609 }
   2610 
   2611 /**
   2612  * htmlParsePubidLiteral:
   2613  * @ctxt:  an HTML parser context
   2614  *
   2615  * parse an HTML public literal
   2616  *
   2617  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
   2618  *
   2619  * Returns the PubidLiteral parsed or NULL.
   2620  */
   2621 
   2622 static xmlChar *
   2623 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
   2624     const xmlChar *q;
   2625     xmlChar *ret = NULL;
   2626     /*
   2627      * Name ::= (Letter | '_') (NameChar)*
   2628      */
   2629     if (CUR == '"') {
   2630         NEXT;
   2631 	q = CUR_PTR;
   2632 	while (IS_PUBIDCHAR_CH(CUR)) NEXT;
   2633 	if (CUR != '"') {
   2634 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
   2635 	                 "Unfinished PubidLiteral\n", NULL, NULL);
   2636 	} else {
   2637 	    ret = xmlStrndup(q, CUR_PTR - q);
   2638 	    NEXT;
   2639 	}
   2640     } else if (CUR == '\'') {
   2641         NEXT;
   2642 	q = CUR_PTR;
   2643 	while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
   2644 	    NEXT;
   2645 	if (CUR != '\'') {
   2646 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
   2647 	                 "Unfinished PubidLiteral\n", NULL, NULL);
   2648 	} else {
   2649 	    ret = xmlStrndup(q, CUR_PTR - q);
   2650 	    NEXT;
   2651 	}
   2652     } else {
   2653 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
   2654 	             "PubidLiteral \" or ' expected\n", NULL, NULL);
   2655     }
   2656 
   2657     return(ret);
   2658 }
   2659 
   2660 /**
   2661  * htmlParseScript:
   2662  * @ctxt:  an HTML parser context
   2663  *
   2664  * parse the content of an HTML SCRIPT or STYLE element
   2665  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
   2666  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
   2667  * http://www.w3.org/TR/html4/types.html#type-script
   2668  * http://www.w3.org/TR/html4/types.html#h-6.15
   2669  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
   2670  *
   2671  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
   2672  * element and the value of intrinsic event attributes. User agents must
   2673  * not evaluate script data as HTML markup but instead must pass it on as
   2674  * data to a script engine.
   2675  * NOTES:
   2676  * - The content is passed like CDATA
   2677  * - the attributes for style and scripting "onXXX" are also described
   2678  *   as CDATA but SGML allows entities references in attributes so their
   2679  *   processing is identical as other attributes
   2680  */
   2681 static void
   2682 htmlParseScript(htmlParserCtxtPtr ctxt) {
   2683     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
   2684     int nbchar = 0;
   2685     int cur,l;
   2686 
   2687     SHRINK;
   2688     cur = CUR_CHAR(l);
   2689     while (IS_CHAR_CH(cur)) {
   2690 	if ((cur == '<') && (NXT(1) == '/')) {
   2691             /*
   2692              * One should break here, the specification is clear:
   2693              * Authors should therefore escape "</" within the content.
   2694              * Escape mechanisms are specific to each scripting or
   2695              * style sheet language.
   2696              *
   2697              * In recovery mode, only break if end tag match the
   2698              * current tag, effectively ignoring all tags inside the
   2699              * script/style block and treating the entire block as
   2700              * CDATA.
   2701              */
   2702             if (ctxt->recovery) {
   2703                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
   2704 				   xmlStrlen(ctxt->name)) == 0)
   2705                 {
   2706                     break; /* while */
   2707                 } else {
   2708 		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
   2709 				 "Element %s embeds close tag\n",
   2710 		                 ctxt->name, NULL);
   2711 		}
   2712             } else {
   2713                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
   2714                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
   2715                 {
   2716                     break; /* while */
   2717                 }
   2718             }
   2719 	}
   2720 	COPY_BUF(l,buf,nbchar,cur);
   2721 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
   2722 	    if (ctxt->sax->cdataBlock!= NULL) {
   2723 		/*
   2724 		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
   2725 		 */
   2726 		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
   2727 	    } else if (ctxt->sax->characters != NULL) {
   2728 		ctxt->sax->characters(ctxt->userData, buf, nbchar);
   2729 	    }
   2730 	    nbchar = 0;
   2731 	}
   2732 	GROW;
   2733 	NEXTL(l);
   2734 	cur = CUR_CHAR(l);
   2735     }
   2736 
   2737     if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
   2738 	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
   2739 	                "Invalid char in CDATA 0x%X\n", cur);
   2740 	NEXT;
   2741     }
   2742 
   2743     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
   2744 	if (ctxt->sax->cdataBlock!= NULL) {
   2745 	    /*
   2746 	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
   2747 	     */
   2748 	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
   2749 	} else if (ctxt->sax->characters != NULL) {
   2750 	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
   2751 	}
   2752     }
   2753 }
   2754 
   2755 
   2756 /**
   2757  * htmlParseCharData:
   2758  * @ctxt:  an HTML parser context
   2759  *
   2760  * parse a CharData section.
   2761  * if we are within a CDATA section ']]>' marks an end of section.
   2762  *
   2763  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
   2764  */
   2765 
   2766 static void
   2767 htmlParseCharData(htmlParserCtxtPtr ctxt) {
   2768     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
   2769     int nbchar = 0;
   2770     int cur, l;
   2771     int chunk = 0;
   2772 
   2773     SHRINK;
   2774     cur = CUR_CHAR(l);
   2775     while (((cur != '<') || (ctxt->token == '<')) &&
   2776            ((cur != '&') || (ctxt->token == '&')) &&
   2777 	   (cur != 0)) {
   2778 	if (!(IS_CHAR(cur))) {
   2779 	    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
   2780 	                "Invalid char in CDATA 0x%X\n", cur);
   2781 	} else {
   2782 	    COPY_BUF(l,buf,nbchar,cur);
   2783 	}
   2784 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
   2785 	    /*
   2786 	     * Ok the segment is to be consumed as chars.
   2787 	     */
   2788 	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
   2789 		if (areBlanks(ctxt, buf, nbchar)) {
   2790 		    if (ctxt->sax->ignorableWhitespace != NULL)
   2791 			ctxt->sax->ignorableWhitespace(ctxt->userData,
   2792 			                               buf, nbchar);
   2793 		} else {
   2794 		    htmlCheckParagraph(ctxt);
   2795 		    if (ctxt->sax->characters != NULL)
   2796 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
   2797 		}
   2798 	    }
   2799 	    nbchar = 0;
   2800 	}
   2801 	NEXTL(l);
   2802         chunk++;
   2803         if (chunk > HTML_PARSER_BUFFER_SIZE) {
   2804             chunk = 0;
   2805             SHRINK;
   2806             GROW;
   2807         }
   2808 	cur = CUR_CHAR(l);
   2809 	if (cur == 0) {
   2810 	    SHRINK;
   2811 	    GROW;
   2812 	    cur = CUR_CHAR(l);
   2813 	}
   2814     }
   2815     if (nbchar != 0) {
   2816         buf[nbchar] = 0;
   2817 
   2818 	/*
   2819 	 * Ok the segment is to be consumed as chars.
   2820 	 */
   2821 	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
   2822 	    if (areBlanks(ctxt, buf, nbchar)) {
   2823 		if (ctxt->sax->ignorableWhitespace != NULL)
   2824 		    ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
   2825 	    } else {
   2826 		htmlCheckParagraph(ctxt);
   2827 		if (ctxt->sax->characters != NULL)
   2828 		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
   2829 	    }
   2830 	}
   2831     } else {
   2832 	/*
   2833 	 * Loop detection
   2834 	 */
   2835 	if (cur == 0)
   2836 	    ctxt->instate = XML_PARSER_EOF;
   2837     }
   2838 }
   2839 
   2840 /**
   2841  * htmlParseExternalID:
   2842  * @ctxt:  an HTML parser context
   2843  * @publicID:  a xmlChar** receiving PubidLiteral
   2844  *
   2845  * Parse an External ID or a Public ID
   2846  *
   2847  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
   2848  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
   2849  *
   2850  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
   2851  *
   2852  * Returns the function returns SystemLiteral and in the second
   2853  *                case publicID receives PubidLiteral, is strict is off
   2854  *                it is possible to return NULL and have publicID set.
   2855  */
   2856 
   2857 static xmlChar *
   2858 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
   2859     xmlChar *URI = NULL;
   2860 
   2861     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
   2862          (UPP(2) == 'S') && (UPP(3) == 'T') &&
   2863 	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
   2864         SKIP(6);
   2865 	if (!IS_BLANK_CH(CUR)) {
   2866 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
   2867 	                 "Space required after 'SYSTEM'\n", NULL, NULL);
   2868 	}
   2869         SKIP_BLANKS;
   2870 	URI = htmlParseSystemLiteral(ctxt);
   2871 	if (URI == NULL) {
   2872 	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
   2873 	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
   2874         }
   2875     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
   2876 	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
   2877 	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
   2878         SKIP(6);
   2879 	if (!IS_BLANK_CH(CUR)) {
   2880 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
   2881 	                 "Space required after 'PUBLIC'\n", NULL, NULL);
   2882 	}
   2883         SKIP_BLANKS;
   2884 	*publicID = htmlParsePubidLiteral(ctxt);
   2885 	if (*publicID == NULL) {
   2886 	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
   2887 	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
   2888 			 NULL, NULL);
   2889 	}
   2890         SKIP_BLANKS;
   2891         if ((CUR == '"') || (CUR == '\'')) {
   2892 	    URI = htmlParseSystemLiteral(ctxt);
   2893 	}
   2894     }
   2895     return(URI);
   2896 }
   2897 
   2898 /**
   2899  * xmlParsePI:
   2900  * @ctxt:  an XML parser context
   2901  *
   2902  * parse an XML Processing Instruction.
   2903  *
   2904  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
   2905  */
   2906 static void
   2907 htmlParsePI(htmlParserCtxtPtr ctxt) {
   2908     xmlChar *buf = NULL;
   2909     int len = 0;
   2910     int size = HTML_PARSER_BUFFER_SIZE;
   2911     int cur, l;
   2912     const xmlChar *target;
   2913     xmlParserInputState state;
   2914     int count = 0;
   2915 
   2916     if ((RAW == '<') && (NXT(1) == '?')) {
   2917 	state = ctxt->instate;
   2918         ctxt->instate = XML_PARSER_PI;
   2919 	/*
   2920 	 * this is a Processing Instruction.
   2921 	 */
   2922 	SKIP(2);
   2923 	SHRINK;
   2924 
   2925 	/*
   2926 	 * Parse the target name and check for special support like
   2927 	 * namespace.
   2928 	 */
   2929         target = htmlParseName(ctxt);
   2930 	if (target != NULL) {
   2931 	    if (RAW == '>') {
   2932 		SKIP(1);
   2933 
   2934 		/*
   2935 		 * SAX: PI detected.
   2936 		 */
   2937 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
   2938 		    (ctxt->sax->processingInstruction != NULL))
   2939 		    ctxt->sax->processingInstruction(ctxt->userData,
   2940 		                                     target, NULL);
   2941 		ctxt->instate = state;
   2942 		return;
   2943 	    }
   2944 	    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
   2945 	    if (buf == NULL) {
   2946 		htmlErrMemory(ctxt, NULL);
   2947 		ctxt->instate = state;
   2948 		return;
   2949 	    }
   2950 	    cur = CUR;
   2951 	    if (!IS_BLANK(cur)) {
   2952 		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
   2953 			  "ParsePI: PI %s space expected\n", target, NULL);
   2954 	    }
   2955             SKIP_BLANKS;
   2956 	    cur = CUR_CHAR(l);
   2957 	    while (IS_CHAR(cur) && (cur != '>')) {
   2958 		if (len + 5 >= size) {
   2959 		    xmlChar *tmp;
   2960 
   2961 		    size *= 2;
   2962 		    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
   2963 		    if (tmp == NULL) {
   2964 			htmlErrMemory(ctxt, NULL);
   2965 			xmlFree(buf);
   2966 			ctxt->instate = state;
   2967 			return;
   2968 		    }
   2969 		    buf = tmp;
   2970 		}
   2971 		count++;
   2972 		if (count > 50) {
   2973 		    GROW;
   2974 		    count = 0;
   2975 		}
   2976 		COPY_BUF(l,buf,len,cur);
   2977 		NEXTL(l);
   2978 		cur = CUR_CHAR(l);
   2979 		if (cur == 0) {
   2980 		    SHRINK;
   2981 		    GROW;
   2982 		    cur = CUR_CHAR(l);
   2983 		}
   2984 	    }
   2985 	    buf[len] = 0;
   2986 	    if (cur != '>') {
   2987 		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
   2988 		      "ParsePI: PI %s never end ...\n", target, NULL);
   2989 	    } else {
   2990 		SKIP(1);
   2991 
   2992 		/*
   2993 		 * SAX: PI detected.
   2994 		 */
   2995 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
   2996 		    (ctxt->sax->processingInstruction != NULL))
   2997 		    ctxt->sax->processingInstruction(ctxt->userData,
   2998 		                                     target, buf);
   2999 	    }
   3000 	    xmlFree(buf);
   3001 	} else {
   3002 	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
   3003                          "PI is not started correctly", NULL, NULL);
   3004 	}
   3005 	ctxt->instate = state;
   3006     }
   3007 }
   3008 
   3009 /**
   3010  * htmlParseComment:
   3011  * @ctxt:  an HTML parser context
   3012  *
   3013  * Parse an XML (SGML) comment <!-- .... -->
   3014  *
   3015  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
   3016  */
   3017 static void
   3018 htmlParseComment(htmlParserCtxtPtr ctxt) {
   3019     xmlChar *buf = NULL;
   3020     int len;
   3021     int size = HTML_PARSER_BUFFER_SIZE;
   3022     int q, ql;
   3023     int r, rl;
   3024     int cur, l;
   3025     xmlParserInputState state;
   3026 
   3027     /*
   3028      * Check that there is a comment right here.
   3029      */
   3030     if ((RAW != '<') || (NXT(1) != '!') ||
   3031         (NXT(2) != '-') || (NXT(3) != '-')) return;
   3032 
   3033     state = ctxt->instate;
   3034     ctxt->instate = XML_PARSER_COMMENT;
   3035     SHRINK;
   3036     SKIP(4);
   3037     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
   3038     if (buf == NULL) {
   3039         htmlErrMemory(ctxt, "buffer allocation failed\n");
   3040 	ctxt->instate = state;
   3041 	return;
   3042     }
   3043     q = CUR_CHAR(ql);
   3044     NEXTL(ql);
   3045     r = CUR_CHAR(rl);
   3046     NEXTL(rl);
   3047     cur = CUR_CHAR(l);
   3048     len = 0;
   3049     while (IS_CHAR(cur) &&
   3050            ((cur != '>') ||
   3051 	    (r != '-') || (q != '-'))) {
   3052 	if (len + 5 >= size) {
   3053 	    xmlChar *tmp;
   3054 
   3055 	    size *= 2;
   3056 	    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
   3057 	    if (tmp == NULL) {
   3058 	        xmlFree(buf);
   3059 	        htmlErrMemory(ctxt, "growing buffer failed\n");
   3060 		ctxt->instate = state;
   3061 		return;
   3062 	    }
   3063 	    buf = tmp;
   3064 	}
   3065 	COPY_BUF(ql,buf,len,q);
   3066 	q = r;
   3067 	ql = rl;
   3068 	r = cur;
   3069 	rl = l;
   3070 	NEXTL(l);
   3071 	cur = CUR_CHAR(l);
   3072 	if (cur == 0) {
   3073 	    SHRINK;
   3074 	    GROW;
   3075 	    cur = CUR_CHAR(l);
   3076 	}
   3077     }
   3078     buf[len] = 0;
   3079     if (!IS_CHAR(cur)) {
   3080 	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
   3081 	             "Comment not terminated \n<!--%.50s\n", buf, NULL);
   3082 	xmlFree(buf);
   3083     } else {
   3084         NEXT;
   3085 	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
   3086 	    (!ctxt->disableSAX))
   3087 	    ctxt->sax->comment(ctxt->userData, buf);
   3088 	xmlFree(buf);
   3089     }
   3090     ctxt->instate = state;
   3091 }
   3092 
   3093 /**
   3094  * htmlParseCharRef:
   3095  * @ctxt:  an HTML parser context
   3096  *
   3097  * parse Reference declarations
   3098  *
   3099  * [66] CharRef ::= '&#' [0-9]+ ';' |
   3100  *                  '&#x' [0-9a-fA-F]+ ';'
   3101  *
   3102  * Returns the value parsed (as an int)
   3103  */
   3104 int
   3105 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
   3106     int val = 0;
   3107 
   3108     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   3109 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   3110 		     "htmlParseCharRef: context error\n",
   3111 		     NULL, NULL);
   3112         return(0);
   3113     }
   3114     if ((CUR == '&') && (NXT(1) == '#') &&
   3115         ((NXT(2) == 'x') || NXT(2) == 'X')) {
   3116 	SKIP(3);
   3117 	while (CUR != ';') {
   3118 	    if ((CUR >= '0') && (CUR <= '9'))
   3119 	        val = val * 16 + (CUR - '0');
   3120 	    else if ((CUR >= 'a') && (CUR <= 'f'))
   3121 	        val = val * 16 + (CUR - 'a') + 10;
   3122 	    else if ((CUR >= 'A') && (CUR <= 'F'))
   3123 	        val = val * 16 + (CUR - 'A') + 10;
   3124 	    else {
   3125 	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
   3126 		             "htmlParseCharRef: missing semicolumn\n",
   3127 			     NULL, NULL);
   3128 		break;
   3129 	    }
   3130 	    NEXT;
   3131 	}
   3132 	if (CUR == ';')
   3133 	    NEXT;
   3134     } else if  ((CUR == '&') && (NXT(1) == '#')) {
   3135 	SKIP(2);
   3136 	while (CUR != ';') {
   3137 	    if ((CUR >= '0') && (CUR <= '9'))
   3138 	        val = val * 10 + (CUR - '0');
   3139 	    else {
   3140 	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
   3141 		             "htmlParseCharRef: missing semicolumn\n",
   3142 			     NULL, NULL);
   3143 		break;
   3144 	    }
   3145 	    NEXT;
   3146 	}
   3147 	if (CUR == ';')
   3148 	    NEXT;
   3149     } else {
   3150 	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
   3151 	             "htmlParseCharRef: invalid value\n", NULL, NULL);
   3152     }
   3153     /*
   3154      * Check the value IS_CHAR ...
   3155      */
   3156     if (IS_CHAR(val)) {
   3157         return(val);
   3158     } else {
   3159 	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
   3160 			"htmlParseCharRef: invalid xmlChar value %d\n",
   3161 			val);
   3162     }
   3163     return(0);
   3164 }
   3165 
   3166 
   3167 /**
   3168  * htmlParseDocTypeDecl:
   3169  * @ctxt:  an HTML parser context
   3170  *
   3171  * parse a DOCTYPE declaration
   3172  *
   3173  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
   3174  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
   3175  */
   3176 
   3177 static void
   3178 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
   3179     const xmlChar *name;
   3180     xmlChar *ExternalID = NULL;
   3181     xmlChar *URI = NULL;
   3182 
   3183     /*
   3184      * We know that '<!DOCTYPE' has been detected.
   3185      */
   3186     SKIP(9);
   3187 
   3188     SKIP_BLANKS;
   3189 
   3190     /*
   3191      * Parse the DOCTYPE name.
   3192      */
   3193     name = htmlParseName(ctxt);
   3194     if (name == NULL) {
   3195 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   3196 	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
   3197 		     NULL, NULL);
   3198     }
   3199     /*
   3200      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
   3201      */
   3202 
   3203     SKIP_BLANKS;
   3204 
   3205     /*
   3206      * Check for SystemID and ExternalID
   3207      */
   3208     URI = htmlParseExternalID(ctxt, &ExternalID);
   3209     SKIP_BLANKS;
   3210 
   3211     /*
   3212      * We should be at the end of the DOCTYPE declaration.
   3213      */
   3214     if (CUR != '>') {
   3215 	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
   3216 	             "DOCTYPE improperly terminated\n", NULL, NULL);
   3217         /* We shouldn't try to resynchronize ... */
   3218     }
   3219     NEXT;
   3220 
   3221     /*
   3222      * Create or update the document accordingly to the DOCTYPE
   3223      */
   3224     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
   3225 	(!ctxt->disableSAX))
   3226 	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
   3227 
   3228     /*
   3229      * Cleanup, since we don't use all those identifiers
   3230      */
   3231     if (URI != NULL) xmlFree(URI);
   3232     if (ExternalID != NULL) xmlFree(ExternalID);
   3233 }
   3234 
   3235 /**
   3236  * htmlParseAttribute:
   3237  * @ctxt:  an HTML parser context
   3238  * @value:  a xmlChar ** used to store the value of the attribute
   3239  *
   3240  * parse an attribute
   3241  *
   3242  * [41] Attribute ::= Name Eq AttValue
   3243  *
   3244  * [25] Eq ::= S? '=' S?
   3245  *
   3246  * With namespace:
   3247  *
   3248  * [NS 11] Attribute ::= QName Eq AttValue
   3249  *
   3250  * Also the case QName == xmlns:??? is handled independently as a namespace
   3251  * definition.
   3252  *
   3253  * Returns the attribute name, and the value in *value.
   3254  */
   3255 
   3256 static const xmlChar *
   3257 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
   3258     const xmlChar *name;
   3259     xmlChar *val = NULL;
   3260 
   3261     *value = NULL;
   3262     name = htmlParseHTMLName(ctxt);
   3263     if (name == NULL) {
   3264 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   3265 	             "error parsing attribute name\n", NULL, NULL);
   3266         return(NULL);
   3267     }
   3268 
   3269     /*
   3270      * read the value
   3271      */
   3272     SKIP_BLANKS;
   3273     if (CUR == '=') {
   3274         NEXT;
   3275 	SKIP_BLANKS;
   3276 	val = htmlParseAttValue(ctxt);
   3277     } else if (htmlIsBooleanAttr(name)) {
   3278         /*
   3279 	 * assume a minimized attribute
   3280 	 */
   3281 	val = xmlStrdup(name);
   3282     }
   3283 
   3284     *value = val;
   3285     return(name);
   3286 }
   3287 
   3288 /**
   3289  * htmlCheckEncoding:
   3290  * @ctxt:  an HTML parser context
   3291  * @attvalue: the attribute value
   3292  *
   3293  * Checks an http-equiv attribute from a Meta tag to detect
   3294  * the encoding
   3295  * If a new encoding is detected the parser is switched to decode
   3296  * it and pass UTF8
   3297  */
   3298 static void
   3299 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
   3300     const xmlChar *encoding;
   3301 
   3302     if ((ctxt == NULL) || (attvalue == NULL))
   3303 	return;
   3304 
   3305     /* do not change encoding */
   3306     if (ctxt->input->encoding != NULL)
   3307         return;
   3308 
   3309     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
   3310     if (encoding != NULL) {
   3311 	encoding += 8;
   3312     } else {
   3313 	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
   3314 	if (encoding != NULL)
   3315 	    encoding += 9;
   3316     }
   3317     if (encoding != NULL) {
   3318 	xmlCharEncoding enc;
   3319 	xmlCharEncodingHandlerPtr handler;
   3320 
   3321 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
   3322 
   3323 	if (ctxt->input->encoding != NULL)
   3324 	    xmlFree((xmlChar *) ctxt->input->encoding);
   3325 	ctxt->input->encoding = xmlStrdup(encoding);
   3326 
   3327 	enc = xmlParseCharEncoding((const char *) encoding);
   3328 	/*
   3329 	 * registered set of known encodings
   3330 	 */
   3331 	if (enc != XML_CHAR_ENCODING_ERROR) {
   3332 	    if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
   3333 	         (enc == XML_CHAR_ENCODING_UTF16BE) ||
   3334 		 (enc == XML_CHAR_ENCODING_UCS4LE) ||
   3335 		 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
   3336 		(ctxt->input->buf != NULL) &&
   3337 		(ctxt->input->buf->encoder == NULL)) {
   3338 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
   3339 		             "htmlCheckEncoding: wrong encoding meta\n",
   3340 			     NULL, NULL);
   3341 	    } else {
   3342 		xmlSwitchEncoding(ctxt, enc);
   3343 	    }
   3344 	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
   3345 	} else {
   3346 	    /*
   3347 	     * fallback for unknown encodings
   3348 	     */
   3349 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
   3350 	    if (handler != NULL) {
   3351 		xmlSwitchToEncoding(ctxt, handler);
   3352 		ctxt->charset = XML_CHAR_ENCODING_UTF8;
   3353 	    } else {
   3354 		ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
   3355 	    }
   3356 	}
   3357 
   3358 	if ((ctxt->input->buf != NULL) &&
   3359 	    (ctxt->input->buf->encoder != NULL) &&
   3360 	    (ctxt->input->buf->raw != NULL) &&
   3361 	    (ctxt->input->buf->buffer != NULL)) {
   3362 	    int nbchars;
   3363 	    int processed;
   3364 
   3365 	    /*
   3366 	     * convert as much as possible to the parser reading buffer.
   3367 	     */
   3368 	    processed = ctxt->input->cur - ctxt->input->base;
   3369 	    xmlBufferShrink(ctxt->input->buf->buffer, processed);
   3370 	    nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
   3371 		                       ctxt->input->buf->buffer,
   3372 				       ctxt->input->buf->raw);
   3373 	    if (nbchars < 0) {
   3374 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
   3375 		             "htmlCheckEncoding: encoder error\n",
   3376 			     NULL, NULL);
   3377 	    }
   3378 	    ctxt->input->base =
   3379 	    ctxt->input->cur = ctxt->input->buf->buffer->content;
   3380 	}
   3381     }
   3382 }
   3383 
   3384 /**
   3385  * htmlCheckMeta:
   3386  * @ctxt:  an HTML parser context
   3387  * @atts:  the attributes values
   3388  *
   3389  * Checks an attributes from a Meta tag
   3390  */
   3391 static void
   3392 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
   3393     int i;
   3394     const xmlChar *att, *value;
   3395     int http = 0;
   3396     const xmlChar *content = NULL;
   3397 
   3398     if ((ctxt == NULL) || (atts == NULL))
   3399 	return;
   3400 
   3401     i = 0;
   3402     att = atts[i++];
   3403     while (att != NULL) {
   3404 	value = atts[i++];
   3405 	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
   3406 	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
   3407 	    http = 1;
   3408 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
   3409 	    content = value;
   3410 	att = atts[i++];
   3411     }
   3412     if ((http) && (content != NULL))
   3413 	htmlCheckEncoding(ctxt, content);
   3414 
   3415 }
   3416 
/**
 * htmlParseStartTag:
 * @ctxt:  an HTML parser context
 *
 * Parse a start tag, either for a regular element or for an
 * EmptyElement. In both cases the tag closing chars ('>' or '/>') are
 * not consumed.
 *
 * [40] STag ::= '<' Name (S Attribute)* S? '>'
 *
 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
 *
 * With namespace:
 *
 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
 *
 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
 *
 * Attributes are collected in ctxt->atts as alternating name/value
 * entries followed by two NULL entries. The array is allocated on first
 * use and doubled when it gets too small. The attribute values are freed
 * before returning; the array itself stays attached to the context.
 *
 * Unless the tag is discarded, the element name is pushed with
 * htmlnamePush() and the SAX startElement callback is invoked.
 *
 * Returns 0 in case of success, -1 in case of error (invalid context,
 *         input not at '<', or no element name) and 1 if the tag was
 *         discarded as a misplaced html, head or body tag.
 */

static int
htmlParseStartTag(htmlParserCtxtPtr ctxt) {
    const xmlChar *name;
    const xmlChar *attname;
    xmlChar *attvalue;
    const xmlChar **atts;
    int nbatts = 0;	/* number of used slots in atts (2 per attribute) */
    int maxatts;
    int meta = 0;	/* set when the element is a meta tag */
    int i;
    int discardtag = 0;	/* set when the tag must not reach SAX */

    if ((ctxt == NULL) || (ctxt->input == NULL)) {
	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
		     "htmlParseStartTag: context error\n", NULL, NULL);
	return -1;
    }
    if (CUR != '<') return -1;
    NEXT;

    /* Work on the attribute array cached in the context (may be NULL). */
    atts = ctxt->atts;
    maxatts = ctxt->maxatts;

    GROW;
    name = htmlParseHTMLName(ctxt);
    if (name == NULL) {
	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
	             "htmlParseStartTag: invalid element name\n",
		     NULL, NULL);
	/* Dump the bogus tag like browsers do */
	while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
	    NEXT;
        return -1;
    }
    if (xmlStrEqual(name, BAD_CAST"meta"))
	meta = 1;

    /*
     * Check for auto-closure of HTML elements.
     */
    htmlAutoClose(ctxt, name);

    /*
     * Check for implied HTML elements.
     */
    htmlCheckImplied(ctxt, name);

    /*
     * Avoid html at any level > 0, head at any level != 1
     * or any attempt to recurse body.
     * Each discarded tag also bumps ctxt->depth, which
     * htmlParseEndTag() consults to ignore the matching end tag.
     */
    if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
	             "htmlParseStartTag: misplaced <html> tag\n",
		     name, NULL);
	discardtag = 1;
	ctxt->depth++;
    }
    if ((ctxt->nameNr != 1) &&
	(xmlStrEqual(name, BAD_CAST"head"))) {
	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
	             "htmlParseStartTag: misplaced <head> tag\n",
		     name, NULL);
	discardtag = 1;
	ctxt->depth++;
    }
    if (xmlStrEqual(name, BAD_CAST"body")) {
	int indx;
	/* a body tag is misplaced when a body is already on the name stack */
	for (indx = 0;indx < ctxt->nameNr;indx++) {
	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
		             "htmlParseStartTag: misplaced <body> tag\n",
			     name, NULL);
		discardtag = 1;
		ctxt->depth++;
	    }
	}
    }

    /*
     * Now parse the attributes, it ends up with the ending
     *
     * (S Attribute)* S?
     */
    SKIP_BLANKS;
    while ((IS_CHAR_CH(CUR)) &&
           (CUR != '>') &&
	   ((CUR != '/') || (NXT(1) != '>'))) {
	/* snapshot used below to detect an iteration that consumed nothing */
	long cons = ctxt->nbChars;

	GROW;
	attname = htmlParseAttribute(ctxt, &attvalue);
        if (attname != NULL) {

	    /*
	     * Well formedness requires at most one declaration of an attribute
	     * (names sit at the even indexes of atts). A repeated attribute
	     * is reported and dropped; the first occurrence is kept.
	     */
	    for (i = 0; i < nbatts;i += 2) {
	        if (xmlStrEqual(atts[i], attname)) {
		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
		                 "Attribute %s redefined\n", attname, NULL);
		    if (attvalue != NULL)
			xmlFree(attvalue);
		    goto failed;
		}
	    }

	    /*
	     * Add the pair to atts
	     */
	    if (atts == NULL) {
	        maxatts = 22; /* allow for 10 attrs by default */
	        atts = (const xmlChar **)
		       xmlMalloc(maxatts * sizeof(xmlChar *));
		if (atts == NULL) {
		    htmlErrMemory(ctxt, NULL);
		    if (attvalue != NULL)
			xmlFree(attvalue);
		    goto failed;
		}
		ctxt->atts = atts;
		ctxt->maxatts = maxatts;
	    } else if (nbatts + 4 > maxatts) {
		/*
		 * Not enough room for the new pair plus the two NULL
		 * terminators: double the array. The result goes through
		 * a temporary so the old array is not lost on failure.
		 */
	        const xmlChar **n;

	        maxatts *= 2;
	        n = (const xmlChar **) xmlRealloc((void *) atts,
					     maxatts * sizeof(const xmlChar *));
		if (n == NULL) {
		    htmlErrMemory(ctxt, NULL);
		    if (attvalue != NULL)
			xmlFree(attvalue);
		    goto failed;
		}
		atts = n;
		ctxt->atts = atts;
		ctxt->maxatts = maxatts;
	    }
	    /* store the pair and keep the array terminated by two NULLs */
	    atts[nbatts++] = attname;
	    atts[nbatts++] = attvalue;
	    atts[nbatts] = NULL;
	    atts[nbatts + 1] = NULL;
	}
	else {
	    if (attvalue != NULL)
	        xmlFree(attvalue);
	    /* Dump the bogus attribute string up to the next blank or
	     * the end of the tag. */
	    while ((IS_CHAR_CH(CUR)) &&
	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
		   ((CUR != '/') || (NXT(1) != '>')))
		NEXT;
	}

failed:
	/*
	 * Common tail of every iteration, including the error paths that
	 * jump here. If nothing was consumed, stop to avoid looping forever.
	 */
	SKIP_BLANKS;
        if (cons == ctxt->nbChars) {
	    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
	                 "htmlParseStartTag: problem parsing attributes\n",
			 NULL, NULL);
	    break;
	}
    }

    /*
     * Handle specific association to the META tag
     */
    if (meta && (nbatts != 0))
	htmlCheckMeta(ctxt, atts);

    /*
     * SAX: Start of Element !
     * A discarded tag is neither pushed on the name stack nor reported.
     */
    if (!discardtag) {
	htmlnamePush(ctxt, name);
	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
	    if (nbatts != 0)
		ctxt->sax->startElement(ctxt->userData, name, atts);
	    else
		ctxt->sax->startElement(ctxt->userData, name, NULL);
	}
    }

    /*
     * Release the attribute values (odd indexes). The names at the even
     * indexes are not freed here.
     */
    if (atts != NULL) {
        for (i = 1;i < nbatts;i += 2) {
	    if (atts[i] != NULL)
		xmlFree((xmlChar *) atts[i]);
	}
    }

    return(discardtag);
}
   3629 
   3630 /**
   3631  * htmlParseEndTag:
   3632  * @ctxt:  an HTML parser context
   3633  *
   3634  * parse an end of tag
   3635  *
   3636  * [42] ETag ::= '</' Name S? '>'
   3637  *
   3638  * With namespace
   3639  *
   3640  * [NS 9] ETag ::= '</' QName S? '>'
   3641  *
   3642  * Returns 1 if the current level should be closed.
   3643  */
   3644 
   3645 static int
   3646 htmlParseEndTag(htmlParserCtxtPtr ctxt)
   3647 {
   3648     const xmlChar *name;
   3649     const xmlChar *oldname;
   3650     int i, ret;
   3651 
   3652     if ((CUR != '<') || (NXT(1) != '/')) {
   3653         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
   3654 	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
   3655         return (0);
   3656     }
   3657     SKIP(2);
   3658 
   3659     name = htmlParseHTMLName(ctxt);
   3660     if (name == NULL)
   3661         return (0);
   3662     /*
   3663      * We should definitely be at the ending "S? '>'" part
   3664      */
   3665     SKIP_BLANKS;
   3666     if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
   3667         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
   3668 	             "End tag : expected '>'\n", NULL, NULL);
   3669 	if (ctxt->recovery) {
   3670 	    /*
   3671 	     * We're not at the ending > !!
   3672 	     * Error, unless in recover mode where we search forwards
   3673 	     * until we find a >
   3674 	     */
   3675 	    while (CUR != '\0' && CUR != '>') NEXT;
   3676 	    NEXT;
   3677 	}
   3678     } else
   3679         NEXT;
   3680 
   3681     /*
   3682      * if we ignored misplaced tags in htmlParseStartTag don't pop them
   3683      * out now.
   3684      */
   3685     if ((ctxt->depth > 0) &&
   3686         (xmlStrEqual(name, BAD_CAST "html") ||
   3687          xmlStrEqual(name, BAD_CAST "body") ||
   3688 	 xmlStrEqual(name, BAD_CAST "head"))) {
   3689 	ctxt->depth--;
   3690 	return (0);
   3691     }
   3692 
   3693     /*
   3694      * If the name read is not one of the element in the parsing stack
   3695      * then return, it's just an error.
   3696      */
   3697     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
   3698         if (xmlStrEqual(name, ctxt->nameTab[i]))
   3699             break;
   3700     }
   3701     if (i < 0) {
   3702         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
   3703 	             "Unexpected end tag : %s\n", name, NULL);
   3704         return (0);
   3705     }
   3706 
   3707 
   3708     /*
   3709      * Check for auto-closure of HTML elements.
   3710      */
   3711 
   3712     htmlAutoCloseOnClose(ctxt, name);
   3713 
   3714     /*
   3715      * Well formedness constraints, opening and closing must match.
   3716      * With the exception that the autoclose may have popped stuff out
   3717      * of the stack.
   3718      */
   3719     if (!xmlStrEqual(name, ctxt->name)) {
   3720         if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
   3721             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
   3722 	                 "Opening and ending tag mismatch: %s and %s\n",
   3723 			 name, ctxt->name);
   3724         }
   3725     }
   3726 
   3727     /*
   3728      * SAX: End of Tag
   3729      */
   3730     oldname = ctxt->name;
   3731     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
   3732         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   3733             ctxt->sax->endElement(ctxt->userData, name);
   3734         htmlnamePop(ctxt);
   3735         ret = 1;
   3736     } else {
   3737         ret = 0;
   3738     }
   3739 
   3740     return (ret);
   3741 }
   3742 
   3743 
   3744 /**
   3745  * htmlParseReference:
   3746  * @ctxt:  an HTML parser context
   3747  *
   3748  * parse and handle entity references in content,
   3749  * this will end-up in a call to character() since this is either a
   3750  * CharRef, or a predefined entity.
   3751  */
   3752 static void
   3753 htmlParseReference(htmlParserCtxtPtr ctxt) {
   3754     const htmlEntityDesc * ent;
   3755     xmlChar out[6];
   3756     const xmlChar *name;
   3757     if (CUR != '&') return;
   3758 
   3759     if (NXT(1) == '#') {
   3760 	unsigned int c;
   3761 	int bits, i = 0;
   3762 
   3763 	c = htmlParseCharRef(ctxt);
   3764 	if (c == 0)
   3765 	    return;
   3766 
   3767         if      (c <    0x80) { out[i++]= c;                bits= -6; }
   3768         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   3769         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   3770         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   3771 
   3772         for ( ; bits >= 0; bits-= 6) {
   3773             out[i++]= ((c >> bits) & 0x3F) | 0x80;
   3774         }
   3775 	out[i] = 0;
   3776 
   3777 	htmlCheckParagraph(ctxt);
   3778 	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
   3779 	    ctxt->sax->characters(ctxt->userData, out, i);
   3780     } else {
   3781 	ent = htmlParseEntityRef(ctxt, &name);
   3782 	if (name == NULL) {
   3783 	    htmlCheckParagraph(ctxt);
   3784 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
   3785 	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
   3786 	    return;
   3787 	}
   3788 	if ((ent == NULL) || !(ent->value > 0)) {
   3789 	    htmlCheckParagraph(ctxt);
   3790 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
   3791 		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
   3792 		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
   3793 		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
   3794 	    }
   3795 	} else {
   3796 	    unsigned int c;
   3797 	    int bits, i = 0;
   3798 
   3799 	    c = ent->value;
   3800 	    if      (c <    0x80)
   3801 	            { out[i++]= c;                bits= -6; }
   3802 	    else if (c <   0x800)
   3803 	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   3804 	    else if (c < 0x10000)
   3805 	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   3806 	    else
   3807 	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   3808 
   3809 	    for ( ; bits >= 0; bits-= 6) {
   3810 		out[i++]= ((c >> bits) & 0x3F) | 0x80;
   3811 	    }
   3812 	    out[i] = 0;
   3813 
   3814 	    htmlCheckParagraph(ctxt);
   3815 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
   3816 		ctxt->sax->characters(ctxt->userData, out, i);
   3817 	}
   3818     }
   3819 }
   3820 
/**
 * htmlParseContent:
 * @ctxt:  an HTML parser context
 *
 * Parse a content: comment, sub-element, reference or text.
 *
 * The function loops over the input, dispatching on the current
 * characters, until one of the following happens:
 *  - an end tag closes a level (see the first branch of the loop);
 *  - a start tag name cannot be read;
 *  - the element that was open on entry is no longer the current one;
 *  - the end of the input is reached;
 *  - an iteration of the generic branch consumed no input.
 */

static void
htmlParseContent(htmlParserCtxtPtr ctxt) {
    xmlChar *currentNode;
    int depth;
    const xmlChar *name;

    /*
     * Remember which element is open on entry (as a private copy of its
     * name) and how deep the name stack is, to detect later that the
     * element has been popped.
     */
    currentNode = xmlStrdup(ctxt->name);
    depth = ctxt->nameNr;
    while (1) {
	/* snapshot used at the bottom of the loop to detect lack of progress */
	long cons = ctxt->nbChars;

        GROW;
	/*
	 * Our tag or one of it's parent or children is ending.
	 * Return when the end tag closed a level and either an element
	 * was open on entry or the name stack is now empty; otherwise
	 * restart the loop on what follows.
	 */
        if ((CUR == '<') && (NXT(1) == '/')) {
	    if (htmlParseEndTag(ctxt) &&
		((currentNode != NULL) || (ctxt->nameNr == 0))) {
		if (currentNode != NULL)
		    xmlFree(currentNode);
		return;
	    }
	    continue; /* while */
        }

	/*
	 * Something looking like a start tag: read its name (the helper
	 * is presumably a lookahead that does not consume input - TODO
	 * confirm) and let it auto-close the current element if needed.
	 */
	else if ((CUR == '<') &&
	         ((IS_ASCII_LETTER(NXT(1))) ||
		  (NXT(1) == '_') || (NXT(1) == ':'))) {
	    name = htmlParseHTMLName_nonInvasive(ctxt);
	    if (name == NULL) {
	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
			 "htmlParseStartTag: invalid element name\n",
			 NULL, NULL);
	        /* Dump the bogus tag like browsers do */
 	        while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
	            NEXT;

	        if (currentNode != NULL)
	            xmlFree(currentNode);
	        return;
	    }

	    if (ctxt->name != NULL) {
	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
	            htmlAutoClose(ctxt, name);
	            continue;
	        }
	    }
	}

	/*
	 * Has this node been popped out during parsing of
	 * the next element
	 */
        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
	    (!xmlStrEqual(currentNode, ctxt->name)))
	     {
	    if (currentNode != NULL) xmlFree(currentNode);
	    return;
	}

	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
	    /*
	     * Handle SCRIPT/STYLE separately
	     */
	    htmlParseScript(ctxt);
	} else {
	    /*
	     * Sometimes DOCTYPE arrives in the middle of the document.
	     * This test is not part of the if/else chain below: after the
	     * declaration has been parsed, the chain still runs on
	     * whatever follows it.
	     */
	    if ((CUR == '<') && (NXT(1) == '!') &&
		(UPP(2) == 'D') && (UPP(3) == 'O') &&
		(UPP(4) == 'C') && (UPP(5) == 'T') &&
		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
		(UPP(8) == 'E')) {
		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
		             "Misplaced DOCTYPE declaration\n",
			     BAD_CAST "DOCTYPE" , NULL);
		htmlParseDocTypeDecl(ctxt);
	    }

	    /*
	     * First case :  a comment
	     */
	    if ((CUR == '<') && (NXT(1) == '!') &&
		(NXT(2) == '-') && (NXT(3) == '-')) {
		htmlParseComment(ctxt);
	    }

	    /*
	     * Second case : a Processing Instruction.
	     */
	    else if ((CUR == '<') && (NXT(1) == '?')) {
		htmlParsePI(ctxt);
	    }

	    /*
	     * Third case :  a sub-element.
	     */
	    else if (CUR == '<') {
		htmlParseElement(ctxt);
	    }

	    /*
	     * Fourth case : a reference. If it has not been resolved,
	     *    parsing returns its Name, create the node
	     */
	    else if (CUR == '&') {
		htmlParseReference(ctxt);
	    }

	    /*
	     * Fifth case : end of the resource
	     */
	    else if (CUR == 0) {
		htmlAutoCloseOnEnd(ctxt);
		break;
	    }

	    /*
	     * Last case, text. Note that References are handled directly.
	     */
	    else {
		htmlParseCharData(ctxt);
	    }

	    /*
	     * Nothing was consumed by this iteration: stop rather than
	     * loop forever. The error is only reported when a current
	     * node exists. This check does not cover the SCRIPT/STYLE
	     * branch above.
	     */
	    if (cons == ctxt->nbChars) {
		if (ctxt->node != NULL) {
		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
		                 "detected an error in element content\n",
				 NULL, NULL);
		}
		break;
	    }
	}
        GROW;
    }
    /* reached through 'break' only; the return paths free it themselves */
    if (currentNode != NULL) xmlFree(currentNode);
}
   3968 
   3969 /**
   3970  * htmlParseContent:
   3971  * @ctxt:  an HTML parser context
   3972  *
   3973  * Parse a content: comment, sub-element, reference or text.
   3974  */
   3975 
   3976 void
   3977 __htmlParseContent(void *ctxt) {
   3978     if (ctxt != NULL)
   3979 	htmlParseContent((htmlParserCtxtPtr) ctxt);
   3980 }
   3981 
   3982 /**
   3983  * htmlParseElement:
   3984  * @ctxt:  an HTML parser context
   3985  *
   3986  * parse an HTML element, this is highly recursive
   3987  *
   3988  * [39] element ::= EmptyElemTag | STag content ETag
   3989  *
   3990  * [41] Attribute ::= Name Eq AttValue
   3991  */
   3992 
   3993 void
   3994 htmlParseElement(htmlParserCtxtPtr ctxt) {
   3995     const xmlChar *name;
   3996     xmlChar *currentNode = NULL;
   3997     const htmlElemDesc * info;
   3998     htmlParserNodeInfo node_info;
   3999     int failed;
   4000     int depth;
   4001     const xmlChar *oldptr;
   4002 
   4003     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   4004 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   4005 		     "htmlParseElement: context error\n", NULL, NULL);
   4006 	return;
   4007     }
   4008     /* Capture start position */
   4009     if (ctxt->record_info) {
   4010         node_info.begin_pos = ctxt->input->consumed +
   4011                           (CUR_PTR - ctxt->input->base);
   4012 	node_info.begin_line = ctxt->input->line;
   4013     }
   4014 
   4015     failed = htmlParseStartTag(ctxt);
   4016     name = ctxt->name;
   4017     if ((failed == -1) || (name == NULL)) {
   4018 	if (CUR == '>')
   4019 	    NEXT;
   4020         return;
   4021     }
   4022 
   4023     /*
   4024      * Lookup the info for that element.
   4025      */
   4026     info = htmlTagLookup(name);
   4027     if (info == NULL) {
   4028 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
   4029 	             "Tag %s invalid\n", name, NULL);
   4030     }
   4031 
   4032     /*
   4033      * Check for an Empty Element labeled the XML/SGML way
   4034      */
   4035     if ((CUR == '/') && (NXT(1) == '>')) {
   4036         SKIP(2);
   4037 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   4038 	    ctxt->sax->endElement(ctxt->userData, name);
   4039 	htmlnamePop(ctxt);
   4040 	return;
   4041     }
   4042 
   4043     if (CUR == '>') {
   4044         NEXT;
   4045     } else {
   4046 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
   4047 	             "Couldn't find end of Start Tag %s\n", name, NULL);
   4048 
   4049 	/*
   4050 	 * end of parsing of this node.
   4051 	 */
   4052 	if (xmlStrEqual(name, ctxt->name)) {
   4053 	    nodePop(ctxt);
   4054 	    htmlnamePop(ctxt);
   4055 	}
   4056 
   4057 	/*
   4058 	 * Capture end position and add node
   4059 	 */
   4060 	if (ctxt->record_info) {
   4061 	   node_info.end_pos = ctxt->input->consumed +
   4062 			      (CUR_PTR - ctxt->input->base);
   4063 	   node_info.end_line = ctxt->input->line;
   4064 	   node_info.node = ctxt->node;
   4065 	   xmlParserAddNodeInfo(ctxt, &node_info);
   4066 	}
   4067 	return;
   4068     }
   4069 
   4070     /*
   4071      * Check for an Empty Element from DTD definition
   4072      */
   4073     if ((info != NULL) && (info->empty)) {
   4074 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   4075 	    ctxt->sax->endElement(ctxt->userData, name);
   4076 	htmlnamePop(ctxt);
   4077 	return;
   4078     }
   4079 
   4080     /*
   4081      * Parse the content of the element:
   4082      */
   4083     currentNode = xmlStrdup(ctxt->name);
   4084     depth = ctxt->nameNr;
   4085     while (IS_CHAR_CH(CUR)) {
   4086 	oldptr = ctxt->input->cur;
   4087 	htmlParseContent(ctxt);
   4088 	if (oldptr==ctxt->input->cur) break;
   4089 	if (ctxt->nameNr < depth) break;
   4090     }
   4091 
   4092     /*
   4093      * Capture end position and add node
   4094      */
   4095     if ( currentNode != NULL && ctxt->record_info ) {
   4096        node_info.end_pos = ctxt->input->consumed +
   4097                           (CUR_PTR - ctxt->input->base);
   4098        node_info.end_line = ctxt->input->line;
   4099        node_info.node = ctxt->node;
   4100        xmlParserAddNodeInfo(ctxt, &node_info);
   4101     }
   4102     if (!IS_CHAR_CH(CUR)) {
   4103 	htmlAutoCloseOnEnd(ctxt);
   4104     }
   4105 
   4106     if (currentNode != NULL)
   4107 	xmlFree(currentNode);
   4108 }
   4109 
   4110 /**
   4111  * htmlParseDocument:
   4112  * @ctxt:  an HTML parser context
   4113  *
   4114  * parse an HTML document (and build a tree if using the standard SAX
   4115  * interface).
   4116  *
   4117  * Returns 0, -1 in case of error. the parser context is augmented
   4118  *                as a result of the parsing.
   4119  */
   4120 
   4121 int
   4122 htmlParseDocument(htmlParserCtxtPtr ctxt) {
   4123     xmlChar start[4];
   4124     xmlCharEncoding enc;
   4125     xmlDtdPtr dtd;
   4126 
   4127     xmlInitParser();
   4128 
   4129     htmlDefaultSAXHandlerInit();
   4130 
   4131     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   4132 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   4133 		     "htmlParseDocument: context error\n", NULL, NULL);
   4134 	return(XML_ERR_INTERNAL_ERROR);
   4135     }
   4136     ctxt->html = 1;
   4137     GROW;
   4138     /*
   4139      * SAX: beginning of the document processing.
   4140      */
   4141     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
   4142         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
   4143 
   4144     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
   4145         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
   4146 	/*
   4147 	 * Get the 4 first bytes and decode the charset
   4148 	 * if enc != XML_CHAR_ENCODING_NONE
   4149 	 * plug some encoding conversion routines.
   4150 	 */
   4151 	start[0] = RAW;
   4152 	start[1] = NXT(1);
   4153 	start[2] = NXT(2);
   4154 	start[3] = NXT(3);
   4155 	enc = xmlDetectCharEncoding(&start[0], 4);
   4156 	if (enc != XML_CHAR_ENCODING_NONE) {
   4157 	    xmlSwitchEncoding(ctxt, enc);
   4158 	}
   4159     }
   4160 
   4161     /*
   4162      * Wipe out everything which is before the first '<'
   4163      */
   4164     SKIP_BLANKS;
   4165     if (CUR == 0) {
   4166 	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
   4167 	             "Document is empty\n", NULL, NULL);
   4168     }
   4169 
   4170     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
   4171 	ctxt->sax->startDocument(ctxt->userData);
   4172 
   4173 
   4174     /*
   4175      * Parse possible comments and PIs before any content
   4176      */
   4177     while (((CUR == '<') && (NXT(1) == '!') &&
   4178             (NXT(2) == '-') && (NXT(3) == '-')) ||
   4179 	   ((CUR == '<') && (NXT(1) == '?'))) {
   4180         htmlParseComment(ctxt);
   4181         htmlParsePI(ctxt);
   4182 	SKIP_BLANKS;
   4183     }
   4184 
   4185 
   4186     /*
   4187      * Then possibly doc type declaration(s) and more Misc
   4188      * (doctypedecl Misc*)?
   4189      */
   4190     if ((CUR == '<') && (NXT(1) == '!') &&
   4191 	(UPP(2) == 'D') && (UPP(3) == 'O') &&
   4192 	(UPP(4) == 'C') && (UPP(5) == 'T') &&
   4193 	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
   4194 	(UPP(8) == 'E')) {
   4195 	htmlParseDocTypeDecl(ctxt);
   4196     }
   4197     SKIP_BLANKS;
   4198 
   4199     /*
   4200      * Parse possible comments and PIs before any content
   4201      */
   4202     while (((CUR == '<') && (NXT(1) == '!') &&
   4203             (NXT(2) == '-') && (NXT(3) == '-')) ||
   4204 	   ((CUR == '<') && (NXT(1) == '?'))) {
   4205         htmlParseComment(ctxt);
   4206         htmlParsePI(ctxt);
   4207 	SKIP_BLANKS;
   4208     }
   4209 
   4210     /*
   4211      * Time to start parsing the tree itself
   4212      */
   4213     htmlParseContent(ctxt);
   4214 
   4215     /*
   4216      * autoclose
   4217      */
   4218     if (CUR == 0)
   4219 	htmlAutoCloseOnEnd(ctxt);
   4220 
   4221 
   4222     /*
   4223      * SAX: end of the document processing.
   4224      */
   4225     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
   4226         ctxt->sax->endDocument(ctxt->userData);
   4227 
   4228     if (ctxt->myDoc != NULL) {
   4229 	dtd = xmlGetIntSubset(ctxt->myDoc);
   4230 	if (dtd == NULL)
   4231 	    ctxt->myDoc->intSubset =
   4232 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
   4233 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
   4234 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
   4235     }
   4236     if (! ctxt->wellFormed) return(-1);
   4237     return(0);
   4238 }
   4239 
   4240 
   4241 /************************************************************************
   4242  *									*
   4243  *			Parser contexts handling			*
   4244  *									*
   4245  ************************************************************************/
   4246 
   4247 /**
   4248  * htmlInitParserCtxt:
   4249  * @ctxt:  an HTML parser context
   4250  *
   4251  * Initialize a parser context
   4252  *
   4253  * Returns 0 in case of success and -1 in case of error
   4254  */
   4255 
   4256 static int
   4257 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
   4258 {
   4259     htmlSAXHandler *sax;
   4260 
   4261     if (ctxt == NULL) return(-1);
   4262     memset(ctxt, 0, sizeof(htmlParserCtxt));
   4263 
   4264     ctxt->dict = xmlDictCreate();
   4265     if (ctxt->dict == NULL) {
   4266         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
   4267 	return(-1);
   4268     }
   4269     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
   4270     if (sax == NULL) {
   4271         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
   4272 	return(-1);
   4273     }
   4274     else
   4275         memset(sax, 0, sizeof(htmlSAXHandler));
   4276 
   4277     /* Allocate the Input stack */
   4278     ctxt->inputTab = (htmlParserInputPtr *)
   4279                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
   4280     if (ctxt->inputTab == NULL) {
   4281         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
   4282 	ctxt->inputNr = 0;
   4283 	ctxt->inputMax = 0;
   4284 	ctxt->input = NULL;
   4285 	return(-1);
   4286     }
   4287     ctxt->inputNr = 0;
   4288     ctxt->inputMax = 5;
   4289     ctxt->input = NULL;
   4290     ctxt->version = NULL;
   4291     ctxt->encoding = NULL;
   4292     ctxt->standalone = -1;
   4293     ctxt->instate = XML_PARSER_START;
   4294 
   4295     /* Allocate the Node stack */
   4296     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
   4297     if (ctxt->nodeTab == NULL) {
   4298         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
   4299 	ctxt->nodeNr = 0;
   4300 	ctxt->nodeMax = 0;
   4301 	ctxt->node = NULL;
   4302 	ctxt->inputNr = 0;
   4303 	ctxt->inputMax = 0;
   4304 	ctxt->input = NULL;
   4305 	return(-1);
   4306     }
   4307     ctxt->nodeNr = 0;
   4308     ctxt->nodeMax = 10;
   4309     ctxt->node = NULL;
   4310 
   4311     /* Allocate the Name stack */
   4312     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
   4313     if (ctxt->nameTab == NULL) {
   4314         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
   4315 	ctxt->nameNr = 0;
   4316 	ctxt->nameMax = 10;
   4317 	ctxt->name = NULL;
   4318 	ctxt->nodeNr = 0;
   4319 	ctxt->nodeMax = 0;
   4320 	ctxt->node = NULL;
   4321 	ctxt->inputNr = 0;
   4322 	ctxt->inputMax = 0;
   4323 	ctxt->input = NULL;
   4324 	return(-1);
   4325     }
   4326     ctxt->nameNr = 0;
   4327     ctxt->nameMax = 10;
   4328     ctxt->name = NULL;
   4329 
   4330     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
   4331     else {
   4332         ctxt->sax = sax;
   4333 	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
   4334     }
   4335     ctxt->userData = ctxt;
   4336     ctxt->myDoc = NULL;
   4337     ctxt->wellFormed = 1;
   4338     ctxt->replaceEntities = 0;
   4339     ctxt->linenumbers = xmlLineNumbersDefaultValue;
   4340     ctxt->html = 1;
   4341     ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
   4342     ctxt->vctxt.userData = ctxt;
   4343     ctxt->vctxt.error = xmlParserValidityError;
   4344     ctxt->vctxt.warning = xmlParserValidityWarning;
   4345     ctxt->record_info = 0;
   4346     ctxt->validate = 0;
   4347     ctxt->nbChars = 0;
   4348     ctxt->checkIndex = 0;
   4349     ctxt->catalogs = NULL;
   4350     xmlInitNodeInfoSeq(&ctxt->node_seq);
   4351     return(0);
   4352 }
   4353 
/**
 * htmlFreeParserCtxt:
 * @ctxt:  an HTML parser context
 *
 * Free all the memory used by a parser context. However the parsed
 * document in ctxt->myDoc is not freed.
 *
 * This is a plain wrapper: the context is handed over unchanged to
 * xmlFreeParserCtxt(), which does all the work.
 */

void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
{
    xmlFreeParserCtxt(ctxt);
}
   4367 
   4368 /**
   4369  * htmlNewParserCtxt:
   4370  *
   4371  * Allocate and initialize a new parser context.
   4372  *
   4373  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
   4374  */
   4375 
   4376 htmlParserCtxtPtr
   4377 htmlNewParserCtxt(void)
   4378 {
   4379     xmlParserCtxtPtr ctxt;
   4380 
   4381     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
   4382     if (ctxt == NULL) {
   4383         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
   4384 	return(NULL);
   4385     }
   4386     memset(ctxt, 0, sizeof(xmlParserCtxt));
   4387     if (htmlInitParserCtxt(ctxt) < 0) {
   4388         htmlFreeParserCtxt(ctxt);
   4389 	return(NULL);
   4390     }
   4391     return(ctxt);
   4392 }
   4393 
   4394 /**
   4395  * htmlCreateMemoryParserCtxt:
   4396  * @buffer:  a pointer to a char array
   4397  * @size:  the size of the array
   4398  *
   4399  * Create a parser context for an HTML in-memory document.
   4400  *
   4401  * Returns the new parser context or NULL
   4402  */
   4403 htmlParserCtxtPtr
   4404 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
   4405     xmlParserCtxtPtr ctxt;
   4406     xmlParserInputPtr input;
   4407     xmlParserInputBufferPtr buf;
   4408 
   4409     if (buffer == NULL)
   4410 	return(NULL);
   4411     if (size <= 0)
   4412 	return(NULL);
   4413 
   4414     ctxt = htmlNewParserCtxt();
   4415     if (ctxt == NULL)
   4416 	return(NULL);
   4417 
   4418     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
   4419     if (buf == NULL) return(NULL);
   4420 
   4421     input = xmlNewInputStream(ctxt);
   4422     if (input == NULL) {
   4423 	xmlFreeParserCtxt(ctxt);
   4424 	return(NULL);
   4425     }
   4426 
   4427     input->filename = NULL;
   4428     input->buf = buf;
   4429     input->base = input->buf->buffer->content;
   4430     input->cur = input->buf->buffer->content;
   4431     input->end = &input->buf->buffer->content[input->buf->buffer->use];
   4432 
   4433     inputPush(ctxt, input);
   4434     return(ctxt);
   4435 }
   4436 
   4437 /**
   4438  * htmlCreateDocParserCtxt:
   4439  * @cur:  a pointer to an array of xmlChar
   4440  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   4441  *
   4442  * Create a parser context for an HTML document.
   4443  *
   4444  * TODO: check the need to add encoding handling there
   4445  *
   4446  * Returns the new parser context or NULL
   4447  */
   4448 static htmlParserCtxtPtr
   4449 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
   4450     int len;
   4451     htmlParserCtxtPtr ctxt;
   4452 
   4453     if (cur == NULL)
   4454 	return(NULL);
   4455     len = xmlStrlen(cur);
   4456     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
   4457     if (ctxt == NULL)
   4458 	return(NULL);
   4459 
   4460     if (encoding != NULL) {
   4461 	xmlCharEncoding enc;
   4462 	xmlCharEncodingHandlerPtr handler;
   4463 
   4464 	if (ctxt->input->encoding != NULL)
   4465 	    xmlFree((xmlChar *) ctxt->input->encoding);
   4466 	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
   4467 
   4468 	enc = xmlParseCharEncoding(encoding);
   4469 	/*
   4470 	 * registered set of known encodings
   4471 	 */
   4472 	if (enc != XML_CHAR_ENCODING_ERROR) {
   4473 	    xmlSwitchEncoding(ctxt, enc);
   4474 	    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
   4475 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
   4476 		             "Unsupported encoding %s\n",
   4477 			     (const xmlChar *) encoding, NULL);
   4478 	    }
   4479 	} else {
   4480 	    /*
   4481 	     * fallback for unknown encodings
   4482 	     */
   4483 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
   4484 	    if (handler != NULL) {
   4485 		xmlSwitchToEncoding(ctxt, handler);
   4486 	    } else {
   4487 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
   4488 		             "Unsupported encoding %s\n",
   4489 			     (const xmlChar *) encoding, NULL);
   4490 	    }
   4491 	}
   4492     }
   4493     return(ctxt);
   4494 }
   4495 
   4496 #ifdef LIBXML_PUSH_ENABLED
   4497 /************************************************************************
   4498  *									*
   4499  * 		Progressive parsing interfaces				*
   4500  *									*
   4501  ************************************************************************/
   4502 
   4503 /**
   4504  * htmlParseLookupSequence:
   4505  * @ctxt:  an HTML parser context
   4506  * @first:  the first char to lookup
   4507  * @next:  the next char to lookup or zero
   4508  * @third:  the next char to lookup or zero
   4509  * @comment: flag to force checking inside comments
   4510  *
   4511  * Try to find if a sequence (first, next, third) or  just (first next) or
   4512  * (first) is available in the input stream.
   4513  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
   4514  * to avoid rescanning sequences of bytes, it DOES change the state of the
   4515  * parser, do not use liberally.
   4516  * This is basically similar to xmlParseLookupSequence()
   4517  *
   4518  * Returns the index to the current parsing point if the full sequence
   4519  *      is available, -1 otherwise.
   4520  */
   4521 static int
   4522 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
   4523                         xmlChar next, xmlChar third, int iscomment) {
   4524     int base, len;
   4525     htmlParserInputPtr in;
   4526     const xmlChar *buf;
   4527     int incomment = 0;
   4528 
   4529     in = ctxt->input;
   4530     if (in == NULL) return(-1);
   4531     base = in->cur - in->base;
   4532     if (base < 0) return(-1);
   4533     if (ctxt->checkIndex > base)
   4534         base = ctxt->checkIndex;
   4535     if (in->buf == NULL) {
   4536 	buf = in->base;
   4537 	len = in->length;
   4538     } else {
   4539 	buf = in->buf->buffer->content;
   4540 	len = in->buf->buffer->use;
   4541     }
   4542     /* take into account the sequence length */
   4543     if (third) len -= 2;
   4544     else if (next) len --;
   4545     for (;base < len;base++) {
   4546 	if (!incomment && (base + 4 < len) && !iscomment) {
   4547 	    if ((buf[base] == '<') && (buf[base + 1] == '!') &&
   4548 		(buf[base + 2] == '-') && (buf[base + 3] == '-')) {
   4549 		incomment = 1;
   4550 		/* do not increment past <! - some people use <!--> */
   4551 		base += 2;
   4552 	    }
   4553 	}
   4554 	if (incomment) {
   4555 	    if (base + 3 > len)
   4556 		return(-1);
   4557 	    if ((buf[base] == '-') && (buf[base + 1] == '-') &&
   4558 		(buf[base + 2] == '>')) {
   4559 		incomment = 0;
   4560 		base += 2;
   4561 	    }
   4562 	    continue;
   4563 	}
   4564         if (buf[base] == first) {
   4565 	    if (third != 0) {
   4566 		if ((buf[base + 1] != next) ||
   4567 		    (buf[base + 2] != third)) continue;
   4568 	    } else if (next != 0) {
   4569 		if (buf[base + 1] != next) continue;
   4570 	    }
   4571 	    ctxt->checkIndex = 0;
   4572 #ifdef DEBUG_PUSH
   4573 	    if (next == 0)
   4574 		xmlGenericError(xmlGenericErrorContext,
   4575 			"HPP: lookup '%c' found at %d\n",
   4576 			first, base);
   4577 	    else if (third == 0)
   4578 		xmlGenericError(xmlGenericErrorContext,
   4579 			"HPP: lookup '%c%c' found at %d\n",
   4580 			first, next, base);
   4581 	    else
   4582 		xmlGenericError(xmlGenericErrorContext,
   4583 			"HPP: lookup '%c%c%c' found at %d\n",
   4584 			first, next, third, base);
   4585 #endif
   4586 	    return(base - (in->cur - in->base));
   4587 	}
   4588     }
   4589     ctxt->checkIndex = base;
   4590 #ifdef DEBUG_PUSH
   4591     if (next == 0)
   4592 	xmlGenericError(xmlGenericErrorContext,
   4593 		"HPP: lookup '%c' failed\n", first);
   4594     else if (third == 0)
   4595 	xmlGenericError(xmlGenericErrorContext,
   4596 		"HPP: lookup '%c%c' failed\n", first, next);
   4597     else
   4598 	xmlGenericError(xmlGenericErrorContext,
   4599 		"HPP: lookup '%c%c%c' failed\n", first, next, third);
   4600 #endif
   4601     return(-1);
   4602 }
   4603 
   4604 /**
   4605  * htmlParseTryOrFinish:
   4606  * @ctxt:  an HTML parser context
   4607  * @terminate:  last chunk indicator
   4608  *
   4609  * Try to progress on parsing
   4610  *
   4611  * Returns zero if no parsing was possible
   4612  */
   4613 static int
   4614 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
   4615     int ret = 0;
   4616     htmlParserInputPtr in;
   4617     int avail = 0;
   4618     xmlChar cur, next;
   4619 
   4620 #ifdef DEBUG_PUSH
   4621     switch (ctxt->instate) {
   4622 	case XML_PARSER_EOF:
   4623 	    xmlGenericError(xmlGenericErrorContext,
   4624 		    "HPP: try EOF\n"); break;
   4625 	case XML_PARSER_START:
   4626 	    xmlGenericError(xmlGenericErrorContext,
   4627 		    "HPP: try START\n"); break;
   4628 	case XML_PARSER_MISC:
   4629 	    xmlGenericError(xmlGenericErrorContext,
   4630 		    "HPP: try MISC\n");break;
   4631 	case XML_PARSER_COMMENT:
   4632 	    xmlGenericError(xmlGenericErrorContext,
   4633 		    "HPP: try COMMENT\n");break;
   4634 	case XML_PARSER_PROLOG:
   4635 	    xmlGenericError(xmlGenericErrorContext,
   4636 		    "HPP: try PROLOG\n");break;
   4637 	case XML_PARSER_START_TAG:
   4638 	    xmlGenericError(xmlGenericErrorContext,
   4639 		    "HPP: try START_TAG\n");break;
   4640 	case XML_PARSER_CONTENT:
   4641 	    xmlGenericError(xmlGenericErrorContext,
   4642 		    "HPP: try CONTENT\n");break;
   4643 	case XML_PARSER_CDATA_SECTION:
   4644 	    xmlGenericError(xmlGenericErrorContext,
   4645 		    "HPP: try CDATA_SECTION\n");break;
   4646 	case XML_PARSER_END_TAG:
   4647 	    xmlGenericError(xmlGenericErrorContext,
   4648 		    "HPP: try END_TAG\n");break;
   4649 	case XML_PARSER_ENTITY_DECL:
   4650 	    xmlGenericError(xmlGenericErrorContext,
   4651 		    "HPP: try ENTITY_DECL\n");break;
   4652 	case XML_PARSER_ENTITY_VALUE:
   4653 	    xmlGenericError(xmlGenericErrorContext,
   4654 		    "HPP: try ENTITY_VALUE\n");break;
   4655 	case XML_PARSER_ATTRIBUTE_VALUE:
   4656 	    xmlGenericError(xmlGenericErrorContext,
   4657 		    "HPP: try ATTRIBUTE_VALUE\n");break;
   4658 	case XML_PARSER_DTD:
   4659 	    xmlGenericError(xmlGenericErrorContext,
   4660 		    "HPP: try DTD\n");break;
   4661 	case XML_PARSER_EPILOG:
   4662 	    xmlGenericError(xmlGenericErrorContext,
   4663 		    "HPP: try EPILOG\n");break;
   4664 	case XML_PARSER_PI:
   4665 	    xmlGenericError(xmlGenericErrorContext,
   4666 		    "HPP: try PI\n");break;
   4667 	case XML_PARSER_SYSTEM_LITERAL:
   4668 	    xmlGenericError(xmlGenericErrorContext,
   4669 		    "HPP: try SYSTEM_LITERAL\n");break;
   4670     }
   4671 #endif
   4672 
   4673     while (1) {
   4674 
   4675 	in = ctxt->input;
   4676 	if (in == NULL) break;
   4677 	if (in->buf == NULL)
   4678 	    avail = in->length - (in->cur - in->base);
   4679 	else
   4680 	    avail = in->buf->buffer->use - (in->cur - in->base);
   4681 	if ((avail == 0) && (terminate)) {
   4682 	    htmlAutoCloseOnEnd(ctxt);
   4683 	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
   4684 		/*
   4685 		 * SAX: end of the document processing.
   4686 		 */
   4687 		ctxt->instate = XML_PARSER_EOF;
   4688 		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
   4689 		    ctxt->sax->endDocument(ctxt->userData);
   4690 	    }
   4691 	}
   4692         if (avail < 1)
   4693 	    goto done;
   4694 	cur = in->cur[0];
   4695 	if (cur == 0) {
   4696 	    SKIP(1);
   4697 	    continue;
   4698 	}
   4699 
   4700         switch (ctxt->instate) {
   4701             case XML_PARSER_EOF:
   4702 	        /*
   4703 		 * Document parsing is done !
   4704 		 */
   4705 	        goto done;
   4706             case XML_PARSER_START:
   4707 	        /*
   4708 		 * Very first chars read from the document flow.
   4709 		 */
   4710 		cur = in->cur[0];
   4711 		if (IS_BLANK_CH(cur)) {
   4712 		    SKIP_BLANKS;
   4713 		    if (in->buf == NULL)
   4714 			avail = in->length - (in->cur - in->base);
   4715 		    else
   4716 			avail = in->buf->buffer->use - (in->cur - in->base);
   4717 		}
   4718 		if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
   4719 		    ctxt->sax->setDocumentLocator(ctxt->userData,
   4720 						  &xmlDefaultSAXLocator);
   4721 		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
   4722 	            (!ctxt->disableSAX))
   4723 		    ctxt->sax->startDocument(ctxt->userData);
   4724 
   4725 		cur = in->cur[0];
   4726 		next = in->cur[1];
   4727 		if ((cur == '<') && (next == '!') &&
   4728 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
   4729 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
   4730 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
   4731 		    (UPP(8) == 'E')) {
   4732 		    if ((!terminate) &&
   4733 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
   4734 			goto done;
   4735 #ifdef DEBUG_PUSH
   4736 		    xmlGenericError(xmlGenericErrorContext,
   4737 			    "HPP: Parsing internal subset\n");
   4738 #endif
   4739 		    htmlParseDocTypeDecl(ctxt);
   4740 		    ctxt->instate = XML_PARSER_PROLOG;
   4741 #ifdef DEBUG_PUSH
   4742 		    xmlGenericError(xmlGenericErrorContext,
   4743 			    "HPP: entering PROLOG\n");
   4744 #endif
   4745                 } else {
   4746 		    ctxt->instate = XML_PARSER_MISC;
   4747 #ifdef DEBUG_PUSH
   4748 		    xmlGenericError(xmlGenericErrorContext,
   4749 			    "HPP: entering MISC\n");
   4750 #endif
   4751 		}
   4752 		break;
   4753             case XML_PARSER_MISC:
   4754 		SKIP_BLANKS;
   4755 		if (in->buf == NULL)
   4756 		    avail = in->length - (in->cur - in->base);
   4757 		else
   4758 		    avail = in->buf->buffer->use - (in->cur - in->base);
   4759 		if (avail < 2)
   4760 		    goto done;
   4761 		cur = in->cur[0];
   4762 		next = in->cur[1];
   4763 	        if ((cur == '<') && (next == '!') &&
   4764 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
   4765 		    if ((!terminate) &&
   4766 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
   4767 			goto done;
   4768 #ifdef DEBUG_PUSH
   4769 		    xmlGenericError(xmlGenericErrorContext,
   4770 			    "HPP: Parsing Comment\n");
   4771 #endif
   4772 		    htmlParseComment(ctxt);
   4773 		    ctxt->instate = XML_PARSER_MISC;
   4774 	        } else if ((cur == '<') && (next == '?')) {
   4775 		    if ((!terminate) &&
   4776 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
   4777 			goto done;
   4778 #ifdef DEBUG_PUSH
   4779 		    xmlGenericError(xmlGenericErrorContext,
   4780 			    "HPP: Parsing PI\n");
   4781 #endif
   4782 		    htmlParsePI(ctxt);
   4783 		    ctxt->instate = XML_PARSER_MISC;
   4784 		} else if ((cur == '<') && (next == '!') &&
   4785 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
   4786 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
   4787 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
   4788 		    (UPP(8) == 'E')) {
   4789 		    if ((!terminate) &&
   4790 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
   4791 			goto done;
   4792 #ifdef DEBUG_PUSH
   4793 		    xmlGenericError(xmlGenericErrorContext,
   4794 			    "HPP: Parsing internal subset\n");
   4795 #endif
   4796 		    htmlParseDocTypeDecl(ctxt);
   4797 		    ctxt->instate = XML_PARSER_PROLOG;
   4798 #ifdef DEBUG_PUSH
   4799 		    xmlGenericError(xmlGenericErrorContext,
   4800 			    "HPP: entering PROLOG\n");
   4801 #endif
   4802 		} else if ((cur == '<') && (next == '!') &&
   4803 		           (avail < 9)) {
   4804 		    goto done;
   4805 		} else {
   4806 		    ctxt->instate = XML_PARSER_START_TAG;
   4807 #ifdef DEBUG_PUSH
   4808 		    xmlGenericError(xmlGenericErrorContext,
   4809 			    "HPP: entering START_TAG\n");
   4810 #endif
   4811 		}
   4812 		break;
   4813             case XML_PARSER_PROLOG:
   4814 		SKIP_BLANKS;
   4815 		if (in->buf == NULL)
   4816 		    avail = in->length - (in->cur - in->base);
   4817 		else
   4818 		    avail = in->buf->buffer->use - (in->cur - in->base);
   4819 		if (avail < 2)
   4820 		    goto done;
   4821 		cur = in->cur[0];
   4822 		next = in->cur[1];
   4823 		if ((cur == '<') && (next == '!') &&
   4824 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
   4825 		    if ((!terminate) &&
   4826 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
   4827 			goto done;
   4828 #ifdef DEBUG_PUSH
   4829 		    xmlGenericError(xmlGenericErrorContext,
   4830 			    "HPP: Parsing Comment\n");
   4831 #endif
   4832 		    htmlParseComment(ctxt);
   4833 		    ctxt->instate = XML_PARSER_PROLOG;
   4834 	        } else if ((cur == '<') && (next == '?')) {
   4835 		    if ((!terminate) &&
   4836 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
   4837 			goto done;
   4838 #ifdef DEBUG_PUSH
   4839 		    xmlGenericError(xmlGenericErrorContext,
   4840 			    "HPP: Parsing PI\n");
   4841 #endif
   4842 		    htmlParsePI(ctxt);
   4843 		    ctxt->instate = XML_PARSER_PROLOG;
   4844 		} else if ((cur == '<') && (next == '!') &&
   4845 		           (avail < 4)) {
   4846 		    goto done;
   4847 		} else {
   4848 		    ctxt->instate = XML_PARSER_START_TAG;
   4849 #ifdef DEBUG_PUSH
   4850 		    xmlGenericError(xmlGenericErrorContext,
   4851 			    "HPP: entering START_TAG\n");
   4852 #endif
   4853 		}
   4854 		break;
   4855             case XML_PARSER_EPILOG:
   4856 		if (in->buf == NULL)
   4857 		    avail = in->length - (in->cur - in->base);
   4858 		else
   4859 		    avail = in->buf->buffer->use - (in->cur - in->base);
   4860 		if (avail < 1)
   4861 		    goto done;
   4862 		cur = in->cur[0];
   4863 		if (IS_BLANK_CH(cur)) {
   4864 		    htmlParseCharData(ctxt);
   4865 		    goto done;
   4866 		}
   4867 		if (avail < 2)
   4868 		    goto done;
   4869 		next = in->cur[1];
   4870 	        if ((cur == '<') && (next == '!') &&
   4871 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
   4872 		    if ((!terminate) &&
   4873 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
   4874 			goto done;
   4875 #ifdef DEBUG_PUSH
   4876 		    xmlGenericError(xmlGenericErrorContext,
   4877 			    "HPP: Parsing Comment\n");
   4878 #endif
   4879 		    htmlParseComment(ctxt);
   4880 		    ctxt->instate = XML_PARSER_EPILOG;
   4881 	        } else if ((cur == '<') && (next == '?')) {
   4882 		    if ((!terminate) &&
   4883 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
   4884 			goto done;
   4885 #ifdef DEBUG_PUSH
   4886 		    xmlGenericError(xmlGenericErrorContext,
   4887 			    "HPP: Parsing PI\n");
   4888 #endif
   4889 		    htmlParsePI(ctxt);
   4890 		    ctxt->instate = XML_PARSER_EPILOG;
   4891 		} else if ((cur == '<') && (next == '!') &&
   4892 		           (avail < 4)) {
   4893 		    goto done;
   4894 		} else {
   4895 		    ctxt->errNo = XML_ERR_DOCUMENT_END;
   4896 		    ctxt->wellFormed = 0;
   4897 		    ctxt->instate = XML_PARSER_EOF;
   4898 #ifdef DEBUG_PUSH
   4899 		    xmlGenericError(xmlGenericErrorContext,
   4900 			    "HPP: entering EOF\n");
   4901 #endif
   4902 		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
   4903 			ctxt->sax->endDocument(ctxt->userData);
   4904 		    goto done;
   4905 		}
   4906 		break;
   4907             case XML_PARSER_START_TAG: {
   4908 	        const xmlChar *name;
   4909 		int failed;
   4910 		const htmlElemDesc * info;
   4911 
   4912 		if (avail < 2)
   4913 		    goto done;
   4914 		cur = in->cur[0];
   4915 	        if (cur != '<') {
   4916 		    ctxt->instate = XML_PARSER_CONTENT;
   4917 #ifdef DEBUG_PUSH
   4918 		    xmlGenericError(xmlGenericErrorContext,
   4919 			    "HPP: entering CONTENT\n");
   4920 #endif
   4921 		    break;
   4922 		}
   4923 		if (in->cur[1] == '/') {
   4924 		    ctxt->instate = XML_PARSER_END_TAG;
   4925 		    ctxt->checkIndex = 0;
   4926 #ifdef DEBUG_PUSH
   4927 		    xmlGenericError(xmlGenericErrorContext,
   4928 			    "HPP: entering END_TAG\n");
   4929 #endif
   4930 		    break;
   4931 		}
   4932 		if ((!terminate) &&
   4933 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
   4934 		    goto done;
   4935 
   4936 		failed = htmlParseStartTag(ctxt);
   4937 		name = ctxt->name;
   4938 		if ((failed == -1) ||
   4939 		    (name == NULL)) {
   4940 		    if (CUR == '>')
   4941 			NEXT;
   4942 		    break;
   4943 		}
   4944 
   4945 		/*
   4946 		 * Lookup the info for that element.
   4947 		 */
   4948 		info = htmlTagLookup(name);
   4949 		if (info == NULL) {
   4950 		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
   4951 		                 "Tag %s invalid\n", name, NULL);
   4952 		}
   4953 
   4954 		/*
   4955 		 * Check for an Empty Element labeled the XML/SGML way
   4956 		 */
   4957 		if ((CUR == '/') && (NXT(1) == '>')) {
   4958 		    SKIP(2);
   4959 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   4960 			ctxt->sax->endElement(ctxt->userData, name);
   4961 		    htmlnamePop(ctxt);
   4962 		    ctxt->instate = XML_PARSER_CONTENT;
   4963 #ifdef DEBUG_PUSH
   4964 		    xmlGenericError(xmlGenericErrorContext,
   4965 			    "HPP: entering CONTENT\n");
   4966 #endif
   4967 		    break;
   4968 		}
   4969 
   4970 		if (CUR == '>') {
   4971 		    NEXT;
   4972 		} else {
   4973 		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
   4974 		                 "Couldn't find end of Start Tag %s\n",
   4975 				 name, NULL);
   4976 
   4977 		    /*
   4978 		     * end of parsing of this node.
   4979 		     */
   4980 		    if (xmlStrEqual(name, ctxt->name)) {
   4981 			nodePop(ctxt);
   4982 			htmlnamePop(ctxt);
   4983 		    }
   4984 
   4985 		    ctxt->instate = XML_PARSER_CONTENT;
   4986 #ifdef DEBUG_PUSH
   4987 		    xmlGenericError(xmlGenericErrorContext,
   4988 			    "HPP: entering CONTENT\n");
   4989 #endif
   4990 		    break;
   4991 		}
   4992 
   4993 		/*
   4994 		 * Check for an Empty Element from DTD definition
   4995 		 */
   4996 		if ((info != NULL) && (info->empty)) {
   4997 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   4998 			ctxt->sax->endElement(ctxt->userData, name);
   4999 		    htmlnamePop(ctxt);
   5000 		}
   5001 		ctxt->instate = XML_PARSER_CONTENT;
   5002 #ifdef DEBUG_PUSH
   5003 		xmlGenericError(xmlGenericErrorContext,
   5004 			"HPP: entering CONTENT\n");
   5005 #endif
   5006                 break;
   5007 	    }
   5008             case XML_PARSER_CONTENT: {
   5009 		long cons;
   5010                 /*
   5011 		 * Handle preparsed entities and charRef
   5012 		 */
   5013 		if (ctxt->token != 0) {
   5014 		    xmlChar chr[2] = { 0 , 0 } ;
   5015 
   5016 		    chr[0] = (xmlChar) ctxt->token;
   5017 		    htmlCheckParagraph(ctxt);
   5018 		    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
   5019 			ctxt->sax->characters(ctxt->userData, chr, 1);
   5020 		    ctxt->token = 0;
   5021 		    ctxt->checkIndex = 0;
   5022 		}
   5023 		if ((avail == 1) && (terminate)) {
   5024 		    cur = in->cur[0];
   5025 		    if ((cur != '<') && (cur != '&')) {
   5026 			if (ctxt->sax != NULL) {
   5027 			    if (IS_BLANK_CH(cur)) {
   5028 				if (ctxt->sax->ignorableWhitespace != NULL)
   5029 				    ctxt->sax->ignorableWhitespace(
   5030 					    ctxt->userData, &cur, 1);
   5031 			    } else {
   5032 				htmlCheckParagraph(ctxt);
   5033 				if (ctxt->sax->characters != NULL)
   5034 				    ctxt->sax->characters(
   5035 					    ctxt->userData, &cur, 1);
   5036 			    }
   5037 			}
   5038 			ctxt->token = 0;
   5039 			ctxt->checkIndex = 0;
   5040 			in->cur++;
   5041 			break;
   5042 		    }
   5043 		}
   5044 		if (avail < 2)
   5045 		    goto done;
   5046 		cur = in->cur[0];
   5047 		next = in->cur[1];
   5048 		cons = ctxt->nbChars;
   5049 		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
   5050 		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
   5051 		    /*
   5052 		     * Handle SCRIPT/STYLE separately
   5053 		     */
   5054 		    if (!terminate) {
   5055 		        int idx;
   5056 			xmlChar val;
   5057 
   5058 			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
   5059 			if (idx < 0)
   5060 			    goto done;
   5061 		        val = in->cur[idx + 2];
   5062 			if (val == 0) /* bad cut of input */
   5063 			    goto done;
   5064 		    }
   5065 		    htmlParseScript(ctxt);
   5066 		    if ((cur == '<') && (next == '/')) {
   5067 			ctxt->instate = XML_PARSER_END_TAG;
   5068 			ctxt->checkIndex = 0;
   5069 #ifdef DEBUG_PUSH
   5070 			xmlGenericError(xmlGenericErrorContext,
   5071 				"HPP: entering END_TAG\n");
   5072 #endif
   5073 			break;
   5074 		    }
   5075 		} else {
   5076 		    /*
   5077 		     * Sometimes DOCTYPE arrives in the middle of the document
   5078 		     */
   5079 		    if ((cur == '<') && (next == '!') &&
   5080 			(UPP(2) == 'D') && (UPP(3) == 'O') &&
   5081 			(UPP(4) == 'C') && (UPP(5) == 'T') &&
   5082 			(UPP(6) == 'Y') && (UPP(7) == 'P') &&
   5083 			(UPP(8) == 'E')) {
   5084 			if ((!terminate) &&
   5085 			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
   5086 			    goto done;
   5087 			htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   5088 			             "Misplaced DOCTYPE declaration\n",
   5089 				     BAD_CAST "DOCTYPE" , NULL);
   5090 			htmlParseDocTypeDecl(ctxt);
   5091 		    } else if ((cur == '<') && (next == '!') &&
   5092 			(in->cur[2] == '-') && (in->cur[3] == '-')) {
   5093 			if ((!terminate) &&
   5094 			    (htmlParseLookupSequence(
   5095 			    		ctxt, '-', '-', '>', 1) < 0))
   5096 			    goto done;
   5097 #ifdef DEBUG_PUSH
   5098 			xmlGenericError(xmlGenericErrorContext,
   5099 				"HPP: Parsing Comment\n");
   5100 #endif
   5101 			htmlParseComment(ctxt);
   5102 			ctxt->instate = XML_PARSER_CONTENT;
   5103 		    } else if ((cur == '<') && (next == '?')) {
   5104 			if ((!terminate) &&
   5105 			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
   5106 			    goto done;
   5107 #ifdef DEBUG_PUSH
   5108 			xmlGenericError(xmlGenericErrorContext,
   5109 				"HPP: Parsing PI\n");
   5110 #endif
   5111 			htmlParsePI(ctxt);
   5112 			ctxt->instate = XML_PARSER_CONTENT;
   5113 		    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
   5114 			goto done;
   5115 		    } else if ((cur == '<') && (next == '/')) {
   5116 			ctxt->instate = XML_PARSER_END_TAG;
   5117 			ctxt->checkIndex = 0;
   5118 #ifdef DEBUG_PUSH
   5119 			xmlGenericError(xmlGenericErrorContext,
   5120 				"HPP: entering END_TAG\n");
   5121 #endif
   5122 			break;
   5123 		    } else if (cur == '<') {
   5124 			ctxt->instate = XML_PARSER_START_TAG;
   5125 			ctxt->checkIndex = 0;
   5126 #ifdef DEBUG_PUSH
   5127 			xmlGenericError(xmlGenericErrorContext,
   5128 				"HPP: entering START_TAG\n");
   5129 #endif
   5130 			break;
   5131 		    } else if (cur == '&') {
   5132 			if ((!terminate) &&
   5133 			    (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
   5134 			    goto done;
   5135 #ifdef DEBUG_PUSH
   5136 			xmlGenericError(xmlGenericErrorContext,
   5137 				"HPP: Parsing Reference\n");
   5138 #endif
   5139 			/* TODO: check generation of subtrees if noent !!! */
   5140 			htmlParseReference(ctxt);
   5141 		    } else {
   5142 		        /*
   5143 			 * check that the text sequence is complete
   5144 			 * before handing out the data to the parser
   5145 			 * to avoid problems with erroneous end of
   5146 			 * data detection.
   5147 			 */
   5148 			if ((!terminate) &&
   5149 			    (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
   5150 			    goto done;
   5151 			ctxt->checkIndex = 0;
   5152 #ifdef DEBUG_PUSH
   5153 			xmlGenericError(xmlGenericErrorContext,
   5154 				"HPP: Parsing char data\n");
   5155 #endif
   5156 			htmlParseCharData(ctxt);
   5157 		    }
   5158 		}
   5159 		if (cons == ctxt->nbChars) {
   5160 		    if (ctxt->node != NULL) {
   5161 			htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5162 			             "detected an error in element content\n",
   5163 				     NULL, NULL);
   5164 		    }
   5165 		    NEXT;
   5166 		    break;
   5167 		}
   5168 
   5169 		break;
   5170 	    }
   5171             case XML_PARSER_END_TAG:
   5172 		if (avail < 2)
   5173 		    goto done;
   5174 		if ((!terminate) &&
   5175 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
   5176 		    goto done;
   5177 		htmlParseEndTag(ctxt);
   5178 		if (ctxt->nameNr == 0) {
   5179 		    ctxt->instate = XML_PARSER_EPILOG;
   5180 		} else {
   5181 		    ctxt->instate = XML_PARSER_CONTENT;
   5182 		}
   5183 		ctxt->checkIndex = 0;
   5184 #ifdef DEBUG_PUSH
   5185 		xmlGenericError(xmlGenericErrorContext,
   5186 			"HPP: entering CONTENT\n");
   5187 #endif
   5188 	        break;
   5189             case XML_PARSER_CDATA_SECTION:
   5190 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5191 			"HPP: internal error, state == CDATA\n",
   5192 			     NULL, NULL);
   5193 		ctxt->instate = XML_PARSER_CONTENT;
   5194 		ctxt->checkIndex = 0;
   5195 #ifdef DEBUG_PUSH
   5196 		xmlGenericError(xmlGenericErrorContext,
   5197 			"HPP: entering CONTENT\n");
   5198 #endif
   5199 		break;
   5200             case XML_PARSER_DTD:
   5201 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5202 			"HPP: internal error, state == DTD\n",
   5203 			     NULL, NULL);
   5204 		ctxt->instate = XML_PARSER_CONTENT;
   5205 		ctxt->checkIndex = 0;
   5206 #ifdef DEBUG_PUSH
   5207 		xmlGenericError(xmlGenericErrorContext,
   5208 			"HPP: entering CONTENT\n");
   5209 #endif
   5210 		break;
   5211             case XML_PARSER_COMMENT:
   5212 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5213 			"HPP: internal error, state == COMMENT\n",
   5214 			     NULL, NULL);
   5215 		ctxt->instate = XML_PARSER_CONTENT;
   5216 		ctxt->checkIndex = 0;
   5217 #ifdef DEBUG_PUSH
   5218 		xmlGenericError(xmlGenericErrorContext,
   5219 			"HPP: entering CONTENT\n");
   5220 #endif
   5221 		break;
   5222             case XML_PARSER_PI:
   5223 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5224 			"HPP: internal error, state == PI\n",
   5225 			     NULL, NULL);
   5226 		ctxt->instate = XML_PARSER_CONTENT;
   5227 		ctxt->checkIndex = 0;
   5228 #ifdef DEBUG_PUSH
   5229 		xmlGenericError(xmlGenericErrorContext,
   5230 			"HPP: entering CONTENT\n");
   5231 #endif
   5232 		break;
   5233             case XML_PARSER_ENTITY_DECL:
   5234 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5235 			"HPP: internal error, state == ENTITY_DECL\n",
   5236 			     NULL, NULL);
   5237 		ctxt->instate = XML_PARSER_CONTENT;
   5238 		ctxt->checkIndex = 0;
   5239 #ifdef DEBUG_PUSH
   5240 		xmlGenericError(xmlGenericErrorContext,
   5241 			"HPP: entering CONTENT\n");
   5242 #endif
   5243 		break;
   5244             case XML_PARSER_ENTITY_VALUE:
   5245 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5246 			"HPP: internal error, state == ENTITY_VALUE\n",
   5247 			     NULL, NULL);
   5248 		ctxt->instate = XML_PARSER_CONTENT;
   5249 		ctxt->checkIndex = 0;
   5250 #ifdef DEBUG_PUSH
   5251 		xmlGenericError(xmlGenericErrorContext,
   5252 			"HPP: entering DTD\n");
   5253 #endif
   5254 		break;
   5255             case XML_PARSER_ATTRIBUTE_VALUE:
   5256 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5257 			"HPP: internal error, state == ATTRIBUTE_VALUE\n",
   5258 			     NULL, NULL);
   5259 		ctxt->instate = XML_PARSER_START_TAG;
   5260 		ctxt->checkIndex = 0;
   5261 #ifdef DEBUG_PUSH
   5262 		xmlGenericError(xmlGenericErrorContext,
   5263 			"HPP: entering START_TAG\n");
   5264 #endif
   5265 		break;
   5266 	    case XML_PARSER_SYSTEM_LITERAL:
   5267 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5268 		    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
   5269 			     NULL, NULL);
   5270 		ctxt->instate = XML_PARSER_CONTENT;
   5271 		ctxt->checkIndex = 0;
   5272 #ifdef DEBUG_PUSH
   5273 		xmlGenericError(xmlGenericErrorContext,
   5274 			"HPP: entering CONTENT\n");
   5275 #endif
   5276 		break;
   5277 	    case XML_PARSER_IGNORE:
   5278 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5279 			"HPP: internal error, state == XML_PARSER_IGNORE\n",
   5280 			     NULL, NULL);
   5281 		ctxt->instate = XML_PARSER_CONTENT;
   5282 		ctxt->checkIndex = 0;
   5283 #ifdef DEBUG_PUSH
   5284 		xmlGenericError(xmlGenericErrorContext,
   5285 			"HPP: entering CONTENT\n");
   5286 #endif
   5287 		break;
   5288 	    case XML_PARSER_PUBLIC_LITERAL:
   5289 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5290 			"HPP: internal error, state == XML_PARSER_LITERAL\n",
   5291 			     NULL, NULL);
   5292 		ctxt->instate = XML_PARSER_CONTENT;
   5293 		ctxt->checkIndex = 0;
   5294 #ifdef DEBUG_PUSH
   5295 		xmlGenericError(xmlGenericErrorContext,
   5296 			"HPP: entering CONTENT\n");
   5297 #endif
   5298 		break;
   5299 
   5300 	}
   5301     }
   5302 done:
   5303     if ((avail == 0) && (terminate)) {
   5304 	htmlAutoCloseOnEnd(ctxt);
   5305 	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
   5306 	    /*
   5307 	     * SAX: end of the document processing.
   5308 	     */
   5309 	    ctxt->instate = XML_PARSER_EOF;
   5310 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
   5311 		ctxt->sax->endDocument(ctxt->userData);
   5312 	}
   5313     }
   5314     if ((ctxt->myDoc != NULL) &&
   5315 	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
   5316 	 (ctxt->instate == XML_PARSER_EPILOG))) {
   5317 	xmlDtdPtr dtd;
   5318 	dtd = xmlGetIntSubset(ctxt->myDoc);
   5319 	if (dtd == NULL)
   5320 	    ctxt->myDoc->intSubset =
   5321 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
   5322 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
   5323 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
   5324     }
   5325 #ifdef DEBUG_PUSH
   5326     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
   5327 #endif
   5328     return(ret);
   5329 }
   5330 
   5331 /**
   5332  * htmlParseChunk:
   5333  * @ctxt:  an HTML parser context
   5334  * @chunk:  an char array
   5335  * @size:  the size in byte of the chunk
   5336  * @terminate:  last chunk indicator
   5337  *
   5338  * Parse a Chunk of memory
   5339  *
   5340  * Returns zero if no error, the xmlParserErrors otherwise.
   5341  */
   5342 int
   5343 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
   5344               int terminate) {
   5345     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   5346 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5347 		     "htmlParseChunk: context error\n", NULL, NULL);
   5348 	return(XML_ERR_INTERNAL_ERROR);
   5349     }
   5350     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
   5351         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
   5352 	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
   5353 	int cur = ctxt->input->cur - ctxt->input->base;
   5354 	int res;
   5355 
   5356 	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
   5357 	if (res < 0) {
   5358 	    ctxt->errNo = XML_PARSER_EOF;
   5359 	    ctxt->disableSAX = 1;
   5360 	    return (XML_PARSER_EOF);
   5361 	}
   5362 	ctxt->input->base = ctxt->input->buf->buffer->content + base;
   5363 	ctxt->input->cur = ctxt->input->base + cur;
   5364 	ctxt->input->end =
   5365 	  &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
   5366 #ifdef DEBUG_PUSH
   5367 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
   5368 #endif
   5369 
   5370 #if 0
   5371 	if ((terminate) || (ctxt->input->buf->buffer->use > 80))
   5372 	    htmlParseTryOrFinish(ctxt, terminate);
   5373 #endif
   5374     } else if (ctxt->instate != XML_PARSER_EOF) {
   5375 	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
   5376 	    xmlParserInputBufferPtr in = ctxt->input->buf;
   5377 	    if ((in->encoder != NULL) && (in->buffer != NULL) &&
   5378 		    (in->raw != NULL)) {
   5379 		int nbchars;
   5380 
   5381 		nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
   5382 		if (nbchars < 0) {
   5383 		    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
   5384 			         "encoder error\n", NULL, NULL);
   5385 		    return(XML_ERR_INVALID_ENCODING);
   5386 		}
   5387 	    }
   5388 	}
   5389     }
   5390     htmlParseTryOrFinish(ctxt, terminate);
   5391     if (terminate) {
   5392 	if ((ctxt->instate != XML_PARSER_EOF) &&
   5393 	    (ctxt->instate != XML_PARSER_EPILOG) &&
   5394 	    (ctxt->instate != XML_PARSER_MISC)) {
   5395 	    ctxt->errNo = XML_ERR_DOCUMENT_END;
   5396 	    ctxt->wellFormed = 0;
   5397 	}
   5398 	if (ctxt->instate != XML_PARSER_EOF) {
   5399 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
   5400 		ctxt->sax->endDocument(ctxt->userData);
   5401 	}
   5402 	ctxt->instate = XML_PARSER_EOF;
   5403     }
   5404     return((xmlParserErrors) ctxt->errNo);
   5405 }
   5406 
   5407 /************************************************************************
   5408  *									*
   5409  *			User entry points				*
   5410  *									*
   5411  ************************************************************************/
   5412 
   5413 /**
   5414  * htmlCreatePushParserCtxt:
   5415  * @sax:  a SAX handler
   5416  * @user_data:  The user data returned on SAX callbacks
   5417  * @chunk:  a pointer to an array of chars
   5418  * @size:  number of chars in the array
   5419  * @filename:  an optional file name or URI
   5420  * @enc:  an optional encoding
   5421  *
   5422  * Create a parser context for using the HTML parser in push mode
   5423  * The value of @filename is used for fetching external entities
   5424  * and error/warning reports.
   5425  *
   5426  * Returns the new parser context or NULL
   5427  */
   5428 htmlParserCtxtPtr
   5429 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
   5430                          const char *chunk, int size, const char *filename,
   5431 			 xmlCharEncoding enc) {
   5432     htmlParserCtxtPtr ctxt;
   5433     htmlParserInputPtr inputStream;
   5434     xmlParserInputBufferPtr buf;
   5435 
   5436     xmlInitParser();
   5437 
   5438     buf = xmlAllocParserInputBuffer(enc);
   5439     if (buf == NULL) return(NULL);
   5440 
   5441     ctxt = htmlNewParserCtxt();
   5442     if (ctxt == NULL) {
   5443 	xmlFreeParserInputBuffer(buf);
   5444 	return(NULL);
   5445     }
   5446     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
   5447 	ctxt->charset=XML_CHAR_ENCODING_UTF8;
   5448     if (sax != NULL) {
   5449 	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
   5450 	    xmlFree(ctxt->sax);
   5451 	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
   5452 	if (ctxt->sax == NULL) {
   5453 	    xmlFree(buf);
   5454 	    xmlFree(ctxt);
   5455 	    return(NULL);
   5456 	}
   5457 	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
   5458 	if (user_data != NULL)
   5459 	    ctxt->userData = user_data;
   5460     }
   5461     if (filename == NULL) {
   5462 	ctxt->directory = NULL;
   5463     } else {
   5464         ctxt->directory = xmlParserGetDirectory(filename);
   5465     }
   5466 
   5467     inputStream = htmlNewInputStream(ctxt);
   5468     if (inputStream == NULL) {
   5469 	xmlFreeParserCtxt(ctxt);
   5470 	xmlFree(buf);
   5471 	return(NULL);
   5472     }
   5473 
   5474     if (filename == NULL)
   5475 	inputStream->filename = NULL;
   5476     else
   5477 	inputStream->filename = (char *)
   5478 	    xmlCanonicPath((const xmlChar *) filename);
   5479     inputStream->buf = buf;
   5480     inputStream->base = inputStream->buf->buffer->content;
   5481     inputStream->cur = inputStream->buf->buffer->content;
   5482     inputStream->end =
   5483 	&inputStream->buf->buffer->content[inputStream->buf->buffer->use];
   5484 
   5485     inputPush(ctxt, inputStream);
   5486 
   5487     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
   5488         (ctxt->input->buf != NULL))  {
   5489 	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
   5490 	int cur = ctxt->input->cur - ctxt->input->base;
   5491 
   5492 	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
   5493 
   5494 	ctxt->input->base = ctxt->input->buf->buffer->content + base;
   5495 	ctxt->input->cur = ctxt->input->base + cur;
   5496 	ctxt->input->end =
   5497 	    &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
   5498 #ifdef DEBUG_PUSH
   5499 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
   5500 #endif
   5501     }
   5502     ctxt->progressive = 1;
   5503 
   5504     return(ctxt);
   5505 }
   5506 #endif /* LIBXML_PUSH_ENABLED */
   5507 
   5508 /**
   5509  * htmlSAXParseDoc:
   5510  * @cur:  a pointer to an array of xmlChar
   5511  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   5512  * @sax:  the SAX handler block
   5513  * @userData: if using SAX, this pointer will be provided on callbacks.
   5514  *
   5515  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
   5516  * to handle parse events. If sax is NULL, fallback to the default DOM
   5517  * behavior and return a tree.
   5518  *
   5519  * Returns the resulting document tree unless SAX is NULL or the document is
   5520  *     not well formed.
   5521  */
   5522 
   5523 htmlDocPtr
   5524 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
   5525     htmlDocPtr ret;
   5526     htmlParserCtxtPtr ctxt;
   5527 
   5528     xmlInitParser();
   5529 
   5530     if (cur == NULL) return(NULL);
   5531 
   5532 
   5533     ctxt = htmlCreateDocParserCtxt(cur, encoding);
   5534     if (ctxt == NULL) return(NULL);
   5535     if (sax != NULL) {
   5536         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
   5537         ctxt->sax = sax;
   5538         ctxt->userData = userData;
   5539     }
   5540 
   5541     htmlParseDocument(ctxt);
   5542     ret = ctxt->myDoc;
   5543     if (sax != NULL) {
   5544 	ctxt->sax = NULL;
   5545 	ctxt->userData = NULL;
   5546     }
   5547     htmlFreeParserCtxt(ctxt);
   5548 
   5549     return(ret);
   5550 }
   5551 
   5552 /**
   5553  * htmlParseDoc:
   5554  * @cur:  a pointer to an array of xmlChar
   5555  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   5556  *
   5557  * parse an HTML in-memory document and build a tree.
   5558  *
   5559  * Returns the resulting document tree
   5560  */
   5561 
   5562 htmlDocPtr
   5563 htmlParseDoc(xmlChar *cur, const char *encoding) {
   5564     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
   5565 }
   5566 
   5567 
   5568 /**
   5569  * htmlCreateFileParserCtxt:
   5570  * @filename:  the filename
   5571  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   5572  *
   5573  * Create a parser context for a file content.
   5574  * Automatic support for ZLIB/Compress compressed document is provided
   5575  * by default if found at compile-time.
   5576  *
   5577  * Returns the new parser context or NULL
   5578  */
   5579 htmlParserCtxtPtr
   5580 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
   5581 {
   5582     htmlParserCtxtPtr ctxt;
   5583     htmlParserInputPtr inputStream;
   5584     char *canonicFilename;
   5585     /* htmlCharEncoding enc; */
   5586     xmlChar *content, *content_line = (xmlChar *) "charset=";
   5587 
   5588     if (filename == NULL)
   5589         return(NULL);
   5590 
   5591     ctxt = htmlNewParserCtxt();
   5592     if (ctxt == NULL) {
   5593 	return(NULL);
   5594     }
   5595     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
   5596     if (canonicFilename == NULL) {
   5597 #ifdef LIBXML_SAX1_ENABLED
   5598 	if (xmlDefaultSAXHandler.error != NULL) {
   5599 	    xmlDefaultSAXHandler.error(NULL, "out of memory\n");
   5600 	}
   5601 #endif
   5602 	xmlFreeParserCtxt(ctxt);
   5603 	return(NULL);
   5604     }
   5605 
   5606     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
   5607     xmlFree(canonicFilename);
   5608     if (inputStream == NULL) {
   5609 	xmlFreeParserCtxt(ctxt);
   5610 	return(NULL);
   5611     }
   5612 
   5613     inputPush(ctxt, inputStream);
   5614 
   5615     /* set encoding */
   5616     if (encoding) {
   5617         content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
   5618 	if (content) {
   5619 	    strcpy ((char *)content, (char *)content_line);
   5620             strcat ((char *)content, (char *)encoding);
   5621             htmlCheckEncoding (ctxt, content);
   5622 	    xmlFree (content);
   5623 	}
   5624     }
   5625 
   5626     return(ctxt);
   5627 }
   5628 
   5629 /**
   5630  * htmlSAXParseFile:
   5631  * @filename:  the filename
   5632  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   5633  * @sax:  the SAX handler block
   5634  * @userData: if using SAX, this pointer will be provided on callbacks.
   5635  *
   5636  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
   5637  * compressed document is provided by default if found at compile-time.
   5638  * It use the given SAX function block to handle the parsing callback.
   5639  * If sax is NULL, fallback to the default DOM tree building routines.
   5640  *
   5641  * Returns the resulting document tree unless SAX is NULL or the document is
   5642  *     not well formed.
   5643  */
   5644 
   5645 htmlDocPtr
   5646 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
   5647                  void *userData) {
   5648     htmlDocPtr ret;
   5649     htmlParserCtxtPtr ctxt;
   5650     htmlSAXHandlerPtr oldsax = NULL;
   5651 
   5652     xmlInitParser();
   5653 
   5654     ctxt = htmlCreateFileParserCtxt(filename, encoding);
   5655     if (ctxt == NULL) return(NULL);
   5656     if (sax != NULL) {
   5657 	oldsax = ctxt->sax;
   5658         ctxt->sax = sax;
   5659         ctxt->userData = userData;
   5660     }
   5661 
   5662     htmlParseDocument(ctxt);
   5663 
   5664     ret = ctxt->myDoc;
   5665     if (sax != NULL) {
   5666         ctxt->sax = oldsax;
   5667         ctxt->userData = NULL;
   5668     }
   5669     htmlFreeParserCtxt(ctxt);
   5670 
   5671     return(ret);
   5672 }
   5673 
   5674 /**
   5675  * htmlParseFile:
   5676  * @filename:  the filename
   5677  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   5678  *
   5679  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
   5680  * compressed document is provided by default if found at compile-time.
   5681  *
   5682  * Returns the resulting document tree
   5683  */
   5684 
   5685 htmlDocPtr
   5686 htmlParseFile(const char *filename, const char *encoding) {
   5687     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
   5688 }
   5689 
   5690 /**
   5691  * htmlHandleOmittedElem:
   5692  * @val:  int 0 or 1
   5693  *
   5694  * Set and return the previous value for handling HTML omitted tags.
   5695  *
   5696  * Returns the last value for 0 for no handling, 1 for auto insertion.
   5697  */
   5698 
   5699 int
   5700 htmlHandleOmittedElem(int val) {
   5701     int old = htmlOmittedDefaultValue;
   5702 
   5703     htmlOmittedDefaultValue = val;
   5704     return(old);
   5705 }
   5706 
   5707 /**
   5708  * htmlElementAllowedHere:
   5709  * @parent: HTML parent element
   5710  * @elt: HTML element
   5711  *
   5712  * Checks whether an HTML element may be a direct child of a parent element.
   5713  * Note - doesn't check for deprecated elements
   5714  *
   5715  * Returns 1 if allowed; 0 otherwise.
   5716  */
   5717 int
   5718 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
   5719   const char** p ;
   5720 
   5721   if ( ! elt || ! parent || ! parent->subelts )
   5722 	return 0 ;
   5723 
   5724   for ( p = parent->subelts; *p; ++p )
   5725     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
   5726       return 1 ;
   5727 
   5728   return 0 ;
   5729 }
   5730 /**
   5731  * htmlElementStatusHere:
   5732  * @parent: HTML parent element
   5733  * @elt: HTML element
   5734  *
   5735  * Checks whether an HTML element may be a direct child of a parent element.
   5736  * and if so whether it is valid or deprecated.
   5737  *
   5738  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
   5739  */
   5740 htmlStatus
   5741 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
   5742   if ( ! parent || ! elt )
   5743     return HTML_INVALID ;
   5744   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
   5745     return HTML_INVALID ;
   5746 
   5747   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
   5748 }
   5749 /**
   5750  * htmlAttrAllowed:
   5751  * @elt: HTML element
   5752  * @attr: HTML attribute
   5753  * @legacy: whether to allow deprecated attributes
   5754  *
   5755  * Checks whether an attribute is valid for an element
   5756  * Has full knowledge of Required and Deprecated attributes
   5757  *
   5758  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
   5759  */
   5760 htmlStatus
   5761 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
   5762   const char** p ;
   5763 
   5764   if ( !elt || ! attr )
   5765 	return HTML_INVALID ;
   5766 
   5767   if ( elt->attrs_req )
   5768     for ( p = elt->attrs_req; *p; ++p)
   5769       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
   5770         return HTML_REQUIRED ;
   5771 
   5772   if ( elt->attrs_opt )
   5773     for ( p = elt->attrs_opt; *p; ++p)
   5774       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
   5775         return HTML_VALID ;
   5776 
   5777   if ( legacy && elt->attrs_depr )
   5778     for ( p = elt->attrs_depr; *p; ++p)
   5779       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
   5780         return HTML_DEPRECATED ;
   5781 
   5782   return HTML_INVALID ;
   5783 }
   5784 /**
   5785  * htmlNodeStatus:
   5786  * @node: an htmlNodePtr in a tree
   5787  * @legacy: whether to allow deprecated elements (YES is faster here
   5788  *	for Element nodes)
   5789  *
   5790  * Checks whether the tree node is valid.  Experimental (the author
   5791  *     only uses the HTML enhancements in a SAX parser)
   5792  *
   5793  * Return: for Element nodes, a return from htmlElementAllowedHere (if
   5794  *	legacy allowed) or htmlElementStatusHere (otherwise).
   5795  *	for Attribute nodes, a return from htmlAttrAllowed
   5796  *	for other nodes, HTML_NA (no checks performed)
   5797  */
   5798 htmlStatus
   5799 htmlNodeStatus(const htmlNodePtr node, int legacy) {
   5800   if ( ! node )
   5801     return HTML_INVALID ;
   5802 
   5803   switch ( node->type ) {
   5804     case XML_ELEMENT_NODE:
   5805       return legacy
   5806 	? ( htmlElementAllowedHere (
   5807 		htmlTagLookup(node->parent->name) , node->name
   5808 		) ? HTML_VALID : HTML_INVALID )
   5809 	: htmlElementStatusHere(
   5810 		htmlTagLookup(node->parent->name) ,
   5811 		htmlTagLookup(node->name) )
   5812 	;
   5813     case XML_ATTRIBUTE_NODE:
   5814       return htmlAttrAllowed(
   5815 	htmlTagLookup(node->parent->name) , node->name, legacy) ;
   5816     default: return HTML_NA ;
   5817   }
   5818 }
   5819 /************************************************************************
   5820  *									*
   5821  *	New set (2.6.0) of simpler and more flexible APIs		*
   5822  *									*
   5823  ************************************************************************/
   5824 /**
   5825  * DICT_FREE:
   5826  * @str:  a string
   5827  *
   5828  * Free a string if it is not owned by the "dict" dictionnary in the
   5829  * current scope
   5830  */
   5831 #define DICT_FREE(str)						\
   5832 	if ((str) && ((!dict) || 				\
   5833 	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
   5834 	    xmlFree((char *)(str));
   5835 
   5836 /**
   5837  * htmlCtxtReset:
   5838  * @ctxt: an HTML parser context
   5839  *
   5840  * Reset a parser context
   5841  */
   5842 void
   5843 htmlCtxtReset(htmlParserCtxtPtr ctxt)
   5844 {
   5845     xmlParserInputPtr input;
   5846     xmlDictPtr dict;
   5847 
   5848     if (ctxt == NULL)
   5849         return;
   5850 
   5851     xmlInitParser();
   5852     dict = ctxt->dict;
   5853 
   5854     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
   5855         xmlFreeInputStream(input);
   5856     }
   5857     ctxt->inputNr = 0;
   5858     ctxt->input = NULL;
   5859 
   5860     ctxt->spaceNr = 0;
   5861     if (ctxt->spaceTab != NULL) {
   5862 	ctxt->spaceTab[0] = -1;
   5863 	ctxt->space = &ctxt->spaceTab[0];
   5864     } else {
   5865 	ctxt->space = NULL;
   5866     }
   5867 
   5868 
   5869     ctxt->nodeNr = 0;
   5870     ctxt->node = NULL;
   5871 
   5872     ctxt->nameNr = 0;
   5873     ctxt->name = NULL;
   5874 
   5875     DICT_FREE(ctxt->version);
   5876     ctxt->version = NULL;
   5877     DICT_FREE(ctxt->encoding);
   5878     ctxt->encoding = NULL;
   5879     DICT_FREE(ctxt->directory);
   5880     ctxt->directory = NULL;
   5881     DICT_FREE(ctxt->extSubURI);
   5882     ctxt->extSubURI = NULL;
   5883     DICT_FREE(ctxt->extSubSystem);
   5884     ctxt->extSubSystem = NULL;
   5885     if (ctxt->myDoc != NULL)
   5886         xmlFreeDoc(ctxt->myDoc);
   5887     ctxt->myDoc = NULL;
   5888 
   5889     ctxt->standalone = -1;
   5890     ctxt->hasExternalSubset = 0;
   5891     ctxt->hasPErefs = 0;
   5892     ctxt->html = 1;
   5893     ctxt->external = 0;
   5894     ctxt->instate = XML_PARSER_START;
   5895     ctxt->token = 0;
   5896 
   5897     ctxt->wellFormed = 1;
   5898     ctxt->nsWellFormed = 1;
   5899     ctxt->valid = 1;
   5900     ctxt->vctxt.userData = ctxt;
   5901     ctxt->vctxt.error = xmlParserValidityError;
   5902     ctxt->vctxt.warning = xmlParserValidityWarning;
   5903     ctxt->record_info = 0;
   5904     ctxt->nbChars = 0;
   5905     ctxt->checkIndex = 0;
   5906     ctxt->inSubset = 0;
   5907     ctxt->errNo = XML_ERR_OK;
   5908     ctxt->depth = 0;
   5909     ctxt->charset = XML_CHAR_ENCODING_NONE;
   5910     ctxt->catalogs = NULL;
   5911     xmlInitNodeInfoSeq(&ctxt->node_seq);
   5912 
   5913     if (ctxt->attsDefault != NULL) {
   5914         xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
   5915         ctxt->attsDefault = NULL;
   5916     }
   5917     if (ctxt->attsSpecial != NULL) {
   5918         xmlHashFree(ctxt->attsSpecial, NULL);
   5919         ctxt->attsSpecial = NULL;
   5920     }
   5921 }
   5922 
   5923 /**
   5924  * htmlCtxtUseOptions:
   5925  * @ctxt: an HTML parser context
   5926  * @options:  a combination of htmlParserOption(s)
   5927  *
   5928  * Applies the options to the parser context
   5929  *
   5930  * Returns 0 in case of success, the set of unknown or unimplemented options
   5931  *         in case of error.
   5932  */
   5933 int
   5934 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
   5935 {
   5936     if (ctxt == NULL)
   5937         return(-1);
   5938 
   5939     if (options & HTML_PARSE_NOWARNING) {
   5940         ctxt->sax->warning = NULL;
   5941         ctxt->vctxt.warning = NULL;
   5942         options -= XML_PARSE_NOWARNING;
   5943 	ctxt->options |= XML_PARSE_NOWARNING;
   5944     }
   5945     if (options & HTML_PARSE_NOERROR) {
   5946         ctxt->sax->error = NULL;
   5947         ctxt->vctxt.error = NULL;
   5948         ctxt->sax->fatalError = NULL;
   5949         options -= XML_PARSE_NOERROR;
   5950 	ctxt->options |= XML_PARSE_NOERROR;
   5951     }
   5952     if (options & HTML_PARSE_PEDANTIC) {
   5953         ctxt->pedantic = 1;
   5954         options -= XML_PARSE_PEDANTIC;
   5955 	ctxt->options |= XML_PARSE_PEDANTIC;
   5956     } else
   5957         ctxt->pedantic = 0;
   5958     if (options & XML_PARSE_NOBLANKS) {
   5959         ctxt->keepBlanks = 0;
   5960         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
   5961         options -= XML_PARSE_NOBLANKS;
   5962 	ctxt->options |= XML_PARSE_NOBLANKS;
   5963     } else
   5964         ctxt->keepBlanks = 1;
   5965     if (options & HTML_PARSE_RECOVER) {
   5966         ctxt->recovery = 1;
   5967 	options -= HTML_PARSE_RECOVER;
   5968     } else
   5969         ctxt->recovery = 0;
   5970     if (options & HTML_PARSE_COMPACT) {
   5971 	ctxt->options |= HTML_PARSE_COMPACT;
   5972         options -= HTML_PARSE_COMPACT;
   5973     }
   5974     ctxt->dictNames = 0;
   5975     return (options);
   5976 }
   5977 
   5978 /**
   5979  * htmlDoRead:
   5980  * @ctxt:  an HTML parser context
   5981  * @URL:  the base URL to use for the document
   5982  * @encoding:  the document encoding, or NULL
   5983  * @options:  a combination of htmlParserOption(s)
   5984  * @reuse:  keep the context for reuse
   5985  *
   5986  * Common front-end for the htmlRead functions
   5987  *
   5988  * Returns the resulting document tree or NULL
   5989  */
   5990 static htmlDocPtr
   5991 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
   5992           int options, int reuse)
   5993 {
   5994     htmlDocPtr ret;
   5995 
   5996     htmlCtxtUseOptions(ctxt, options);
   5997     ctxt->html = 1;
   5998     if (encoding != NULL) {
   5999         xmlCharEncodingHandlerPtr hdlr;
   6000 
   6001 	hdlr = xmlFindCharEncodingHandler(encoding);
   6002 	if (hdlr != NULL) {
   6003 	    xmlSwitchToEncoding(ctxt, hdlr);
   6004 	    if (ctxt->input->encoding != NULL)
   6005 	      xmlFree((xmlChar *) ctxt->input->encoding);
   6006             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
   6007         }
   6008     }
   6009     if ((URL != NULL) && (ctxt->input != NULL) &&
   6010         (ctxt->input->filename == NULL))
   6011         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
   6012     htmlParseDocument(ctxt);
   6013     ret = ctxt->myDoc;
   6014     ctxt->myDoc = NULL;
   6015     if (!reuse) {
   6016         if ((ctxt->dictNames) &&
   6017 	    (ret != NULL) &&
   6018 	    (ret->dict == ctxt->dict))
   6019 	    ctxt->dict = NULL;
   6020 	xmlFreeParserCtxt(ctxt);
   6021     }
   6022     return (ret);
   6023 }
   6024 
   6025 /**
   6026  * htmlReadDoc:
   6027  * @cur:  a pointer to a zero terminated string
   6028  * @URL:  the base URL to use for the document
   6029  * @encoding:  the document encoding, or NULL
   6030  * @options:  a combination of htmlParserOption(s)
   6031  *
   6032  * parse an XML in-memory document and build a tree.
   6033  *
   6034  * Returns the resulting document tree
   6035  */
   6036 htmlDocPtr
   6037 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
   6038 {
   6039     htmlParserCtxtPtr ctxt;
   6040 
   6041     if (cur == NULL)
   6042         return (NULL);
   6043 
   6044     xmlInitParser();
   6045     ctxt = htmlCreateDocParserCtxt(cur, NULL);
   6046     if (ctxt == NULL)
   6047         return (NULL);
   6048     return (htmlDoRead(ctxt, URL, encoding, options, 0));
   6049 }
   6050 
   6051 /**
   6052  * htmlReadFile:
   6053  * @filename:  a file or URL
   6054  * @encoding:  the document encoding, or NULL
   6055  * @options:  a combination of htmlParserOption(s)
   6056  *
   6057  * parse an XML file from the filesystem or the network.
   6058  *
   6059  * Returns the resulting document tree
   6060  */
   6061 htmlDocPtr
   6062 htmlReadFile(const char *filename, const char *encoding, int options)
   6063 {
   6064     htmlParserCtxtPtr ctxt;
   6065 
   6066     xmlInitParser();
   6067     ctxt = htmlCreateFileParserCtxt(filename, encoding);
   6068     if (ctxt == NULL)
   6069         return (NULL);
   6070     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
   6071 }
   6072 
   6073 /**
   6074  * htmlReadMemory:
   6075  * @buffer:  a pointer to a char array
   6076  * @size:  the size of the array
   6077  * @URL:  the base URL to use for the document
   6078  * @encoding:  the document encoding, or NULL
   6079  * @options:  a combination of htmlParserOption(s)
   6080  *
   6081  * parse an XML in-memory document and build a tree.
   6082  *
   6083  * Returns the resulting document tree
   6084  */
   6085 htmlDocPtr
   6086 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
   6087 {
   6088     htmlParserCtxtPtr ctxt;
   6089 
   6090     xmlInitParser();
   6091     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
   6092     if (ctxt == NULL)
   6093         return (NULL);
   6094     htmlDefaultSAXHandlerInit();
   6095     if (ctxt->sax != NULL)
   6096         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
   6097     return (htmlDoRead(ctxt, URL, encoding, options, 0));
   6098 }
   6099 
   6100 /**
   6101  * htmlReadFd:
   6102  * @fd:  an open file descriptor
   6103  * @URL:  the base URL to use for the document
   6104  * @encoding:  the document encoding, or NULL
   6105  * @options:  a combination of htmlParserOption(s)
   6106  *
   6107  * parse an XML from a file descriptor and build a tree.
   6108  *
   6109  * Returns the resulting document tree
   6110  */
   6111 htmlDocPtr
   6112 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
   6113 {
   6114     htmlParserCtxtPtr ctxt;
   6115     xmlParserInputBufferPtr input;
   6116     xmlParserInputPtr stream;
   6117 
   6118     if (fd < 0)
   6119         return (NULL);
   6120 
   6121     xmlInitParser();
   6122     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
   6123     if (input == NULL)
   6124         return (NULL);
   6125     ctxt = xmlNewParserCtxt();
   6126     if (ctxt == NULL) {
   6127         xmlFreeParserInputBuffer(input);
   6128         return (NULL);
   6129     }
   6130     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
   6131     if (stream == NULL) {
   6132         xmlFreeParserInputBuffer(input);
   6133 	xmlFreeParserCtxt(ctxt);
   6134         return (NULL);
   6135     }
   6136     inputPush(ctxt, stream);
   6137     return (htmlDoRead(ctxt, URL, encoding, options, 0));
   6138 }
   6139 
   6140 /**
   6141  * htmlReadIO:
   6142  * @ioread:  an I/O read function
   6143  * @ioclose:  an I/O close function
   6144  * @ioctx:  an I/O handler
   6145  * @URL:  the base URL to use for the document
   6146  * @encoding:  the document encoding, or NULL
   6147  * @options:  a combination of htmlParserOption(s)
   6148  *
   6149  * parse an HTML document from I/O functions and source and build a tree.
   6150  *
   6151  * Returns the resulting document tree
   6152  */
   6153 htmlDocPtr
   6154 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
   6155           void *ioctx, const char *URL, const char *encoding, int options)
   6156 {
   6157     htmlParserCtxtPtr ctxt;
   6158     xmlParserInputBufferPtr input;
   6159     xmlParserInputPtr stream;
   6160 
   6161     if (ioread == NULL)
   6162         return (NULL);
   6163     xmlInitParser();
   6164 
   6165     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
   6166                                          XML_CHAR_ENCODING_NONE);
   6167     if (input == NULL)
   6168         return (NULL);
   6169     ctxt = htmlNewParserCtxt();
   6170     if (ctxt == NULL) {
   6171         xmlFreeParserInputBuffer(input);
   6172         return (NULL);
   6173     }
   6174     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
   6175     if (stream == NULL) {
   6176         xmlFreeParserInputBuffer(input);
   6177 	xmlFreeParserCtxt(ctxt);
   6178         return (NULL);
   6179     }
   6180     inputPush(ctxt, stream);
   6181     return (htmlDoRead(ctxt, URL, encoding, options, 0));
   6182 }
   6183 
   6184 /**
   6185  * htmlCtxtReadDoc:
   6186  * @ctxt:  an HTML parser context
   6187  * @cur:  a pointer to a zero terminated string
   6188  * @URL:  the base URL to use for the document
   6189  * @encoding:  the document encoding, or NULL
   6190  * @options:  a combination of htmlParserOption(s)
   6191  *
   6192  * parse an XML in-memory document and build a tree.
   6193  * This reuses the existing @ctxt parser context
   6194  *
   6195  * Returns the resulting document tree
   6196  */
   6197 htmlDocPtr
   6198 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
   6199                const char *URL, const char *encoding, int options)
   6200 {
   6201     xmlParserInputPtr stream;
   6202 
   6203     if (cur == NULL)
   6204         return (NULL);
   6205     if (ctxt == NULL)
   6206         return (NULL);
   6207 
   6208     htmlCtxtReset(ctxt);
   6209 
   6210     stream = xmlNewStringInputStream(ctxt, cur);
   6211     if (stream == NULL) {
   6212         return (NULL);
   6213     }
   6214     inputPush(ctxt, stream);
   6215     return (htmlDoRead(ctxt, URL, encoding, options, 1));
   6216 }
   6217 
   6218 /**
   6219  * htmlCtxtReadFile:
   6220  * @ctxt:  an HTML parser context
   6221  * @filename:  a file or URL
   6222  * @encoding:  the document encoding, or NULL
   6223  * @options:  a combination of htmlParserOption(s)
   6224  *
   6225  * parse an XML file from the filesystem or the network.
   6226  * This reuses the existing @ctxt parser context
   6227  *
   6228  * Returns the resulting document tree
   6229  */
   6230 htmlDocPtr
   6231 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
   6232                 const char *encoding, int options)
   6233 {
   6234     xmlParserInputPtr stream;
   6235 
   6236     if (filename == NULL)
   6237         return (NULL);
   6238     if (ctxt == NULL)
   6239         return (NULL);
   6240 
   6241     htmlCtxtReset(ctxt);
   6242 
   6243     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
   6244     if (stream == NULL) {
   6245         return (NULL);
   6246     }
   6247     inputPush(ctxt, stream);
   6248     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
   6249 }
   6250 
   6251 /**
   6252  * htmlCtxtReadMemory:
   6253  * @ctxt:  an HTML parser context
   6254  * @buffer:  a pointer to a char array
   6255  * @size:  the size of the array
   6256  * @URL:  the base URL to use for the document
   6257  * @encoding:  the document encoding, or NULL
   6258  * @options:  a combination of htmlParserOption(s)
   6259  *
   6260  * parse an XML in-memory document and build a tree.
   6261  * This reuses the existing @ctxt parser context
   6262  *
   6263  * Returns the resulting document tree
   6264  */
   6265 htmlDocPtr
   6266 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
   6267                   const char *URL, const char *encoding, int options)
   6268 {
   6269     xmlParserInputBufferPtr input;
   6270     xmlParserInputPtr stream;
   6271 
   6272     if (ctxt == NULL)
   6273         return (NULL);
   6274     if (buffer == NULL)
   6275         return (NULL);
   6276 
   6277     htmlCtxtReset(ctxt);
   6278 
   6279     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
   6280     if (input == NULL) {
   6281 	return(NULL);
   6282     }
   6283 
   6284     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
   6285     if (stream == NULL) {
   6286 	xmlFreeParserInputBuffer(input);
   6287 	return(NULL);
   6288     }
   6289 
   6290     inputPush(ctxt, stream);
   6291     return (htmlDoRead(ctxt, URL, encoding, options, 1));
   6292 }
   6293 
   6294 /**
   6295  * htmlCtxtReadFd:
   6296  * @ctxt:  an HTML parser context
   6297  * @fd:  an open file descriptor
   6298  * @URL:  the base URL to use for the document
   6299  * @encoding:  the document encoding, or NULL
   6300  * @options:  a combination of htmlParserOption(s)
   6301  *
   6302  * parse an XML from a file descriptor and build a tree.
   6303  * This reuses the existing @ctxt parser context
   6304  *
   6305  * Returns the resulting document tree
   6306  */
   6307 htmlDocPtr
   6308 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
   6309               const char *URL, const char *encoding, int options)
   6310 {
   6311     xmlParserInputBufferPtr input;
   6312     xmlParserInputPtr stream;
   6313 
   6314     if (fd < 0)
   6315         return (NULL);
   6316     if (ctxt == NULL)
   6317         return (NULL);
   6318 
   6319     htmlCtxtReset(ctxt);
   6320 
   6321 
   6322     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
   6323     if (input == NULL)
   6324         return (NULL);
   6325     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
   6326     if (stream == NULL) {
   6327         xmlFreeParserInputBuffer(input);
   6328         return (NULL);
   6329     }
   6330     inputPush(ctxt, stream);
   6331     return (htmlDoRead(ctxt, URL, encoding, options, 1));
   6332 }
   6333 
   6334 /**
   6335  * htmlCtxtReadIO:
   6336  * @ctxt:  an HTML parser context
   6337  * @ioread:  an I/O read function
   6338  * @ioclose:  an I/O close function
   6339  * @ioctx:  an I/O handler
   6340  * @URL:  the base URL to use for the document
   6341  * @encoding:  the document encoding, or NULL
   6342  * @options:  a combination of htmlParserOption(s)
   6343  *
   6344  * parse an HTML document from I/O functions and source and build a tree.
   6345  * This reuses the existing @ctxt parser context
   6346  *
   6347  * Returns the resulting document tree
   6348  */
   6349 htmlDocPtr
   6350 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
   6351               xmlInputCloseCallback ioclose, void *ioctx,
   6352 	      const char *URL,
   6353               const char *encoding, int options)
   6354 {
   6355     xmlParserInputBufferPtr input;
   6356     xmlParserInputPtr stream;
   6357 
   6358     if (ioread == NULL)
   6359         return (NULL);
   6360     if (ctxt == NULL)
   6361         return (NULL);
   6362 
   6363     htmlCtxtReset(ctxt);
   6364 
   6365     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
   6366                                          XML_CHAR_ENCODING_NONE);
   6367     if (input == NULL)
   6368         return (NULL);
   6369     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
   6370     if (stream == NULL) {
   6371         xmlFreeParserInputBuffer(input);
   6372         return (NULL);
   6373     }
   6374     inputPush(ctxt, stream);
   6375     return (htmlDoRead(ctxt, URL, encoding, options, 1));
   6376 }
   6377 
   6378 #define bottom_HTMLparser
   6379 #include "elfgcchack.h"
   6380 #endif /* LIBXML_HTML_ENABLED */
   6381