Home | History | Annotate | Download | only in libxml2
      1 /*
      2  * HTMLparser.c : an HTML 4.0 non-verifying parser
      3  *
      4  * See Copyright for the status of this software.
      5  *
      6  * daniel (at) veillard.com
      7  */
      8 
      9 #define IN_LIBXML
     10 #include "libxml.h"
     11 #ifdef LIBXML_HTML_ENABLED
     12 
     13 #include <string.h>
     14 #ifdef HAVE_CTYPE_H
     15 #include <ctype.h>
     16 #endif
     17 #ifdef HAVE_STDLIB_H
     18 #include <stdlib.h>
     19 #endif
     20 #ifdef HAVE_SYS_STAT_H
     21 #include <sys/stat.h>
     22 #endif
     23 #ifdef HAVE_FCNTL_H
     24 #include <fcntl.h>
     25 #endif
     26 #ifdef HAVE_UNISTD_H
     27 #include <unistd.h>
     28 #endif
     29 #ifdef LIBXML_ZLIB_ENABLED
     30 #include <zlib.h>
     31 #endif
     32 
     33 #include <libxml/xmlmemory.h>
     34 #include <libxml/tree.h>
     35 #include <libxml/parser.h>
     36 #include <libxml/parserInternals.h>
     37 #include <libxml/xmlerror.h>
     38 #include <libxml/HTMLparser.h>
     39 #include <libxml/HTMLtree.h>
     40 #include <libxml/entities.h>
     41 #include <libxml/encoding.h>
     42 #include <libxml/valid.h>
     43 #include <libxml/xmlIO.h>
     44 #include <libxml/globals.h>
     45 #include <libxml/uri.h>
     46 
     47 #include "buf.h"
     48 #include "enc.h"
     49 
     50 #define HTML_MAX_NAMELEN 1000
     51 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
     52 #define HTML_PARSER_BUFFER_SIZE 100
     53 
     54 /* #define DEBUG */
     55 /* #define DEBUG_PUSH */
     56 
     57 static int htmlOmittedDefaultValue = 1;
     58 
     59 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
     60 			     xmlChar end, xmlChar  end2, xmlChar end3);
     61 static void htmlParseComment(htmlParserCtxtPtr ctxt);
     62 
     63 /************************************************************************
     64  *									*
     65  *		Some factorized error routines				*
     66  *									*
     67  ************************************************************************/
     68 
     69 /**
     70  * htmlErrMemory:
     71  * @ctxt:  an HTML parser context
     72  * @extra:  extra informations
     73  *
     74  * Handle a redefinition of attribute error
     75  */
     76 static void
     77 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
     78 {
     79     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
     80         (ctxt->instate == XML_PARSER_EOF))
     81 	return;
     82     if (ctxt != NULL) {
     83         ctxt->errNo = XML_ERR_NO_MEMORY;
     84         ctxt->instate = XML_PARSER_EOF;
     85         ctxt->disableSAX = 1;
     86     }
     87     if (extra)
     88         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
     89                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
     90                         NULL, NULL, 0, 0,
     91                         "Memory allocation failed : %s\n", extra);
     92     else
     93         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
     94                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
     95                         NULL, NULL, 0, 0, "Memory allocation failed\n");
     96 }
     97 
     98 /**
     99  * htmlParseErr:
    100  * @ctxt:  an HTML parser context
    101  * @error:  the error number
    102  * @msg:  the error message
    103  * @str1:  string infor
    104  * @str2:  string infor
    105  *
    106  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
    107  */
    108 static void LIBXML_ATTR_FORMAT(3,0)
    109 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
    110              const char *msg, const xmlChar *str1, const xmlChar *str2)
    111 {
    112     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
    113         (ctxt->instate == XML_PARSER_EOF))
    114 	return;
    115     if (ctxt != NULL)
    116 	ctxt->errNo = error;
    117     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
    118                     XML_ERR_ERROR, NULL, 0,
    119 		    (const char *) str1, (const char *) str2,
    120 		    NULL, 0, 0,
    121 		    msg, str1, str2);
    122     if (ctxt != NULL)
    123 	ctxt->wellFormed = 0;
    124 }
    125 
    126 /**
    127  * htmlParseErrInt:
    128  * @ctxt:  an HTML parser context
    129  * @error:  the error number
    130  * @msg:  the error message
    131  * @val:  integer info
    132  *
    133  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
    134  */
    135 static void LIBXML_ATTR_FORMAT(3,0)
    136 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
    137              const char *msg, int val)
    138 {
    139     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
    140         (ctxt->instate == XML_PARSER_EOF))
    141 	return;
    142     if (ctxt != NULL)
    143 	ctxt->errNo = error;
    144     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
    145                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
    146 		    NULL, val, 0, msg, val);
    147     if (ctxt != NULL)
    148 	ctxt->wellFormed = 0;
    149 }
    150 
    151 /************************************************************************
    152  *									*
    153  *	Parser stacks related functions and macros		*
    154  *									*
    155  ************************************************************************/
    156 
    157 /**
    158  * htmlnamePush:
    159  * @ctxt:  an HTML parser context
    160  * @value:  the element name
    161  *
    162  * Pushes a new element name on top of the name stack
    163  *
    164  * Returns 0 in case of error, the index in the stack otherwise
    165  */
    166 static int
    167 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
    168 {
    169     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
    170         ctxt->html = 3;
    171     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
    172         ctxt->html = 10;
    173     if (ctxt->nameNr >= ctxt->nameMax) {
    174         ctxt->nameMax *= 2;
    175         ctxt->nameTab = (const xmlChar * *)
    176                          xmlRealloc((xmlChar * *)ctxt->nameTab,
    177                                     ctxt->nameMax *
    178                                     sizeof(ctxt->nameTab[0]));
    179         if (ctxt->nameTab == NULL) {
    180             htmlErrMemory(ctxt, NULL);
    181             return (0);
    182         }
    183     }
    184     ctxt->nameTab[ctxt->nameNr] = value;
    185     ctxt->name = value;
    186     return (ctxt->nameNr++);
    187 }
    188 /**
    189  * htmlnamePop:
    190  * @ctxt: an HTML parser context
    191  *
    192  * Pops the top element name from the name stack
    193  *
    194  * Returns the name just removed
    195  */
    196 static const xmlChar *
    197 htmlnamePop(htmlParserCtxtPtr ctxt)
    198 {
    199     const xmlChar *ret;
    200 
    201     if (ctxt->nameNr <= 0)
    202         return (NULL);
    203     ctxt->nameNr--;
    204     if (ctxt->nameNr < 0)
    205         return (NULL);
    206     if (ctxt->nameNr > 0)
    207         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
    208     else
    209         ctxt->name = NULL;
    210     ret = ctxt->nameTab[ctxt->nameNr];
    211     ctxt->nameTab[ctxt->nameNr] = NULL;
    212     return (ret);
    213 }
    214 
    215 /**
    216  * htmlNodeInfoPush:
    217  * @ctxt:  an HTML parser context
    218  * @value:  the node info
    219  *
    220  * Pushes a new element name on top of the node info stack
    221  *
    222  * Returns 0 in case of error, the index in the stack otherwise
    223  */
    224 static int
    225 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
    226 {
    227     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
    228         if (ctxt->nodeInfoMax == 0)
    229                 ctxt->nodeInfoMax = 5;
    230         ctxt->nodeInfoMax *= 2;
    231         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
    232                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
    233                                     ctxt->nodeInfoMax *
    234                                     sizeof(ctxt->nodeInfoTab[0]));
    235         if (ctxt->nodeInfoTab == NULL) {
    236             htmlErrMemory(ctxt, NULL);
    237             return (0);
    238         }
    239     }
    240     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
    241     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
    242     return (ctxt->nodeInfoNr++);
    243 }
    244 
    245 /**
    246  * htmlNodeInfoPop:
    247  * @ctxt:  an HTML parser context
    248  *
    249  * Pops the top element name from the node info stack
    250  *
    251  * Returns 0 in case of error, the pointer to NodeInfo otherwise
    252  */
    253 static htmlParserNodeInfo *
    254 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
    255 {
    256     if (ctxt->nodeInfoNr <= 0)
    257         return (NULL);
    258     ctxt->nodeInfoNr--;
    259     if (ctxt->nodeInfoNr < 0)
    260         return (NULL);
    261     if (ctxt->nodeInfoNr > 0)
    262         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
    263     else
    264         ctxt->nodeInfo = NULL;
    265     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
    266 }
    267 
    268 /*
    269  * Macros for accessing the content. Those should be used only by the parser,
    270  * and not exported.
    271  *
    272  * Dirty macros, i.e. one need to make assumption on the context to use them
    273  *
    274  *   CUR_PTR return the current pointer to the xmlChar to be parsed.
    275  *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
    276  *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
    277  *           in UNICODE mode. This should be used internally by the parser
    278  *           only to compare to ASCII values otherwise it would break when
    279  *           running with UTF-8 encoding.
    280  *   NXT(n)  returns the n'th next xmlChar. Same as CUR is should be used only
    281  *           to compare on ASCII based substring.
    282  *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
    283  *           it should be used only to compare on ASCII based substring.
    284  *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
    285  *           strings without newlines within the parser.
    286  *
    287  * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
    288  *
    289  *   CURRENT Returns the current char value, with the full decoding of
    290  *           UTF-8 if we are using this mode. It returns an int.
    291  *   NEXT    Skip to the next character, this does the proper decoding
    292  *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
    293  *   NEXTL(l) Skip the current unicode character of l xmlChars long.
    294  *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
    295  */
    296 
    297 #define UPPER (toupper(*ctxt->input->cur))
    298 
    299 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
    300 
    301 #define NXT(val) ctxt->input->cur[(val)]
    302 
    303 #define UPP(val) (toupper(ctxt->input->cur[(val)]))
    304 
    305 #define CUR_PTR ctxt->input->cur
    306 #define BASE_PTR ctxt->input->base
    307 
    308 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
    309 		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
    310 	xmlParserInputShrink(ctxt->input)
    311 
    312 #define GROW if ((ctxt->progressive == 0) &&				\
    313 		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
    314 	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
    315 
    316 #define CURRENT ((int) (*ctxt->input->cur))
    317 
    318 #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
    319 
    320 /* Inported from XML */
    321 
    322 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
    323 #define CUR ((int) (*ctxt->input->cur))
    324 #define NEXT xmlNextChar(ctxt)
    325 
    326 #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
    327 
    328 
    329 #define NEXTL(l) do {							\
    330     if (*(ctxt->input->cur) == '\n') {					\
    331 	ctxt->input->line++; ctxt->input->col = 1;			\
    332     } else ctxt->input->col++;						\
    333     ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;		\
    334   } while (0)
    335 
    336 /************
    337     \
    338     if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    339     if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
    340  ************/
    341 
    342 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
    343 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
    344 
    345 #define COPY_BUF(l,b,i,v)						\
    346     if (l == 1) b[i++] = (xmlChar) v;					\
    347     else i += xmlCopyChar(l,&b[i],v)
    348 
    349 /**
    350  * htmlFindEncoding:
    351  * @the HTML parser context
    352  *
    353  * Ty to find and encoding in the current data available in the input
    354  * buffer this is needed to try to switch to the proper encoding when
    355  * one face a character error.
    356  * That's an heuristic, since it's operating outside of parsing it could
    357  * try to use a meta which had been commented out, that's the reason it
    358  * should only be used in case of error, not as a default.
    359  *
    360  * Returns an encoding string or NULL if not found, the string need to
    361  *   be freed
    362  */
    363 static xmlChar *
    364 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
    365     const xmlChar *start, *cur, *end;
    366 
    367     if ((ctxt == NULL) || (ctxt->input == NULL) ||
    368         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
    369         (ctxt->input->buf->encoder != NULL))
    370         return(NULL);
    371     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
    372         return(NULL);
    373 
    374     start = ctxt->input->cur;
    375     end = ctxt->input->end;
    376     /* we also expect the input buffer to be zero terminated */
    377     if (*end != 0)
    378         return(NULL);
    379 
    380     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
    381     if (cur == NULL)
    382         return(NULL);
    383     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
    384     if (cur == NULL)
    385         return(NULL);
    386     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
    387     if (cur == NULL)
    388         return(NULL);
    389     cur += 8;
    390     start = cur;
    391     while (((*cur >= 'A') && (*cur <= 'Z')) ||
    392            ((*cur >= 'a') && (*cur <= 'z')) ||
    393            ((*cur >= '0') && (*cur <= '9')) ||
    394            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
    395            cur++;
    396     if (cur == start)
    397         return(NULL);
    398     return(xmlStrndup(start, cur - start));
    399 }
    400 
    401 /**
    402  * htmlCurrentChar:
    403  * @ctxt:  the HTML parser context
    404  * @len:  pointer to the length of the char read
    405  *
    406  * The current char value, if using UTF-8 this may actually span multiple
    407  * bytes in the input buffer. Implement the end of line normalization:
    408  * 2.11 End-of-Line Handling
    409  * If the encoding is unspecified, in the case we find an ISO-Latin-1
    410  * char, then the encoding converter is plugged in automatically.
    411  *
    412  * Returns the current char value and its length
    413  */
    414 
    415 static int
    416 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
    417     if (ctxt->instate == XML_PARSER_EOF)
    418 	return(0);
    419 
    420     if (ctxt->token != 0) {
    421 	*len = 0;
    422 	return(ctxt->token);
    423     }
    424     if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
    425 	/*
    426 	 * We are supposed to handle UTF8, check it's valid
    427 	 * From rfc2044: encoding of the Unicode values on UTF-8:
    428 	 *
    429 	 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
    430 	 * 0000 0000-0000 007F   0xxxxxxx
    431 	 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
    432 	 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
    433 	 *
    434 	 * Check for the 0x110000 limit too
    435 	 */
    436 	const unsigned char *cur = ctxt->input->cur;
    437 	unsigned char c;
    438 	unsigned int val;
    439 
    440 	c = *cur;
    441 	if (c & 0x80) {
    442 	    if (cur[1] == 0) {
    443 		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
    444                 cur = ctxt->input->cur;
    445             }
    446 	    if ((cur[1] & 0xc0) != 0x80)
    447 		goto encoding_error;
    448 	    if ((c & 0xe0) == 0xe0) {
    449 
    450 		if (cur[2] == 0) {
    451 		    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
    452                     cur = ctxt->input->cur;
    453                 }
    454 		if ((cur[2] & 0xc0) != 0x80)
    455 		    goto encoding_error;
    456 		if ((c & 0xf0) == 0xf0) {
    457 		    if (cur[3] == 0) {
    458 			xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
    459                         cur = ctxt->input->cur;
    460                     }
    461 		    if (((c & 0xf8) != 0xf0) ||
    462 			((cur[3] & 0xc0) != 0x80))
    463 			goto encoding_error;
    464 		    /* 4-byte code */
    465 		    *len = 4;
    466 		    val = (cur[0] & 0x7) << 18;
    467 		    val |= (cur[1] & 0x3f) << 12;
    468 		    val |= (cur[2] & 0x3f) << 6;
    469 		    val |= cur[3] & 0x3f;
    470 		} else {
    471 		  /* 3-byte code */
    472 		    *len = 3;
    473 		    val = (cur[0] & 0xf) << 12;
    474 		    val |= (cur[1] & 0x3f) << 6;
    475 		    val |= cur[2] & 0x3f;
    476 		}
    477 	    } else {
    478 	      /* 2-byte code */
    479 		*len = 2;
    480 		val = (cur[0] & 0x1f) << 6;
    481 		val |= cur[1] & 0x3f;
    482 	    }
    483 	    if (!IS_CHAR(val)) {
    484 	        htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
    485 				"Char 0x%X out of allowed range\n", val);
    486 	    }
    487 	    return(val);
    488 	} else {
    489             if ((*ctxt->input->cur == 0) &&
    490                 (ctxt->input->cur < ctxt->input->end)) {
    491                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
    492 				"Char 0x%X out of allowed range\n", 0);
    493                 *len = 1;
    494                 return(' ');
    495             }
    496 	    /* 1-byte code */
    497 	    *len = 1;
    498 	    return((int) *ctxt->input->cur);
    499 	}
    500     }
    501     /*
    502      * Assume it's a fixed length encoding (1) with
    503      * a compatible encoding for the ASCII set, since
    504      * XML constructs only use < 128 chars
    505      */
    506     *len = 1;
    507     if ((int) *ctxt->input->cur < 0x80)
    508 	return((int) *ctxt->input->cur);
    509 
    510     /*
    511      * Humm this is bad, do an automatic flow conversion
    512      */
    513     {
    514         xmlChar * guess;
    515         xmlCharEncodingHandlerPtr handler;
    516 
    517         guess = htmlFindEncoding(ctxt);
    518         if (guess == NULL) {
    519             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
    520         } else {
    521             if (ctxt->input->encoding != NULL)
    522                 xmlFree((xmlChar *) ctxt->input->encoding);
    523             ctxt->input->encoding = guess;
    524             handler = xmlFindCharEncodingHandler((const char *) guess);
    525             if (handler != NULL) {
    526                 xmlSwitchToEncoding(ctxt, handler);
    527             } else {
    528                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
    529                              "Unsupported encoding %s", guess, NULL);
    530             }
    531         }
    532         ctxt->charset = XML_CHAR_ENCODING_UTF8;
    533     }
    534 
    535     return(xmlCurrentChar(ctxt, len));
    536 
    537 encoding_error:
    538     /*
    539      * If we detect an UTF8 error that probably mean that the
    540      * input encoding didn't get properly advertized in the
    541      * declaration header. Report the error and switch the encoding
    542      * to ISO-Latin-1 (if you don't like this policy, just declare the
    543      * encoding !)
    544      */
    545     {
    546         char buffer[150];
    547 
    548 	if (ctxt->input->end - ctxt->input->cur >= 4) {
    549 	    snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
    550 			    ctxt->input->cur[0], ctxt->input->cur[1],
    551 			    ctxt->input->cur[2], ctxt->input->cur[3]);
    552 	} else {
    553 	    snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
    554 	}
    555 	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
    556 		     "Input is not proper UTF-8, indicate encoding !\n",
    557 		     BAD_CAST buffer, NULL);
    558     }
    559 
    560     ctxt->charset = XML_CHAR_ENCODING_8859_1;
    561     *len = 1;
    562     return((int) *ctxt->input->cur);
    563 }
    564 
    565 /**
    566  * htmlSkipBlankChars:
    567  * @ctxt:  the HTML parser context
    568  *
    569  * skip all blanks character found at that point in the input streams.
    570  *
    571  * Returns the number of space chars skipped
    572  */
    573 
    574 static int
    575 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
    576     int res = 0;
    577 
    578     while (IS_BLANK_CH(*(ctxt->input->cur))) {
    579 	if ((*ctxt->input->cur == 0) &&
    580 	    (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
    581 		xmlPopInput(ctxt);
    582 	} else {
    583 	    if (*(ctxt->input->cur) == '\n') {
    584 		ctxt->input->line++; ctxt->input->col = 1;
    585 	    } else ctxt->input->col++;
    586 	    ctxt->input->cur++;
    587 	    ctxt->nbChars++;
    588 	    if (*ctxt->input->cur == 0)
    589 		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
    590 	}
    591 	res++;
    592     }
    593     return(res);
    594 }
    595 
    596 
    597 
    598 /************************************************************************
    599  *									*
    600  *	The list of HTML elements and their properties		*
    601  *									*
    602  ************************************************************************/
    603 
    604 /*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
    607  *             2 means it's forbidden (empty elements)
    608  *             3 means the tag is stylistic and should be closed easily
    609  *  Depr:      this element is deprecated
    610  *  DTD:       1 means that this element is valid only in the Loose DTD
    611  *             2 means that this element is valid only in the Frameset DTD
    612  *
    613  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
    614 	, subElements , impliedsubelt , Attributes, userdata
    615  */
    616 
    617 /* Definitions and a couple of vars for HTML Elements */
    618 
    619 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
    620 #define NB_FONTSTYLE 8
    621 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
    622 #define NB_PHRASE 10
    623 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
    624 #define NB_SPECIAL 16
    625 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
    626 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
    627 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
    628 #define NB_BLOCK NB_HEADING + NB_LIST + 14
    629 #define FORMCTRL "input", "select", "textarea", "label", "button"
    630 #define NB_FORMCTRL 5
    631 #define PCDATA
    632 #define NB_PCDATA 0
    633 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
    634 #define NB_HEADING 6
    635 #define LIST "ul", "ol", "dir", "menu"
    636 #define NB_LIST 4
    637 #define MODIFIER
    638 #define NB_MODIFIER 0
    639 #define FLOW BLOCK,INLINE
    640 #define NB_FLOW NB_BLOCK + NB_INLINE
    641 #define EMPTY NULL
    642 
    643 
    644 static const char* const html_flow[] = { FLOW, NULL } ;
    645 static const char* const html_inline[] = { INLINE, NULL } ;
    646 
    647 /* placeholders: elts with content but no subelements */
    648 static const char* const html_pcdata[] = { NULL } ;
    649 #define html_cdata html_pcdata
    650 
    651 
    652 /* ... and for HTML Attributes */
    653 
    654 #define COREATTRS "id", "class", "style", "title"
    655 #define NB_COREATTRS 4
    656 #define I18N "lang", "dir"
    657 #define NB_I18N 2
    658 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
    659 #define NB_EVENTS 9
    660 #define ATTRS COREATTRS,I18N,EVENTS
    661 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
    662 #define CELLHALIGN "align", "char", "charoff"
    663 #define NB_CELLHALIGN 3
    664 #define CELLVALIGN "valign"
    665 #define NB_CELLVALIGN 1
    666 
    667 static const char* const html_attrs[] = { ATTRS, NULL } ;
    668 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
    669 static const char* const core_attrs[] = { COREATTRS, NULL } ;
    670 static const char* const i18n_attrs[] = { I18N, NULL } ;
    671 
    672 
    673 /* Other declarations that should go inline ... */
    674 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
    675 	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
    676 	"tabindex", "onfocus", "onblur", NULL } ;
    677 static const char* const target_attr[] = { "target", NULL } ;
    678 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
    679 static const char* const alt_attr[] = { "alt", NULL } ;
    680 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
    681 static const char* const href_attrs[] = { "href", NULL } ;
    682 static const char* const clear_attrs[] = { "clear", NULL } ;
    683 static const char* const inline_p[] = { INLINE, "p", NULL } ;
    684 
    685 static const char* const flow_param[] = { FLOW, "param", NULL } ;
    686 static const char* const applet_attrs[] = { COREATTRS , "codebase",
    687 		"archive", "alt", "name", "height", "width", "align",
    688 		"hspace", "vspace", NULL } ;
    689 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
    690 	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
    691 static const char* const basefont_attrs[] =
    692 	{ "id", "size", "color", "face", NULL } ;
    693 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
    694 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
    695 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
    696 static const char* const body_depr[] = { "background", "bgcolor", "text",
    697 	"link", "vlink", "alink", NULL } ;
    698 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
    699 	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
    700 
    701 
    702 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
    703 static const char* const col_elt[] = { "col", NULL } ;
    704 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
    705 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
    706 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
    707 static const char* const compact_attr[] = { "compact", NULL } ;
    708 static const char* const label_attr[] = { "label", NULL } ;
    709 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
    710 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
    711 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
    712 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
    713 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
    714 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
    715 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
    716 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
    717 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
    718 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
    719 static const char* const version_attr[] = { "version", NULL } ;
    720 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
    721 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
    /*
     * NULL-terminated lists of attribute names or child element names.
     * They are referenced (through the DECL cast) from html40ElementTable
     * below.  Upper-case identifiers such as ATTRS, COREATTRS, I18N, BLOCK,
     * FLOW, PHRASE, CELLHALIGN and CELLVALIGN are macros defined earlier in
     * this file that expand to several comma-separated string literals.
     * NOTE(review): MODIFIER is presumably an empty macro - confirm against
     * its definition earlier in the file.
     */
    722 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
    723 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
    724 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
    725 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
    726 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
    727 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
    728 static const char* const align_attr[] = { "align", NULL } ;
    729 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
    730 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
    731 static const char* const name_attr[] = { "name", NULL } ;
    732 static const char* const action_attr[] = { "action", NULL } ;
    733 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
    734 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
    735 static const char* const content_attr[] = { "content", NULL } ;
    736 static const char* const type_attr[] = { "type", NULL } ;
    737 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
    738 static const char* const object_contents[] = { FLOW, "param", NULL } ;
    739 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
    740 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
    741 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
    742 static const char* const option_elt[] = { "option", NULL } ;
    743 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
    744 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
    745 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
    746 static const char* const width_attr[] = { "width", NULL } ;
    747 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
    748 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
    749 static const char* const language_attr[] = { "language", NULL } ;
    750 static const char* const select_content[] = { "optgroup", "option", NULL } ;
    751 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
    752 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
    753 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
    754 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
    755 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
    756 static const char* const tr_elt[] = { "tr", NULL } ;
    757 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
    758 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
    759 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
    760 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
    761 static const char* const tr_contents[] = { "th", "td", NULL } ;
    762 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
    763 static const char* const li_elt[] = { "li", NULL } ;
    764 static const char* const ul_depr[] = { "type", "compact", NULL} ;
    765 static const char* const dir_attr[] = { "dir", NULL} ;
    766 
    /*
     * Cast applied to the "const char* const" name arrays above when they
     * are stored in html40ElementTable; it drops the inner const qualifier.
     */
    767 #define DECL (const char**)
    768 
    /*
     * Static description of the HTML 4.0 elements known to the parser, one
     * initializer per element, listed in alphabetical order of the tag name.
     * htmlTagLookup() scans this table linearly and compares names without
     * regard to case.
     *
     * NOTE(review): the field order presumably follows htmlElemDesc in
     * <libxml/HTMLparser.h> (name, startTag, endTag, saveEndTag, empty,
     * depr, dtd, isinline, desc, subelts, defaultsubelt, attrs_opt,
     * attrs_depr, attrs_req) - confirm against the header.  Within this
     * file only the name and endTag fields are read directly
     * (htmlTagLookup and htmlAutoCloseOnClose).
     */
    769 static const htmlElemDesc
    770 html40ElementTable[] = {
    771 { "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
    772 	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
    773 },
    774 { "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
    775 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    776 },
    777 { "acronym",	0, 0, 0, 0, 0, 0, 1, "",
    778 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    779 },
    780 { "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
    781 	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
    782 },
    783 { "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
    784 	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
    785 },
    786 { "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
    787 	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
    788 },
    789 { "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
    790 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    791 },
    792 { "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
    793 	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
    794 },
    795 { "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
    796 	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
    797 },
    798 { "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
    799 	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
    800 },
    801 { "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
    802 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    803 },
    804 { "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
    805 	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
    806 },
    807 { "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
    808 	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
    809 },
    810 { "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
    811 	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
    812 },
    813 { "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
    814 	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
    815 },
    816 { "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
    817 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    818 },
    819 { "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
    820 	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
    821 },
    822 { "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
    823 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    824 },
    825 { "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
    826 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    827 },
    828 { "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
    829 	EMPTY , NULL , DECL col_attrs , NULL, NULL
    830 },
    831 { "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
    832 	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
    833 },
    834 { "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
    835 	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
    836 },
    837 { "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
    838 	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
    839 },
    840 { "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
    841 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    842 },
    843 { "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
    844 	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
    845 },
    846 { "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
    847 	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
    848 },
    849 { "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
    850 	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
    851 },
    852 { "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
    853 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    854 },
    855 { "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
    856 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    857 },
    858 { "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
    859 	EMPTY, NULL, DECL embed_attrs, NULL, NULL
    860 },
    861 { "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
    862 	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
    863 },
    864 { "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
    865 	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
    866 },
    867 { "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
    868 	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
    869 },
    870 { "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
    871 	EMPTY, NULL, NULL, DECL frame_attrs, NULL
    872 },
    873 { "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
    874 	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
    875 },
    876 { "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
    877 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    878 },
    879 { "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
    880 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    881 },
    882 { "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
    883 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    884 },
    885 { "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
    886 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    887 },
    888 { "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
    889 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    890 },
    891 { "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
    892 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    893 },
    894 { "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
    895 	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
    896 },
    897 { "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
    898 	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
    899 },
    900 { "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
    901 	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
    902 },
    903 { "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
    904 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    905 },
    906 { "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
    907 	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
    908 },
    909 { "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
    910 	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
    911 },
    912 { "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
    913 	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
    914 },
    915 { "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
    916 	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
    917 },
    918 { "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
    919 	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
    920 },
    921 { "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
    922 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    923 },
    924 { "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
    925 	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
    926 },
    927 { "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
    928 	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
    929 },
    930 { "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
    931 	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
    932 },
    933 { "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
    934 	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
    935 },
    936 { "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
    937 	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
    938 },
    939 { "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
    940 	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
    941 },
    942 { "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
    943 	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
    944 },
    945 { "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
    946 	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
    947 },
    948 { "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
    949 	DECL html_flow, "div", DECL html_attrs, NULL, NULL
    950 },
    951 { "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
    952 	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
    953 },
    954 { "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
    955 	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
    956 },
    957 { "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
    958 	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
    959 },
    960 { "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
    961 	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
    962 },
    963 { "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
    964 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    965 },
    966 { "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
    967 	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
    968 },
    969 { "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
    970 	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
    971 },
    972 { "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
    973 	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
    974 },
    975 { "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
    976 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
    977 },
    978 { "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
    979 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    980 },
    981 { "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
    982 	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
    983 },
    984 { "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
    985 	DECL select_content, NULL, DECL select_attrs, NULL, NULL
    986 },
    987 { "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
    988 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    989 },
    990 { "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
    991 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    992 },
    993 { "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
    994 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
    995 },
    996 { "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
    997 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    998 },
    999 { "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
   1000 	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
   1001 },
   1002 { "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
   1003 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
   1004 },
   1005 { "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
   1006 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
   1007 },
   1008 { "table",	0, 0, 0, 0, 0, 0, 0, "",
   1009 	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
   1010 },
   1011 { "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
   1012 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
   1013 },
   1014 { "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
   1015 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
   1016 },
   1017 { "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
   1018 	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
   1019 },
   1020 { "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
   1021 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
   1022 },
   1023 { "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
   1024 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
   1025 },
   1026 { "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
   1027 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
   1028 },
   1029 { "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
   1030 	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
   1031 },
   1032 { "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
   1033 	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
   1034 },
   1035 { "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
   1036 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
   1037 },
   1038 { "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
   1039 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
   1040 },
   1041 { "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
   1042 	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
   1043 },
   1044 { "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
   1045 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
   1046 }
   1047 };
   1048 
   1049 /*
   1050  * start tags that imply the end of current element
         *
         * Layout: a flat sequence of groups.  Each group starts with the name
         * of a new start tag, followed by the names of the open elements that
         * this start tag closes, and ends with NULL.  One extra NULL after the
         * last group terminates the whole array.  htmlInitAutoClose() records
         * the first slot of every group in htmlStartCloseIndex and
         * htmlCheckAutoClose() walks a group to answer "does newtag close
         * oldtag?".
   1051  */
   1052 static const char * const htmlStartClose[] = {
   1053 "form",		"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
   1054 		"dl", "ul", "ol", "menu", "dir", "address", "pre",
   1055 		"listing", "xmp", "head", NULL,
   1056 "head",		"p", NULL,
   1057 "title",	"p", NULL,
   1058 "body",		"head", "style", "link", "title", "p", NULL,
   1059 "frameset",	"head", "style", "link", "title", "p", NULL,
   1060 "li",		"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
   1061 		"pre", "listing", "xmp", "head", "li", NULL,
   1062 "hr",		"p", "head", NULL,
   1063 "h1",		"p", "head", NULL,
   1064 "h2",		"p", "head", NULL,
   1065 "h3",		"p", "head", NULL,
   1066 "h4",		"p", "head", NULL,
   1067 "h5",		"p", "head", NULL,
   1068 "h6",		"p", "head", NULL,
   1069 "dir",		"p", "head", NULL,
   1070 "address",	"p", "head", "ul", NULL,
   1071 "pre",		"p", "head", "ul", NULL,
   1072 "listing",	"p", "head", NULL,
   1073 "xmp",		"p", "head", NULL,
   1074 "blockquote",	"p", "head", NULL,
   1075 "dl",		"p", "dt", "menu", "dir", "address", "pre", "listing",
   1076 		"xmp", "head", NULL,
   1077 "dt",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
   1078                 "head", "dd", NULL,
   1079 "dd",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
   1080                 "head", "dt", NULL,
   1081 "ul",		"p", "head", "ol", "menu", "dir", "address", "pre",
   1082 		"listing", "xmp", NULL,
   1083 "ol",		"p", "head", "ul", NULL,
   1084 "menu",		"p", "head", "ul", NULL,
   1085 "p",		"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
   1086 "div",		"p", "head", NULL,
   1087 "noscript",	"script", NULL,
   1088 "center",	"font", "b", "i", "p", "head", NULL,
   1089 "a",		"a", "head", NULL,
   1090 "caption",	"p", NULL,
   1091 "colgroup",	"caption", "colgroup", "col", "p", NULL,
   1092 "col",		"caption", "col", "p", NULL,
   1093 "table",	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
   1094 		"listing", "xmp", "a", NULL,
   1095 "th",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
   1096 "td",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
   1097 "tr",		"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
   1098 "thead",	"caption", "col", "colgroup", NULL,
   1099 "tfoot",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
   1100 		"tbody", "p", NULL,
   1101 "tbody",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
   1102 		"tfoot", "tbody", "p", NULL,
   1103 "optgroup",	"option", NULL,
   1104 "option",	"option", NULL,
   1105 "fieldset",	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
   1106 		"pre", "listing", "xmp", "a", NULL,
   1107 /* most tags in FONTSTYLE, PHRASE and SPECIAL should close <head> */
   1108 "tt",		"head", NULL,
   1109 "i",		"head", NULL,
   1110 "b",		"head", NULL,
   1111 "u",		"head", NULL,
   1112 "s",		"head", NULL,
   1113 "strike",	"head", NULL,
   1114 "big",		"head", NULL,
   1115 "small",	"head", NULL,
   1116 
   1117 "em",		"head", NULL,
   1118 "strong",	"head", NULL,
   1119 "dfn",		"head", NULL,
   1120 "code",		"head", NULL,
   1121 "samp",		"head", NULL,
   1122 "kbd",		"head", NULL,
   1123 "var",		"head", NULL,
   1124 "cite",		"head", NULL,
   1125 "abbr",		"head", NULL,
   1126 "acronym",	"head", NULL,
   1127 
   1128 /* "a" */
   1129 "img",		"head", NULL,
   1130 /* "applet" */
   1131 /* "embed" */
   1132 /* "object" */
   1133 "font",		"head", NULL,
   1134 /* "basefont" */
   1135 "br",		"head", NULL,
   1136 /* "script" */
   1137 "map",		"head", NULL,
   1138 "q",		"head", NULL,
   1139 "sub",		"head", NULL,
   1140 "sup",		"head", NULL,
   1141 "span",		"head", NULL,
   1142 "bdo",		"head", NULL,
   1143 "iframe",	"head", NULL,
   1144 NULL
   1145 };
   1146 
   1147 /*
   1148  * The list of HTML elements which are supposed not to have
   1149  * CDATA content and where a p element will be implied
   1150  *
   1151  * TODO: extend that list by reading the HTML SGML DTD on
   1152  *       implied paragraph
         *
         * The list is terminated by a NULL entry.
   1153  */
   1154 static const char *const htmlNoContentElements[] = {
   1155     "html",
   1156     "head",
   1157     NULL
   1158 };
   1159 
   1160 /*
   1161  * The list of HTML attributes which are of content %Script;
   1162  * NOTE: when adding ones, check htmlIsScriptAttribute() since
   1163  *       it assumes the name starts with 'on'
         *
         * Unlike the other name lists in this file, this array has no NULL
         * terminator, so its length has to be derived from sizeof.
   1164  */
   1165 static const char *const htmlScriptAttributes[] = {
   1166     "onclick",
   1167     "ondblclick",
   1168     "onmousedown",
   1169     "onmouseup",
   1170     "onmouseover",
   1171     "onmousemove",
   1172     "onmouseout",
   1173     "onkeypress",
   1174     "onkeydown",
   1175     "onkeyup",
   1176     "onload",
   1177     "onunload",
   1178     "onfocus",
   1179     "onblur",
   1180     "onsubmit",
   1181     "onreset",
   1182     "onchange",
   1183     "onselect"
   1184 };
   1185 
   1186 /*
   1187  * This table is used by the htmlparser to know what to do with
   1188  * broken html pages. By assigning different priorities to different
   1189  * elements the parser can decide how to handle extra endtags.
   1190  * Endtags are only allowed to close elements with lower or equal
   1191  * priority.
         *
         * The table ends with an entry whose name is NULL; htmlGetEndPriority()
         * returns that entry's priority for every element not listed here.
   1192  */
   1193 
   1194 typedef struct {
   1195     const char *name; /* element name, NULL in the terminating entry */
   1196     int priority; /* higher values resist being closed by other end tags */
   1197 } elementPriority;
   1198 
   1199 static const elementPriority htmlEndPriority[] = {
   1200     {"div",   150},
   1201     {"td",    160},
   1202     {"th",    160},
   1203     {"tr",    170},
   1204     {"thead", 180},
   1205     {"tbody", 180},
   1206     {"tfoot", 180},
   1207     {"table", 190},
   1208     {"head",  200},
   1209     {"body",  200},
   1210     {"html",  220},
   1211     {NULL,    100} /* Default priority */
   1212 };
   1213 
   /*
    * Pointers to the first slot of each group of htmlStartClose, filled in
    * by htmlInitAutoClose().  At most 99 slots are used, so the array always
    * ends with at least one NULL entry.
    */
   1214 static const char** htmlStartCloseIndex[100];
   /* Non-zero once htmlInitAutoClose() has built htmlStartCloseIndex. */
   1215 static int htmlStartCloseIndexinitialized = 0;
   1216 
   1217 /************************************************************************
   1218  *									*
   1219  *	functions to handle HTML specific data			*
   1220  *									*
   1221  ************************************************************************/
   1222 
   1223 /**
   1224  * htmlInitAutoClose:
   1225  *
   1226  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
   1227  * This is not reentrant. Call xmlInitParser() once before processing in
   1228  * case of use in multithreaded programs.
   1229  */
   1230 void
   1231 htmlInitAutoClose(void) {
   1232     int indx, i = 0;
   1233 
   1234     if (htmlStartCloseIndexinitialized) return;
   1235 
   1236     for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
   1237     indx = 0;
   1238     while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
   1239         htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
   1240 	while (htmlStartClose[i] != NULL) i++;
   1241 	i++;
   1242     }
   1243     htmlStartCloseIndexinitialized = 1;
   1244 }
   1245 
   1246 /**
   1247  * htmlTagLookup:
   1248  * @tag:  The tag name in lowercase
   1249  *
   1250  * Lookup the HTML tag in the ElementTable
   1251  *
   1252  * Returns the related htmlElemDescPtr or NULL if not found.
   1253  */
   1254 const htmlElemDesc *
   1255 htmlTagLookup(const xmlChar *tag) {
   1256     unsigned int i;
   1257 
   1258     for (i = 0; i < (sizeof(html40ElementTable) /
   1259                      sizeof(html40ElementTable[0]));i++) {
   1260         if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
   1261 	    return((htmlElemDescPtr) &html40ElementTable[i]);
   1262     }
   1263     return(NULL);
   1264 }
   1265 
   1266 /**
   1267  * htmlGetEndPriority:
   1268  * @name: The name of the element to look up the priority for.
   1269  *
   1270  * Return value: The "endtag" priority.
   1271  **/
   1272 static int
   1273 htmlGetEndPriority (const xmlChar *name) {
   1274     int i = 0;
   1275 
   1276     while ((htmlEndPriority[i].name != NULL) &&
   1277 	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
   1278 	i++;
   1279 
   1280     return(htmlEndPriority[i].priority);
   1281 }
   1282 
   1283 
   1284 /**
   1285  * htmlCheckAutoClose:
   1286  * @newtag:  The new tag name
   1287  * @oldtag:  The old tag name
   1288  *
   1289  * Checks whether the new tag is one of the registered valid tags for
   1290  * closing old.
   1291  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
   1292  *
   1293  * Returns 0 if no, 1 if yes.
   1294  */
   1295 static int
   1296 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
   1297 {
   1298     int i, indx;
   1299     const char **closed = NULL;
   1300 
   1301     if (htmlStartCloseIndexinitialized == 0)
   1302         htmlInitAutoClose();
   1303 
   1304     /* inefficient, but not a big deal */
   1305     for (indx = 0; indx < 100; indx++) {
   1306         closed = htmlStartCloseIndex[indx];
   1307         if (closed == NULL)
   1308             return (0);
   1309         if (xmlStrEqual(BAD_CAST * closed, newtag))
   1310             break;
   1311     }
   1312 
   1313     i = closed - htmlStartClose;
   1314     i++;
   1315     while (htmlStartClose[i] != NULL) {
   1316         if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
   1317             return (1);
   1318         }
   1319         i++;
   1320     }
   1321     return (0);
   1322 }
   1323 
   1324 /**
   1325  * htmlAutoCloseOnClose:
   1326  * @ctxt:  an HTML parser context
   1327  * @newtag:  The new tag name
   1328  * @force:  force the tag closure
   1329  *
   1330  * The HTML DTD allows an ending tag to implicitly close other tags.
   1331  */
   1332 static void
   1333 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
   1334 {
   1335     const htmlElemDesc *info;
   1336     int i, priority;
   1337 
   1338     priority = htmlGetEndPriority(newtag);
   1339 
   1340     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
   1341 
   1342         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
   1343             break;
   1344         /*
   1345          * A missplaced endtag can only close elements with lower
   1346          * or equal priority, so if we find an element with higher
   1347          * priority before we find an element with
   1348          * matching name, we just ignore this endtag
   1349          */
   1350         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
   1351             return;
   1352     }
   1353     if (i < 0)
   1354         return;
   1355 
   1356     while (!xmlStrEqual(newtag, ctxt->name)) {
   1357         info = htmlTagLookup(ctxt->name);
   1358         if ((info != NULL) && (info->endTag == 3)) {
   1359             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
   1360 	                 "Opening and ending tag mismatch: %s and %s\n",
   1361 			 newtag, ctxt->name);
   1362         }
   1363         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   1364             ctxt->sax->endElement(ctxt->userData, ctxt->name);
   1365 	htmlnamePop(ctxt);
   1366     }
   1367 }
   1368 
   1369 /**
   1370  * htmlAutoCloseOnEnd:
   1371  * @ctxt:  an HTML parser context
   1372  *
   1373  * Close all remaining tags at the end of the stream
   1374  */
   1375 static void
   1376 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
   1377 {
   1378     int i;
   1379 
   1380     if (ctxt->nameNr == 0)
   1381         return;
   1382     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
   1383         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   1384             ctxt->sax->endElement(ctxt->userData, ctxt->name);
   1385 	htmlnamePop(ctxt);
   1386     }
   1387 }
   1388 
   1389 /**
   1390  * htmlAutoClose:
   1391  * @ctxt:  an HTML parser context
   1392  * @newtag:  The new tag name or NULL
   1393  *
   1394  * The HTML DTD allows a tag to implicitly close other tags.
   1395  * The list is kept in htmlStartClose array. This function is
   1396  * called when a new tag has been detected and generates the
   1397  * appropriates closes if possible/needed.
   1398  * If newtag is NULL this mean we are at the end of the resource
   1399  * and we should check
   1400  */
   1401 static void
   1402 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
   1403 {
   1404     while ((newtag != NULL) && (ctxt->name != NULL) &&
   1405            (htmlCheckAutoClose(newtag, ctxt->name))) {
   1406         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   1407             ctxt->sax->endElement(ctxt->userData, ctxt->name);
   1408 	htmlnamePop(ctxt);
   1409     }
   1410     if (newtag == NULL) {
   1411         htmlAutoCloseOnEnd(ctxt);
   1412         return;
   1413     }
   1414     while ((newtag == NULL) && (ctxt->name != NULL) &&
   1415            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
   1416             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
   1417             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
   1418         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   1419             ctxt->sax->endElement(ctxt->userData, ctxt->name);
   1420 	htmlnamePop(ctxt);
   1421     }
   1422 }
   1423 
   1424 /**
   1425  * htmlAutoCloseTag:
   1426  * @doc:  the HTML document
   1427  * @name:  The tag name
   1428  * @elem:  the HTML element
   1429  *
   1430  * The HTML DTD allows a tag to implicitly close other tags.
   1431  * The list is kept in htmlStartClose array. This function checks
   1432  * if the element or one of it's children would autoclose the
   1433  * given tag.
   1434  *
   1435  * Returns 1 if autoclose, 0 otherwise
   1436  */
   1437 int
   1438 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
   1439     htmlNodePtr child;
   1440 
   1441     if (elem == NULL) return(1);
   1442     if (xmlStrEqual(name, elem->name)) return(0);
   1443     if (htmlCheckAutoClose(elem->name, name)) return(1);
   1444     child = elem->children;
   1445     while (child != NULL) {
   1446         if (htmlAutoCloseTag(doc, name, child)) return(1);
   1447 	child = child->next;
   1448     }
   1449     return(0);
   1450 }
   1451 
   1452 /**
   1453  * htmlIsAutoClosed:
   1454  * @doc:  the HTML document
   1455  * @elem:  the HTML element
   1456  *
   1457  * The HTML DTD allows a tag to implicitly close other tags.
   1458  * The list is kept in htmlStartClose array. This function checks
   1459  * if a tag is autoclosed by one of it's child
   1460  *
   1461  * Returns 1 if autoclosed, 0 otherwise
   1462  */
   1463 int
   1464 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
   1465     htmlNodePtr child;
   1466 
   1467     if (elem == NULL) return(1);
   1468     child = elem->children;
   1469     while (child != NULL) {
   1470 	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
   1471 	child = child->next;
   1472     }
   1473     return(0);
   1474 }
   1475 
   1476 /**
   1477  * htmlCheckImplied:
   1478  * @ctxt:  an HTML parser context
   1479  * @newtag:  The new tag name
   1480  *
   1481  * The HTML DTD allows a tag to exists only implicitly
   1482  * called when a new tag has been detected and generates the
   1483  * appropriates implicit tags if missing
   1484  */
   1485 static void
   1486 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
   1487     int i;
   1488 
   1489     if (ctxt->options & HTML_PARSE_NOIMPLIED)
   1490         return;
   1491     if (!htmlOmittedDefaultValue)
   1492 	return;
   1493     if (xmlStrEqual(newtag, BAD_CAST"html"))
   1494 	return;
   1495     if (ctxt->nameNr <= 0) {
   1496 	htmlnamePush(ctxt, BAD_CAST"html");
   1497 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1498 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
   1499     }
   1500     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
   1501         return;
   1502     if ((ctxt->nameNr <= 1) &&
   1503         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
   1504 	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
   1505 	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
   1506 	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
   1507 	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
   1508 	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
   1509         if (ctxt->html >= 3) {
   1510             /* we already saw or generated an <head> before */
   1511             return;
   1512         }
   1513         /*
   1514          * dropped OBJECT ... i you put it first BODY will be
   1515          * assumed !
   1516          */
   1517         htmlnamePush(ctxt, BAD_CAST"head");
   1518         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1519             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
   1520     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
   1521 	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
   1522 	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
   1523         if (ctxt->html >= 10) {
   1524             /* we already saw or generated a <body> before */
   1525             return;
   1526         }
   1527 	for (i = 0;i < ctxt->nameNr;i++) {
   1528 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
   1529 		return;
   1530 	    }
   1531 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
   1532 		return;
   1533 	    }
   1534 	}
   1535 
   1536 	htmlnamePush(ctxt, BAD_CAST"body");
   1537 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1538 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
   1539     }
   1540 }
   1541 
   1542 /**
   1543  * htmlCheckParagraph
   1544  * @ctxt:  an HTML parser context
   1545  *
   1546  * Check whether a p element need to be implied before inserting
   1547  * characters in the current element.
   1548  *
   1549  * Returns 1 if a paragraph has been inserted, 0 if not and -1
   1550  *         in case of error.
   1551  */
   1552 
   1553 static int
   1554 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
   1555     const xmlChar *tag;
   1556     int i;
   1557 
   1558     if (ctxt == NULL)
   1559 	return(-1);
   1560     tag = ctxt->name;
   1561     if (tag == NULL) {
   1562 	htmlAutoClose(ctxt, BAD_CAST"p");
   1563 	htmlCheckImplied(ctxt, BAD_CAST"p");
   1564 	htmlnamePush(ctxt, BAD_CAST"p");
   1565 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1566 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
   1567 	return(1);
   1568     }
   1569     if (!htmlOmittedDefaultValue)
   1570 	return(0);
   1571     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
   1572 	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
   1573 	    htmlAutoClose(ctxt, BAD_CAST"p");
   1574 	    htmlCheckImplied(ctxt, BAD_CAST"p");
   1575 	    htmlnamePush(ctxt, BAD_CAST"p");
   1576 	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1577 		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
   1578 	    return(1);
   1579 	}
   1580     }
   1581     return(0);
   1582 }
   1583 
   1584 /**
   1585  * htmlIsScriptAttribute:
   1586  * @name:  an attribute name
   1587  *
   1588  * Check if an attribute is of content type Script
   1589  *
   1590  * Returns 1 is the attribute is a script 0 otherwise
   1591  */
   1592 int
   1593 htmlIsScriptAttribute(const xmlChar *name) {
   1594     unsigned int i;
   1595 
   1596     if (name == NULL)
   1597       return(0);
   1598     /*
   1599      * all script attributes start with 'on'
   1600      */
   1601     if ((name[0] != 'o') || (name[1] != 'n'))
   1602       return(0);
   1603     for (i = 0;
   1604 	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
   1605 	 i++) {
   1606 	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
   1607 	    return(1);
   1608     }
   1609     return(0);
   1610 }
   1611 
   1612 /************************************************************************
   1613  *									*
   1614  *	The list of HTML predefined entities			*
   1615  *									*
   1616  ************************************************************************/
   1617 
   1618 
   1619 static const htmlEntityDesc  html40EntitiesTable[] = {
   1620 /*
   1621  * the 4 absolute ones, plus apostrophe.
   1622  */
   1623 { 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
   1624 { 38,	"amp",	"ampersand, U+0026 ISOnum" },
   1625 { 39,	"apos",	"single quote" },
   1626 { 60,	"lt",	"less-than sign, U+003C ISOnum" },
   1627 { 62,	"gt",	"greater-than sign, U+003E ISOnum" },
   1628 
   1629 /*
   1630  * A bunch still in the 128-255 range
   1631  * Replacing them depend really on the charset used.
   1632  */
   1633 { 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
   1634 { 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
   1635 { 162,	"cent",	"cent sign, U+00A2 ISOnum" },
   1636 { 163,	"pound","pound sign, U+00A3 ISOnum" },
   1637 { 164,	"curren","currency sign, U+00A4 ISOnum" },
   1638 { 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
   1639 { 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
   1640 { 167,	"sect",	"section sign, U+00A7 ISOnum" },
   1641 { 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
   1642 { 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
   1643 { 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
   1644 { 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
   1645 { 172,	"not",	"not sign, U+00AC ISOnum" },
   1646 { 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
   1647 { 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
   1648 { 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
   1649 { 176,	"deg",	"degree sign, U+00B0 ISOnum" },
   1650 { 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
   1651 { 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
   1652 { 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
   1653 { 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
   1654 { 181,	"micro","micro sign, U+00B5 ISOnum" },
   1655 { 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
   1656 { 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
   1657 { 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
   1658 { 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
   1659 { 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
   1660 { 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
   1661 { 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
   1662 { 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
   1663 { 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
   1664 { 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
   1665 { 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
   1666 { 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
   1667 { 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
   1668 { 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
   1669 { 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
   1670 { 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
   1671 { 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
   1672 { 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
   1673 { 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
   1674 { 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
   1675 { 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
   1676 { 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
   1677 { 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
   1678 { 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
   1679 { 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
   1680 { 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
   1681 { 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
   1682 { 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
   1683 { 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
   1684 { 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
   1685 { 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
   1686 { 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
   1687 { 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
   1688 { 215,	"times","multiplication sign, U+00D7 ISOnum" },
   1689 { 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
   1690 { 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
   1691 { 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
   1692 { 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
   1693 { 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
   1694 { 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
   1695 { 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
   1696 { 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
   1697 { 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
   1698 { 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
   1699 { 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
   1700 { 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
   1701 { 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
   1702 { 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
   1703 { 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
   1704 { 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
   1705 { 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
   1706 { 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
   1707 { 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
   1708 { 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
   1709 { 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
   1710 { 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
   1711 { 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
   1712 { 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
   1713 { 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
   1714 { 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
   1715 { 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
   1716 { 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
   1717 { 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
   1718 { 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
   1719 { 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
   1720 { 247,	"divide","division sign, U+00F7 ISOnum" },
   1721 { 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
   1722 { 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
   1723 { 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
   1724 { 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
   1725 { 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
   1726 { 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
   1727 { 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
   1728 { 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },
   1729 
   1730 { 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
   1731 { 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
   1732 { 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
   1733 { 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
   1734 { 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },
   1735 
   1736 /*
   1737  * Anything below should really be kept as entities references
   1738  */
   1739 { 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },
   1740 
   1741 { 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
   1742 { 732,	"tilde","small tilde, U+02DC ISOdia" },
   1743 
   1744 { 913,	"Alpha","greek capital letter alpha, U+0391" },
   1745 { 914,	"Beta",	"greek capital letter beta, U+0392" },
   1746 { 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
   1747 { 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
   1748 { 917,	"Epsilon","greek capital letter epsilon, U+0395" },
   1749 { 918,	"Zeta",	"greek capital letter zeta, U+0396" },
   1750 { 919,	"Eta",	"greek capital letter eta, U+0397" },
   1751 { 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
   1752 { 921,	"Iota",	"greek capital letter iota, U+0399" },
   1753 { 922,	"Kappa","greek capital letter kappa, U+039A" },
   1754 { 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
   1755 { 924,	"Mu",	"greek capital letter mu, U+039C" },
   1756 { 925,	"Nu",	"greek capital letter nu, U+039D" },
   1757 { 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
   1758 { 927,	"Omicron","greek capital letter omicron, U+039F" },
   1759 { 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
   1760 { 929,	"Rho",	"greek capital letter rho, U+03A1" },
   1761 { 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
   1762 { 932,	"Tau",	"greek capital letter tau, U+03A4" },
   1763 { 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
   1764 { 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
   1765 { 935,	"Chi",	"greek capital letter chi, U+03A7" },
   1766 { 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
   1767 { 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },
   1768 
   1769 { 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
   1770 { 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
   1771 { 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
   1772 { 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
   1773 { 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
   1774 { 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
   1775 { 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
   1776 { 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
   1777 { 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
   1778 { 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
   1779 { 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
   1780 { 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
   1781 { 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
   1782 { 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
   1783 { 959,	"omicron","greek small letter omicron, U+03BF NEW" },
   1784 { 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
   1785 { 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
   1786 { 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
   1787 { 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
   1788 { 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
   1789 { 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
   1790 { 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
   1791 { 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
   1792 { 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
   1793 { 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
   1794 { 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
   1795 { 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
   1796 { 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },
   1797 
   1798 { 8194,	"ensp",	"en space, U+2002 ISOpub" },
   1799 { 8195,	"emsp",	"em space, U+2003 ISOpub" },
   1800 { 8201,	"thinsp","thin space, U+2009 ISOpub" },
   1801 { 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
   1802 { 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
   1803 { 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
   1804 { 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
   1805 { 8211,	"ndash","en dash, U+2013 ISOpub" },
   1806 { 8212,	"mdash","em dash, U+2014 ISOpub" },
   1807 { 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
   1808 { 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
   1809 { 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
   1810 { 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
   1811 { 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
   1812 { 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
   1813 { 8224,	"dagger","dagger, U+2020 ISOpub" },
   1814 { 8225,	"Dagger","double dagger, U+2021 ISOpub" },
   1815 
   1816 { 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
   1817 { 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
   1818 
   1819 { 8240,	"permil","per mille sign, U+2030 ISOtech" },
   1820 
   1821 { 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
   1822 { 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },
   1823 
   1824 { 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
   1825 { 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
   1826 
   1827 { 8254,	"oline","overline = spacing overscore, U+203E NEW" },
   1828 { 8260,	"frasl","fraction slash, U+2044 NEW" },
   1829 
   1830 { 8364,	"euro",	"euro sign, U+20AC NEW" },
   1831 
   1832 { 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
   1833 { 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
   1834 { 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
   1835 { 8482,	"trade","trade mark sign, U+2122 ISOnum" },
   1836 { 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
   1837 { 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
   1838 { 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
   1839 { 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
   1840 { 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
   1841 { 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
   1842 { 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
   1843 { 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
   1844 { 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
   1845 { 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
   1846 { 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
   1847 { 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },
   1848 
   1849 { 8704,	"forall","for all, U+2200 ISOtech" },
   1850 { 8706,	"part",	"partial differential, U+2202 ISOtech" },
   1851 { 8707,	"exist","there exists, U+2203 ISOtech" },
   1852 { 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
   1853 { 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
   1854 { 8712,	"isin",	"element of, U+2208 ISOtech" },
   1855 { 8713,	"notin","not an element of, U+2209 ISOtech" },
   1856 { 8715,	"ni",	"contains as member, U+220B ISOtech" },
   1857 { 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
   1858 { 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
   1859 { 8722,	"minus","minus sign, U+2212 ISOtech" },
   1860 { 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
   1861 { 8730,	"radic","square root = radical sign, U+221A ISOtech" },
   1862 { 8733,	"prop",	"proportional to, U+221D ISOtech" },
   1863 { 8734,	"infin","infinity, U+221E ISOtech" },
   1864 { 8736,	"ang",	"angle, U+2220 ISOamso" },
   1865 { 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
   1866 { 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
   1867 { 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
   1868 { 8746,	"cup",	"union = cup, U+222A ISOtech" },
   1869 { 8747,	"int",	"integral, U+222B ISOtech" },
   1870 { 8756,	"there4","therefore, U+2234 ISOtech" },
   1871 { 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
   1872 { 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
   1873 { 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
   1874 { 8800,	"ne",	"not equal to, U+2260 ISOtech" },
   1875 { 8801,	"equiv","identical to, U+2261 ISOtech" },
   1876 { 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
   1877 { 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
   1878 { 8834,	"sub",	"subset of, U+2282 ISOtech" },
   1879 { 8835,	"sup",	"superset of, U+2283 ISOtech" },
   1880 { 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
   1881 { 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
   1882 { 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
   1883 { 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
   1884 { 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
   1885 { 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
   1886 { 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
   1887 { 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
   1888 { 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
   1889 { 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
   1890 { 8971,	"rfloor","right floor, U+230B ISOamsc" },
   1891 { 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
   1892 { 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
   1893 { 9674,	"loz",	"lozenge, U+25CA ISOpub" },
   1894 
   1895 { 9824,	"spades","black spade suit, U+2660 ISOpub" },
   1896 { 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
   1897 { 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
   1898 { 9830,	"diams","black diamond suit, U+2666 ISOpub" },
   1899 
   1900 };
   1901 
   1902 /************************************************************************
   1903  *									*
   1904  *		Commodity functions to handle entities			*
   1905  *									*
   1906  ************************************************************************/
   1907 
   1908 /*
   1909  * Macro used to grow the current buffer.
   1910  */
   1911 #define growBuffer(buffer) {						\
   1912     xmlChar *tmp;							\
   1913     buffer##_size *= 2;							\
   1914     tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
   1915     if (tmp == NULL) {						\
   1916 	htmlErrMemory(ctxt, "growing buffer\n");			\
   1917 	xmlFree(buffer);						\
   1918 	return(NULL);							\
   1919     }									\
   1920     buffer = tmp;							\
   1921 }
   1922 
   1923 /**
   1924  * htmlEntityLookup:
   1925  * @name: the entity name
   1926  *
   1927  * Lookup the given entity in EntitiesTable
   1928  *
   1929  * TODO: the linear scan is really ugly, an hash table is really needed.
   1930  *
   1931  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
   1932  */
   1933 const htmlEntityDesc *
   1934 htmlEntityLookup(const xmlChar *name) {
   1935     unsigned int i;
   1936 
   1937     for (i = 0;i < (sizeof(html40EntitiesTable)/
   1938                     sizeof(html40EntitiesTable[0]));i++) {
   1939         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
   1940             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
   1941 	}
   1942     }
   1943     return(NULL);
   1944 }
   1945 
   1946 /**
   1947  * htmlEntityValueLookup:
   1948  * @value: the entity's unicode value
   1949  *
   1950  * Lookup the given entity in EntitiesTable
   1951  *
   1952  * TODO: the linear scan is really ugly, an hash table is really needed.
   1953  *
   1954  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
   1955  */
   1956 const htmlEntityDesc *
   1957 htmlEntityValueLookup(unsigned int value) {
   1958     unsigned int i;
   1959 
   1960     for (i = 0;i < (sizeof(html40EntitiesTable)/
   1961                     sizeof(html40EntitiesTable[0]));i++) {
   1962         if (html40EntitiesTable[i].value >= value) {
   1963 	    if (html40EntitiesTable[i].value > value)
   1964 		break;
   1965             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
   1966 	}
   1967     }
   1968     return(NULL);
   1969 }
   1970 
   1971 /**
   1972  * UTF8ToHtml:
   1973  * @out:  a pointer to an array of bytes to store the result
   1974  * @outlen:  the length of @out
   1975  * @in:  a pointer to an array of UTF-8 chars
   1976  * @inlen:  the length of @in
   1977  *
   1978  * Take a block of UTF-8 chars in and try to convert it to an ASCII
   1979  * plus HTML entities block of chars out.
   1980  *
   1981  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
   1982  * The value of @inlen after return is the number of octets consumed
   1983  *     as the return value is positive, else unpredictable.
   1984  * The value of @outlen after return is the number of octets consumed.
   1985  */
   1986 int
   1987 UTF8ToHtml(unsigned char* out, int *outlen,
   1988               const unsigned char* in, int *inlen) {
   1989     const unsigned char* processed = in;
   1990     const unsigned char* outend;
   1991     const unsigned char* outstart = out;
   1992     const unsigned char* instart = in;
   1993     const unsigned char* inend;
   1994     unsigned int c, d;
   1995     int trailing;
   1996 
   1997     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
   1998     if (in == NULL) {
   1999         /*
   2000 	 * initialization nothing to do
   2001 	 */
   2002 	*outlen = 0;
   2003 	*inlen = 0;
   2004 	return(0);
   2005     }
   2006     inend = in + (*inlen);
   2007     outend = out + (*outlen);
   2008     while (in < inend) {
   2009 	d = *in++;
   2010 	if      (d < 0x80)  { c= d; trailing= 0; }
   2011 	else if (d < 0xC0) {
   2012 	    /* trailing byte in leading position */
   2013 	    *outlen = out - outstart;
   2014 	    *inlen = processed - instart;
   2015 	    return(-2);
   2016         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
   2017         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
   2018         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
   2019 	else {
   2020 	    /* no chance for this in Ascii */
   2021 	    *outlen = out - outstart;
   2022 	    *inlen = processed - instart;
   2023 	    return(-2);
   2024 	}
   2025 
   2026 	if (inend - in < trailing) {
   2027 	    break;
   2028 	}
   2029 
   2030 	for ( ; trailing; trailing--) {
   2031 	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
   2032 		break;
   2033 	    c <<= 6;
   2034 	    c |= d & 0x3F;
   2035 	}
   2036 
   2037 	/* assertion: c is a single UTF-4 value */
   2038 	if (c < 0x80) {
   2039 	    if (out + 1 >= outend)
   2040 		break;
   2041 	    *out++ = c;
   2042 	} else {
   2043 	    int len;
   2044 	    const htmlEntityDesc * ent;
   2045 	    const char *cp;
   2046 	    char nbuf[16];
   2047 
   2048 	    /*
   2049 	     * Try to lookup a predefined HTML entity for it
   2050 	     */
   2051 
   2052 	    ent = htmlEntityValueLookup(c);
   2053 	    if (ent == NULL) {
   2054 	      snprintf(nbuf, sizeof(nbuf), "#%u", c);
   2055 	      cp = nbuf;
   2056 	    }
   2057 	    else
   2058 	      cp = ent->name;
   2059 	    len = strlen(cp);
   2060 	    if (out + 2 + len >= outend)
   2061 		break;
   2062 	    *out++ = '&';
   2063 	    memcpy(out, cp, len);
   2064 	    out += len;
   2065 	    *out++ = ';';
   2066 	}
   2067 	processed = in;
   2068     }
   2069     *outlen = out - outstart;
   2070     *inlen = processed - instart;
   2071     return(0);
   2072 }
   2073 
   2074 /**
   2075  * htmlEncodeEntities:
   2076  * @out:  a pointer to an array of bytes to store the result
   2077  * @outlen:  the length of @out
   2078  * @in:  a pointer to an array of UTF-8 chars
   2079  * @inlen:  the length of @in
   2080  * @quoteChar: the quote character to escape (' or ") or zero.
   2081  *
   2082  * Take a block of UTF-8 chars in and try to convert it to an ASCII
   2083  * plus HTML entities block of chars out.
   2084  *
   2085  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
   2086  * The value of @inlen after return is the number of octets consumed
   2087  *     as the return value is positive, else unpredictable.
   2088  * The value of @outlen after return is the number of octets consumed.
   2089  */
   2090 int
   2091 htmlEncodeEntities(unsigned char* out, int *outlen,
   2092 		   const unsigned char* in, int *inlen, int quoteChar) {
   2093     const unsigned char* processed = in;
   2094     const unsigned char* outend;
   2095     const unsigned char* outstart = out;
   2096     const unsigned char* instart = in;
   2097     const unsigned char* inend;
   2098     unsigned int c, d;
   2099     int trailing;
   2100 
   2101     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
   2102         return(-1);
   2103     outend = out + (*outlen);
   2104     inend = in + (*inlen);
   2105     while (in < inend) {
   2106 	d = *in++;
   2107 	if      (d < 0x80)  { c= d; trailing= 0; }
   2108 	else if (d < 0xC0) {
   2109 	    /* trailing byte in leading position */
   2110 	    *outlen = out - outstart;
   2111 	    *inlen = processed - instart;
   2112 	    return(-2);
   2113         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
   2114         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
   2115         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
   2116 	else {
   2117 	    /* no chance for this in Ascii */
   2118 	    *outlen = out - outstart;
   2119 	    *inlen = processed - instart;
   2120 	    return(-2);
   2121 	}
   2122 
   2123 	if (inend - in < trailing)
   2124 	    break;
   2125 
   2126 	while (trailing--) {
   2127 	    if (((d= *in++) & 0xC0) != 0x80) {
   2128 		*outlen = out - outstart;
   2129 		*inlen = processed - instart;
   2130 		return(-2);
   2131 	    }
   2132 	    c <<= 6;
   2133 	    c |= d & 0x3F;
   2134 	}
   2135 
   2136 	/* assertion: c is a single UTF-4 value */
   2137 	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
   2138 	    (c != '&') && (c != '<') && (c != '>')) {
   2139 	    if (out >= outend)
   2140 		break;
   2141 	    *out++ = c;
   2142 	} else {
   2143 	    const htmlEntityDesc * ent;
   2144 	    const char *cp;
   2145 	    char nbuf[16];
   2146 	    int len;
   2147 
   2148 	    /*
   2149 	     * Try to lookup a predefined HTML entity for it
   2150 	     */
   2151 	    ent = htmlEntityValueLookup(c);
   2152 	    if (ent == NULL) {
   2153 		snprintf(nbuf, sizeof(nbuf), "#%u", c);
   2154 		cp = nbuf;
   2155 	    }
   2156 	    else
   2157 		cp = ent->name;
   2158 	    len = strlen(cp);
   2159 	    if (out + 2 + len > outend)
   2160 		break;
   2161 	    *out++ = '&';
   2162 	    memcpy(out, cp, len);
   2163 	    out += len;
   2164 	    *out++ = ';';
   2165 	}
   2166 	processed = in;
   2167     }
   2168     *outlen = out - outstart;
   2169     *inlen = processed - instart;
   2170     return(0);
   2171 }
   2172 
   2173 /************************************************************************
   2174  *									*
   2175  *		Commodity functions to handle streams			*
   2176  *									*
   2177  ************************************************************************/
   2178 
   2179 /**
   2180  * htmlNewInputStream:
   2181  * @ctxt:  an HTML parser context
   2182  *
   2183  * Create a new input stream structure
   2184  * Returns the new input stream or NULL
   2185  */
   2186 static htmlParserInputPtr
   2187 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
   2188     htmlParserInputPtr input;
   2189 
   2190     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
   2191     if (input == NULL) {
   2192         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
   2193 	return(NULL);
   2194     }
   2195     memset(input, 0, sizeof(htmlParserInput));
   2196     input->filename = NULL;
   2197     input->directory = NULL;
   2198     input->base = NULL;
   2199     input->cur = NULL;
   2200     input->buf = NULL;
   2201     input->line = 1;
   2202     input->col = 1;
   2203     input->buf = NULL;
   2204     input->free = NULL;
   2205     input->version = NULL;
   2206     input->consumed = 0;
   2207     input->length = 0;
   2208     return(input);
   2209 }
   2210 
   2211 
   2212 /************************************************************************
   2213  *									*
   2214  *		Commodity functions, cleanup needed ?			*
   2215  *									*
   2216  ************************************************************************/
   2217 /*
   2218  * all tags allowing pc data from the html 4.01 loose dtd
   2219  * NOTE: it might be more apropriate to integrate this information
   2220  * into the html40ElementTable array but I don't want to risk any
   2221  * binary incomptibility
   2222  */
   2223 static const char *allowPCData[] = {
   2224     "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
   2225     "blockquote", "body", "button", "caption", "center", "cite", "code",
   2226     "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
   2227     "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
   2228     "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
   2229     "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
   2230 };
   2231 
   2232 /**
   2233  * areBlanks:
   2234  * @ctxt:  an HTML parser context
   2235  * @str:  a xmlChar *
   2236  * @len:  the size of @str
   2237  *
   2238  * Is this a sequence of blank chars that one can ignore ?
   2239  *
   2240  * Returns 1 if ignorable 0 otherwise.
   2241  */
   2242 
   2243 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
   2244     unsigned int i;
   2245     int j;
   2246     xmlNodePtr lastChild;
   2247     xmlDtdPtr dtd;
   2248 
   2249     for (j = 0;j < len;j++)
   2250         if (!(IS_BLANK_CH(str[j]))) return(0);
   2251 
   2252     if (CUR == 0) return(1);
   2253     if (CUR != '<') return(0);
   2254     if (ctxt->name == NULL)
   2255 	return(1);
   2256     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
   2257 	return(1);
   2258     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
   2259 	return(1);
   2260 
   2261     /* Only strip CDATA children of the body tag for strict HTML DTDs */
   2262     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
   2263         dtd = xmlGetIntSubset(ctxt->myDoc);
   2264         if (dtd != NULL && dtd->ExternalID != NULL) {
   2265             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
   2266                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
   2267                 return(1);
   2268         }
   2269     }
   2270 
   2271     if (ctxt->node == NULL) return(0);
   2272     lastChild = xmlGetLastChild(ctxt->node);
   2273     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
   2274 	lastChild = lastChild->prev;
   2275     if (lastChild == NULL) {
   2276         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
   2277             (ctxt->node->content != NULL)) return(0);
   2278 	/* keep ws in constructs like ...<b> </b>...
   2279 	   for all tags "b" allowing PCDATA */
   2280 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
   2281 	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
   2282 		return(0);
   2283 	    }
   2284 	}
   2285     } else if (xmlNodeIsText(lastChild)) {
   2286         return(0);
   2287     } else {
   2288 	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
   2289 	   for all tags "p" allowing PCDATA */
   2290 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
   2291 	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
   2292 		return(0);
   2293 	    }
   2294 	}
   2295     }
   2296     return(1);
   2297 }
   2298 
   2299 /**
   2300  * htmlNewDocNoDtD:
   2301  * @URI:  URI for the dtd, or NULL
   2302  * @ExternalID:  the external ID of the DTD, or NULL
   2303  *
   2304  * Creates a new HTML document without a DTD node if @URI and @ExternalID
   2305  * are NULL
   2306  *
   2307  * Returns a new document, do not initialize the DTD if not provided
   2308  */
   2309 htmlDocPtr
   2310 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
   2311     xmlDocPtr cur;
   2312 
   2313     /*
   2314      * Allocate a new document and fill the fields.
   2315      */
   2316     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
   2317     if (cur == NULL) {
   2318 	htmlErrMemory(NULL, "HTML document creation failed\n");
   2319 	return(NULL);
   2320     }
   2321     memset(cur, 0, sizeof(xmlDoc));
   2322 
   2323     cur->type = XML_HTML_DOCUMENT_NODE;
   2324     cur->version = NULL;
   2325     cur->intSubset = NULL;
   2326     cur->doc = cur;
   2327     cur->name = NULL;
   2328     cur->children = NULL;
   2329     cur->extSubset = NULL;
   2330     cur->oldNs = NULL;
   2331     cur->encoding = NULL;
   2332     cur->standalone = 1;
   2333     cur->compression = 0;
   2334     cur->ids = NULL;
   2335     cur->refs = NULL;
   2336     cur->_private = NULL;
   2337     cur->charset = XML_CHAR_ENCODING_UTF8;
   2338     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
   2339     if ((ExternalID != NULL) ||
   2340 	(URI != NULL))
   2341 	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
   2342     return(cur);
   2343 }
   2344 
   2345 /**
   2346  * htmlNewDoc:
   2347  * @URI:  URI for the dtd, or NULL
   2348  * @ExternalID:  the external ID of the DTD, or NULL
   2349  *
   2350  * Creates a new HTML document
   2351  *
   2352  * Returns a new document
   2353  */
   2354 htmlDocPtr
   2355 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
   2356     if ((URI == NULL) && (ExternalID == NULL))
   2357 	return(htmlNewDocNoDtD(
   2358 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
   2359 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
   2360 
   2361     return(htmlNewDocNoDtD(URI, ExternalID));
   2362 }
   2363 
   2364 
   2365 /************************************************************************
   2366  *									*
   2367  *			The parser itself				*
   2368  *	Relates to http://www.w3.org/TR/html40				*
   2369  *									*
   2370  ************************************************************************/
   2371 
   2372 /************************************************************************
   2373  *									*
   2374  *			The parser itself				*
   2375  *									*
   2376  ************************************************************************/
   2377 
   2378 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
   2379 
   2380 /**
   2381  * htmlParseHTMLName:
   2382  * @ctxt:  an HTML parser context
   2383  *
   2384  * parse an HTML tag or attribute name, note that we convert it to lowercase
   2385  * since HTML names are not case-sensitive.
   2386  *
   2387  * Returns the Tag Name parsed or NULL
   2388  */
   2389 
   2390 static const xmlChar *
   2391 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
   2392     int i = 0;
   2393     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
   2394 
   2395     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
   2396         (CUR != ':') && (CUR != '.')) return(NULL);
   2397 
   2398     while ((i < HTML_PARSER_BUFFER_SIZE) &&
   2399            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
   2400 	   (CUR == ':') || (CUR == '-') || (CUR == '_') ||
   2401            (CUR == '.'))) {
   2402 	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
   2403         else loc[i] = CUR;
   2404 	i++;
   2405 
   2406 	NEXT;
   2407     }
   2408 
   2409     return(xmlDictLookup(ctxt->dict, loc, i));
   2410 }
   2411 
   2412 
   2413 /**
   2414  * htmlParseHTMLName_nonInvasive:
   2415  * @ctxt:  an HTML parser context
   2416  *
   2417  * parse an HTML tag or attribute name, note that we convert it to lowercase
   2418  * since HTML names are not case-sensitive, this doesn't consume the data
   2419  * from the stream, it's a look-ahead
   2420  *
   2421  * Returns the Tag Name parsed or NULL
   2422  */
   2423 
   2424 static const xmlChar *
   2425 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
   2426     int i = 0;
   2427     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
   2428 
   2429     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
   2430         (NXT(1) != ':')) return(NULL);
   2431 
   2432     while ((i < HTML_PARSER_BUFFER_SIZE) &&
   2433            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
   2434 	   (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
   2435 	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
   2436         else loc[i] = NXT(1+i);
   2437 	i++;
   2438     }
   2439 
   2440     return(xmlDictLookup(ctxt->dict, loc, i));
   2441 }
   2442 
   2443 
   2444 /**
   2445  * htmlParseName:
   2446  * @ctxt:  an HTML parser context
   2447  *
   2448  * parse an HTML name, this routine is case sensitive.
   2449  *
   2450  * Returns the Name parsed or NULL
   2451  */
   2452 
   2453 static const xmlChar *
   2454 htmlParseName(htmlParserCtxtPtr ctxt) {
   2455     const xmlChar *in;
   2456     const xmlChar *ret;
   2457     int count = 0;
   2458 
   2459     GROW;
   2460 
   2461     /*
   2462      * Accelerator for simple ASCII names
   2463      */
   2464     in = ctxt->input->cur;
   2465     if (((*in >= 0x61) && (*in <= 0x7A)) ||
   2466 	((*in >= 0x41) && (*in <= 0x5A)) ||
   2467 	(*in == '_') || (*in == ':')) {
   2468 	in++;
   2469 	while (((*in >= 0x61) && (*in <= 0x7A)) ||
   2470 	       ((*in >= 0x41) && (*in <= 0x5A)) ||
   2471 	       ((*in >= 0x30) && (*in <= 0x39)) ||
   2472 	       (*in == '_') || (*in == '-') ||
   2473 	       (*in == ':') || (*in == '.'))
   2474 	    in++;
   2475 
   2476 	if (in == ctxt->input->end)
   2477 	    return(NULL);
   2478 
   2479 	if ((*in > 0) && (*in < 0x80)) {
   2480 	    count = in - ctxt->input->cur;
   2481 	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
   2482 	    ctxt->input->cur = in;
   2483 	    ctxt->nbChars += count;
   2484 	    ctxt->input->col += count;
   2485 	    return(ret);
   2486 	}
   2487     }
   2488     return(htmlParseNameComplex(ctxt));
   2489 }
   2490 
   2491 static const xmlChar *
   2492 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
   2493     int len = 0, l;
   2494     int c;
   2495     int count = 0;
   2496     const xmlChar *base = ctxt->input->base;
   2497 
   2498     /*
   2499      * Handler for more complex cases
   2500      */
   2501     GROW;
   2502     c = CUR_CHAR(l);
   2503     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
   2504 	(!IS_LETTER(c) && (c != '_') &&
   2505          (c != ':'))) {
   2506 	return(NULL);
   2507     }
   2508 
   2509     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
   2510 	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
   2511             (c == '.') || (c == '-') ||
   2512 	    (c == '_') || (c == ':') ||
   2513 	    (IS_COMBINING(c)) ||
   2514 	    (IS_EXTENDER(c)))) {
   2515 	if (count++ > 100) {
   2516 	    count = 0;
   2517 	    GROW;
   2518 	}
   2519 	len += l;
   2520 	NEXTL(l);
   2521 	c = CUR_CHAR(l);
   2522 	if (ctxt->input->base != base) {
   2523 	    /*
   2524 	     * We changed encoding from an unknown encoding
   2525 	     * Input buffer changed location, so we better start again
   2526 	     */
   2527 	    return(htmlParseNameComplex(ctxt));
   2528 	}
   2529     }
   2530 
   2531     if (ctxt->input->cur - ctxt->input->base < len) {
   2532         /* Sanity check */
   2533 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   2534                      "unexpected change of input buffer", NULL, NULL);
   2535         return (NULL);
   2536     }
   2537 
   2538     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
   2539 }
   2540 
   2541 
   2542 /**
   2543  * htmlParseHTMLAttribute:
   2544  * @ctxt:  an HTML parser context
   2545  * @stop:  a char stop value
   2546  *
   2547  * parse an HTML attribute value till the stop (quote), if
   2548  * stop is 0 then it stops at the first space
   2549  *
   2550  * Returns the attribute parsed or NULL
   2551  */
   2552 
   2553 static xmlChar *
   2554 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
   2555     xmlChar *buffer = NULL;
   2556     int buffer_size = 0;
   2557     xmlChar *out = NULL;
   2558     const xmlChar *name = NULL;
   2559     const xmlChar *cur = NULL;
   2560     const htmlEntityDesc * ent;
   2561 
   2562     /*
   2563      * allocate a translation buffer.
   2564      */
   2565     buffer_size = HTML_PARSER_BUFFER_SIZE;
   2566     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
   2567     if (buffer == NULL) {
   2568 	htmlErrMemory(ctxt, "buffer allocation failed\n");
   2569 	return(NULL);
   2570     }
   2571     out = buffer;
   2572 
   2573     /*
   2574      * Ok loop until we reach one of the ending chars
   2575      */
   2576     while ((CUR != 0) && (CUR != stop)) {
   2577 	if ((stop == 0) && (CUR == '>')) break;
   2578 	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
   2579         if (CUR == '&') {
   2580 	    if (NXT(1) == '#') {
   2581 		unsigned int c;
   2582 		int bits;
   2583 
   2584 		c = htmlParseCharRef(ctxt);
   2585 		if      (c <    0x80)
   2586 		        { *out++  = c;                bits= -6; }
   2587 		else if (c <   0x800)
   2588 		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   2589 		else if (c < 0x10000)
   2590 		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   2591 		else
   2592 		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   2593 
   2594 		for ( ; bits >= 0; bits-= 6) {
   2595 		    *out++  = ((c >> bits) & 0x3F) | 0x80;
   2596 		}
   2597 
   2598 		if (out - buffer > buffer_size - 100) {
   2599 			int indx = out - buffer;
   2600 
   2601 			growBuffer(buffer);
   2602 			out = &buffer[indx];
   2603 		}
   2604 	    } else {
   2605 		ent = htmlParseEntityRef(ctxt, &name);
   2606 		if (name == NULL) {
   2607 		    *out++ = '&';
   2608 		    if (out - buffer > buffer_size - 100) {
   2609 			int indx = out - buffer;
   2610 
   2611 			growBuffer(buffer);
   2612 			out = &buffer[indx];
   2613 		    }
   2614 		} else if (ent == NULL) {
   2615 		    *out++ = '&';
   2616 		    cur = name;
   2617 		    while (*cur != 0) {
   2618 			if (out - buffer > buffer_size - 100) {
   2619 			    int indx = out - buffer;
   2620 
   2621 			    growBuffer(buffer);
   2622 			    out = &buffer[indx];
   2623 			}
   2624 			*out++ = *cur++;
   2625 		    }
   2626 		} else {
   2627 		    unsigned int c;
   2628 		    int bits;
   2629 
   2630 		    if (out - buffer > buffer_size - 100) {
   2631 			int indx = out - buffer;
   2632 
   2633 			growBuffer(buffer);
   2634 			out = &buffer[indx];
   2635 		    }
   2636 		    c = ent->value;
   2637 		    if      (c <    0x80)
   2638 			{ *out++  = c;                bits= -6; }
   2639 		    else if (c <   0x800)
   2640 			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   2641 		    else if (c < 0x10000)
   2642 			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   2643 		    else
   2644 			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   2645 
   2646 		    for ( ; bits >= 0; bits-= 6) {
   2647 			*out++  = ((c >> bits) & 0x3F) | 0x80;
   2648 		    }
   2649 		}
   2650 	    }
   2651 	} else {
   2652 	    unsigned int c;
   2653 	    int bits, l;
   2654 
   2655 	    if (out - buffer > buffer_size - 100) {
   2656 		int indx = out - buffer;
   2657 
   2658 		growBuffer(buffer);
   2659 		out = &buffer[indx];
   2660 	    }
   2661 	    c = CUR_CHAR(l);
   2662 	    if      (c <    0x80)
   2663 		    { *out++  = c;                bits= -6; }
   2664 	    else if (c <   0x800)
   2665 		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   2666 	    else if (c < 0x10000)
   2667 		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   2668 	    else
   2669 		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   2670 
   2671 	    for ( ; bits >= 0; bits-= 6) {
   2672 		*out++  = ((c >> bits) & 0x3F) | 0x80;
   2673 	    }
   2674 	    NEXT;
   2675 	}
   2676     }
   2677     *out = 0;
   2678     return(buffer);
   2679 }
   2680 
   2681 /**
   2682  * htmlParseEntityRef:
   2683  * @ctxt:  an HTML parser context
   2684  * @str:  location to store the entity name
   2685  *
   2686  * parse an HTML ENTITY references
   2687  *
   2688  * [68] EntityRef ::= '&' Name ';'
   2689  *
   2690  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
   2691  *         if non-NULL *str will have to be freed by the caller.
   2692  */
   2693 const htmlEntityDesc *
   2694 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
   2695     const xmlChar *name;
   2696     const htmlEntityDesc * ent = NULL;
   2697 
   2698     if (str != NULL) *str = NULL;
   2699     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
   2700 
   2701     if (CUR == '&') {
   2702         NEXT;
   2703         name = htmlParseName(ctxt);
   2704 	if (name == NULL) {
   2705 	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   2706 	                 "htmlParseEntityRef: no name\n", NULL, NULL);
   2707 	} else {
   2708 	    GROW;
   2709 	    if (CUR == ';') {
   2710 	        if (str != NULL)
   2711 		    *str = name;
   2712 
   2713 		/*
   2714 		 * Lookup the entity in the table.
   2715 		 */
   2716 		ent = htmlEntityLookup(name);
   2717 		if (ent != NULL) /* OK that's ugly !!! */
   2718 		    NEXT;
   2719 	    } else {
   2720 		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
   2721 		             "htmlParseEntityRef: expecting ';'\n",
   2722 			     NULL, NULL);
   2723 	        if (str != NULL)
   2724 		    *str = name;
   2725 	    }
   2726 	}
   2727     }
   2728     return(ent);
   2729 }
   2730 
   2731 /**
   2732  * htmlParseAttValue:
   2733  * @ctxt:  an HTML parser context
   2734  *
   2735  * parse a value for an attribute
   2736  * Note: the parser won't do substitution of entities here, this
   2737  * will be handled later in xmlStringGetNodeList, unless it was
   2738  * asked for ctxt->replaceEntities != 0
   2739  *
   2740  * Returns the AttValue parsed or NULL.
   2741  */
   2742 
   2743 static xmlChar *
   2744 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
   2745     xmlChar *ret = NULL;
   2746 
   2747     if (CUR == '"') {
   2748         NEXT;
   2749 	ret = htmlParseHTMLAttribute(ctxt, '"');
   2750         if (CUR != '"') {
   2751 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
   2752 	                 "AttValue: \" expected\n", NULL, NULL);
   2753 	} else
   2754 	    NEXT;
   2755     } else if (CUR == '\'') {
   2756         NEXT;
   2757 	ret = htmlParseHTMLAttribute(ctxt, '\'');
   2758         if (CUR != '\'') {
   2759 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
   2760 	                 "AttValue: ' expected\n", NULL, NULL);
   2761 	} else
   2762 	    NEXT;
   2763     } else {
   2764         /*
   2765 	 * That's an HTMLism, the attribute value may not be quoted
   2766 	 */
   2767 	ret = htmlParseHTMLAttribute(ctxt, 0);
   2768 	if (ret == NULL) {
   2769 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
   2770 	                 "AttValue: no value found\n", NULL, NULL);
   2771 	}
   2772     }
   2773     return(ret);
   2774 }
   2775 
   2776 /**
   2777  * htmlParseSystemLiteral:
   2778  * @ctxt:  an HTML parser context
   2779  *
   2780  * parse an HTML Literal
   2781  *
   2782  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
   2783  *
   2784  * Returns the SystemLiteral parsed or NULL
   2785  */
   2786 
   2787 static xmlChar *
   2788 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
   2789     size_t len = 0, startPosition = 0;
   2790     xmlChar *ret = NULL;
   2791 
   2792     if (CUR == '"') {
   2793         NEXT;
   2794 
   2795         if (CUR_PTR < BASE_PTR)
   2796             return(ret);
   2797         startPosition = CUR_PTR - BASE_PTR;
   2798 
   2799 	while ((IS_CHAR_CH(CUR)) && (CUR != '"')) {
   2800 	    NEXT;
   2801 	    len++;
   2802 	}
   2803 	if (!IS_CHAR_CH(CUR)) {
   2804 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
   2805 			 "Unfinished SystemLiteral\n", NULL, NULL);
   2806 	} else {
   2807 	    ret = xmlStrndup((BASE_PTR+startPosition), len);
   2808 	    NEXT;
   2809         }
   2810     } else if (CUR == '\'') {
   2811         NEXT;
   2812 
   2813         if (CUR_PTR < BASE_PTR)
   2814             return(ret);
   2815         startPosition = CUR_PTR - BASE_PTR;
   2816 
   2817 	while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) {
   2818 	    NEXT;
   2819 	    len++;
   2820 	}
   2821 	if (!IS_CHAR_CH(CUR)) {
   2822 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
   2823 			 "Unfinished SystemLiteral\n", NULL, NULL);
   2824 	} else {
   2825 	    ret = xmlStrndup((BASE_PTR+startPosition), len);
   2826 	    NEXT;
   2827         }
   2828     } else {
   2829 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
   2830 	             " or ' expected\n", NULL, NULL);
   2831     }
   2832 
   2833     return(ret);
   2834 }
   2835 
   2836 /**
   2837  * htmlParsePubidLiteral:
   2838  * @ctxt:  an HTML parser context
   2839  *
   2840  * parse an HTML public literal
   2841  *
   2842  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
   2843  *
   2844  * Returns the PubidLiteral parsed or NULL.
   2845  */
   2846 
   2847 static xmlChar *
   2848 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
   2849     size_t len = 0, startPosition = 0;
   2850     xmlChar *ret = NULL;
   2851     /*
   2852      * Name ::= (Letter | '_') (NameChar)*
   2853      */
   2854     if (CUR == '"') {
   2855         NEXT;
   2856 
   2857         if (CUR_PTR < BASE_PTR)
   2858             return(ret);
   2859         startPosition = CUR_PTR - BASE_PTR;
   2860 
   2861         while (IS_PUBIDCHAR_CH(CUR)) {
   2862             len++;
   2863             NEXT;
   2864         }
   2865 
   2866 	if (CUR != '"') {
   2867 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
   2868 	                 "Unfinished PubidLiteral\n", NULL, NULL);
   2869 	} else {
   2870 	    ret = xmlStrndup((BASE_PTR + startPosition), len);
   2871 	    NEXT;
   2872 	}
   2873     } else if (CUR == '\'') {
   2874         NEXT;
   2875 
   2876         if (CUR_PTR < BASE_PTR)
   2877             return(ret);
   2878         startPosition = CUR_PTR - BASE_PTR;
   2879 
   2880         while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){
   2881             len++;
   2882             NEXT;
   2883         }
   2884 
   2885 	if (CUR != '\'') {
   2886 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
   2887 	                 "Unfinished PubidLiteral\n", NULL, NULL);
   2888 	} else {
   2889 	    ret = xmlStrndup((BASE_PTR + startPosition), len);
   2890 	    NEXT;
   2891 	}
   2892     } else {
   2893 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
   2894 	             "PubidLiteral \" or ' expected\n", NULL, NULL);
   2895     }
   2896 
   2897     return(ret);
   2898 }
   2899 
   2900 /**
   2901  * htmlParseScript:
   2902  * @ctxt:  an HTML parser context
   2903  *
   2904  * parse the content of an HTML SCRIPT or STYLE element
   2905  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
   2906  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
   2907  * http://www.w3.org/TR/html4/types.html#type-script
   2908  * http://www.w3.org/TR/html4/types.html#h-6.15
   2909  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
   2910  *
   2911  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
   2912  * element and the value of intrinsic event attributes. User agents must
   2913  * not evaluate script data as HTML markup but instead must pass it on as
   2914  * data to a script engine.
   2915  * NOTES:
   2916  * - The content is passed like CDATA
   2917  * - the attributes for style and scripting "onXXX" are also described
   2918  *   as CDATA but SGML allows entities references in attributes so their
   2919  *   processing is identical as other attributes
   2920  */
   2921 static void
   2922 htmlParseScript(htmlParserCtxtPtr ctxt) {
   2923     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
   2924     int nbchar = 0;
   2925     int cur,l;
   2926 
   2927     SHRINK;
   2928     cur = CUR_CHAR(l);
   2929     while (IS_CHAR_CH(cur)) {
   2930 	if ((cur == '<') && (NXT(1) == '/')) {
   2931             /*
   2932              * One should break here, the specification is clear:
   2933              * Authors should therefore escape "</" within the content.
   2934              * Escape mechanisms are specific to each scripting or
   2935              * style sheet language.
   2936              *
   2937              * In recovery mode, only break if end tag match the
   2938              * current tag, effectively ignoring all tags inside the
   2939              * script/style block and treating the entire block as
   2940              * CDATA.
   2941              */
   2942             if (ctxt->recovery) {
   2943                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
   2944 				   xmlStrlen(ctxt->name)) == 0)
   2945                 {
   2946                     break; /* while */
   2947                 } else {
   2948 		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
   2949 				 "Element %s embeds close tag\n",
   2950 		                 ctxt->name, NULL);
   2951 		}
   2952             } else {
   2953                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
   2954                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
   2955                 {
   2956                     break; /* while */
   2957                 }
   2958             }
   2959 	}
   2960 	COPY_BUF(l,buf,nbchar,cur);
   2961 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
   2962 	    if (ctxt->sax->cdataBlock!= NULL) {
   2963 		/*
   2964 		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
   2965 		 */
   2966 		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
   2967 	    } else if (ctxt->sax->characters != NULL) {
   2968 		ctxt->sax->characters(ctxt->userData, buf, nbchar);
   2969 	    }
   2970 	    nbchar = 0;
   2971 	}
   2972 	GROW;
   2973 	NEXTL(l);
   2974 	cur = CUR_CHAR(l);
   2975     }
   2976 
   2977     if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
   2978         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
   2979                     "Invalid char in CDATA 0x%X\n", cur);
   2980         if (ctxt->input->cur < ctxt->input->end) {
   2981             NEXT;
   2982         }
   2983     }
   2984 
   2985     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
   2986 	if (ctxt->sax->cdataBlock!= NULL) {
   2987 	    /*
   2988 	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
   2989 	     */
   2990 	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
   2991 	} else if (ctxt->sax->characters != NULL) {
   2992 	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
   2993 	}
   2994     }
   2995 }
   2996 
   2997 
   2998 /**
   2999  * htmlParseCharDataInternal:
   3000  * @ctxt:  an HTML parser context
   3001  * @readahead: optional read ahead character in ascii range
   3002  *
   3003  * parse a CharData section.
   3004  * if we are within a CDATA section ']]>' marks an end of section.
   3005  *
   3006  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
   3007  */
   3008 
   3009 static void
   3010 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
   3011     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
   3012     int nbchar = 0;
   3013     int cur, l;
   3014     int chunk = 0;
   3015 
   3016     if (readahead)
   3017         buf[nbchar++] = readahead;
   3018 
   3019     SHRINK;
   3020     cur = CUR_CHAR(l);
   3021     while (((cur != '<') || (ctxt->token == '<')) &&
   3022            ((cur != '&') || (ctxt->token == '&')) &&
   3023 	   (cur != 0)) {
   3024 	if (!(IS_CHAR(cur))) {
   3025 	    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
   3026 	                "Invalid char in CDATA 0x%X\n", cur);
   3027 	} else {
   3028 	    COPY_BUF(l,buf,nbchar,cur);
   3029 	}
   3030 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
   3031 	    /*
   3032 	     * Ok the segment is to be consumed as chars.
   3033 	     */
   3034 	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
   3035 		if (areBlanks(ctxt, buf, nbchar)) {
   3036 		    if (ctxt->keepBlanks) {
   3037 			if (ctxt->sax->characters != NULL)
   3038 			    ctxt->sax->characters(ctxt->userData, buf, nbchar);
   3039 		    } else {
   3040 			if (ctxt->sax->ignorableWhitespace != NULL)
   3041 			    ctxt->sax->ignorableWhitespace(ctxt->userData,
   3042 			                                   buf, nbchar);
   3043 		    }
   3044 		} else {
   3045 		    htmlCheckParagraph(ctxt);
   3046 		    if (ctxt->sax->characters != NULL)
   3047 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
   3048 		}
   3049 	    }
   3050 	    nbchar = 0;
   3051 	}
   3052 	NEXTL(l);
   3053         chunk++;
   3054         if (chunk > HTML_PARSER_BUFFER_SIZE) {
   3055             chunk = 0;
   3056             SHRINK;
   3057             GROW;
   3058         }
   3059 	cur = CUR_CHAR(l);
   3060 	if (cur == 0) {
   3061 	    SHRINK;
   3062 	    GROW;
   3063 	    cur = CUR_CHAR(l);
   3064 	}
   3065     }
   3066     if (nbchar != 0) {
   3067         buf[nbchar] = 0;
   3068 
   3069 	/*
   3070 	 * Ok the segment is to be consumed as chars.
   3071 	 */
   3072 	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
   3073 	    if (areBlanks(ctxt, buf, nbchar)) {
   3074 		if (ctxt->keepBlanks) {
   3075 		    if (ctxt->sax->characters != NULL)
   3076 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
   3077 		} else {
   3078 		    if (ctxt->sax->ignorableWhitespace != NULL)
   3079 			ctxt->sax->ignorableWhitespace(ctxt->userData,
   3080 			                               buf, nbchar);
   3081 		}
   3082 	    } else {
   3083 		htmlCheckParagraph(ctxt);
   3084 		if (ctxt->sax->characters != NULL)
   3085 		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
   3086 	    }
   3087 	}
   3088     } else {
   3089 	/*
   3090 	 * Loop detection
   3091 	 */
   3092 	if (cur == 0)
   3093 	    ctxt->instate = XML_PARSER_EOF;
   3094     }
   3095 }
   3096 
   3097 /**
   3098  * htmlParseCharData:
   3099  * @ctxt:  an HTML parser context
   3100  *
   3101  * parse a CharData section.
   3102  * if we are within a CDATA section ']]>' marks an end of section.
   3103  *
   3104  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
   3105  */
   3106 
   3107 static void
   3108 htmlParseCharData(htmlParserCtxtPtr ctxt) {
   3109     htmlParseCharDataInternal(ctxt, 0);
   3110 }
   3111 
   3112 /**
   3113  * htmlParseExternalID:
   3114  * @ctxt:  an HTML parser context
   3115  * @publicID:  a xmlChar** receiving PubidLiteral
   3116  *
   3117  * Parse an External ID or a Public ID
   3118  *
   3119  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
   3120  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
   3121  *
   3122  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
   3123  *
   3124  * Returns the function returns SystemLiteral and in the second
   3125  *                case publicID receives PubidLiteral, is strict is off
   3126  *                it is possible to return NULL and have publicID set.
   3127  */
   3128 
   3129 static xmlChar *
   3130 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
   3131     xmlChar *URI = NULL;
   3132 
   3133     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
   3134          (UPP(2) == 'S') && (UPP(3) == 'T') &&
   3135 	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
   3136         SKIP(6);
   3137 	if (!IS_BLANK_CH(CUR)) {
   3138 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
   3139 	                 "Space required after 'SYSTEM'\n", NULL, NULL);
   3140 	}
   3141         SKIP_BLANKS;
   3142 	URI = htmlParseSystemLiteral(ctxt);
   3143 	if (URI == NULL) {
   3144 	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
   3145 	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
   3146         }
   3147     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
   3148 	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
   3149 	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
   3150         SKIP(6);
   3151 	if (!IS_BLANK_CH(CUR)) {
   3152 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
   3153 	                 "Space required after 'PUBLIC'\n", NULL, NULL);
   3154 	}
   3155         SKIP_BLANKS;
   3156 	*publicID = htmlParsePubidLiteral(ctxt);
   3157 	if (*publicID == NULL) {
   3158 	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
   3159 	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
   3160 			 NULL, NULL);
   3161 	}
   3162         SKIP_BLANKS;
   3163         if ((CUR == '"') || (CUR == '\'')) {
   3164 	    URI = htmlParseSystemLiteral(ctxt);
   3165 	}
   3166     }
   3167     return(URI);
   3168 }
   3169 
   3170 /**
   3171  * xmlParsePI:
   3172  * @ctxt:  an XML parser context
   3173  *
   3174  * parse an XML Processing Instruction.
   3175  *
   3176  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
   3177  */
   3178 static void
   3179 htmlParsePI(htmlParserCtxtPtr ctxt) {
   3180     xmlChar *buf = NULL;
   3181     int len = 0;
   3182     int size = HTML_PARSER_BUFFER_SIZE;
   3183     int cur, l;
   3184     const xmlChar *target;
   3185     xmlParserInputState state;
   3186     int count = 0;
   3187 
   3188     if ((RAW == '<') && (NXT(1) == '?')) {
   3189 	state = ctxt->instate;
   3190         ctxt->instate = XML_PARSER_PI;
   3191 	/*
   3192 	 * this is a Processing Instruction.
   3193 	 */
   3194 	SKIP(2);
   3195 	SHRINK;
   3196 
   3197 	/*
   3198 	 * Parse the target name and check for special support like
   3199 	 * namespace.
   3200 	 */
   3201         target = htmlParseName(ctxt);
   3202 	if (target != NULL) {
   3203 	    if (RAW == '>') {
   3204 		SKIP(1);
   3205 
   3206 		/*
   3207 		 * SAX: PI detected.
   3208 		 */
   3209 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
   3210 		    (ctxt->sax->processingInstruction != NULL))
   3211 		    ctxt->sax->processingInstruction(ctxt->userData,
   3212 		                                     target, NULL);
   3213 		ctxt->instate = state;
   3214 		return;
   3215 	    }
   3216 	    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
   3217 	    if (buf == NULL) {
   3218 		htmlErrMemory(ctxt, NULL);
   3219 		ctxt->instate = state;
   3220 		return;
   3221 	    }
   3222 	    cur = CUR;
   3223 	    if (!IS_BLANK(cur)) {
   3224 		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
   3225 			  "ParsePI: PI %s space expected\n", target, NULL);
   3226 	    }
   3227             SKIP_BLANKS;
   3228 	    cur = CUR_CHAR(l);
   3229 	    while (IS_CHAR(cur) && (cur != '>')) {
   3230 		if (len + 5 >= size) {
   3231 		    xmlChar *tmp;
   3232 
   3233 		    size *= 2;
   3234 		    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
   3235 		    if (tmp == NULL) {
   3236 			htmlErrMemory(ctxt, NULL);
   3237 			xmlFree(buf);
   3238 			ctxt->instate = state;
   3239 			return;
   3240 		    }
   3241 		    buf = tmp;
   3242 		}
   3243 		count++;
   3244 		if (count > 50) {
   3245 		    GROW;
   3246 		    count = 0;
   3247 		}
   3248 		COPY_BUF(l,buf,len,cur);
   3249 		NEXTL(l);
   3250 		cur = CUR_CHAR(l);
   3251 		if (cur == 0) {
   3252 		    SHRINK;
   3253 		    GROW;
   3254 		    cur = CUR_CHAR(l);
   3255 		}
   3256 	    }
   3257 	    buf[len] = 0;
   3258 	    if (cur != '>') {
   3259 		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
   3260 		      "ParsePI: PI %s never end ...\n", target, NULL);
   3261 	    } else {
   3262 		SKIP(1);
   3263 
   3264 		/*
   3265 		 * SAX: PI detected.
   3266 		 */
   3267 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
   3268 		    (ctxt->sax->processingInstruction != NULL))
   3269 		    ctxt->sax->processingInstruction(ctxt->userData,
   3270 		                                     target, buf);
   3271 	    }
   3272 	    xmlFree(buf);
   3273 	} else {
   3274 	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
   3275                          "PI is not started correctly", NULL, NULL);
   3276 	}
   3277 	ctxt->instate = state;
   3278     }
   3279 }
   3280 
   3281 /**
   3282  * htmlParseComment:
   3283  * @ctxt:  an HTML parser context
   3284  *
   3285  * Parse an XML (SGML) comment <!-- .... -->
   3286  *
   3287  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
   3288  */
   3289 static void
   3290 htmlParseComment(htmlParserCtxtPtr ctxt) {
   3291     xmlChar *buf = NULL;
   3292     int len;
   3293     int size = HTML_PARSER_BUFFER_SIZE;
   3294     int q, ql;
   3295     int r, rl;
   3296     int cur, l;
   3297     xmlParserInputState state;
   3298 
   3299     /*
   3300      * Check that there is a comment right here.
   3301      */
   3302     if ((RAW != '<') || (NXT(1) != '!') ||
   3303         (NXT(2) != '-') || (NXT(3) != '-')) return;
   3304 
   3305     state = ctxt->instate;
   3306     ctxt->instate = XML_PARSER_COMMENT;
   3307     SHRINK;
   3308     SKIP(4);
   3309     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
   3310     if (buf == NULL) {
   3311         htmlErrMemory(ctxt, "buffer allocation failed\n");
   3312 	ctxt->instate = state;
   3313 	return;
   3314     }
   3315     len = 0;
   3316     buf[len] = 0;
   3317     q = CUR_CHAR(ql);
   3318     if (!IS_CHAR(q))
   3319         goto unfinished;
   3320     NEXTL(ql);
   3321     r = CUR_CHAR(rl);
   3322     if (!IS_CHAR(r))
   3323         goto unfinished;
   3324     NEXTL(rl);
   3325     cur = CUR_CHAR(l);
   3326     while (IS_CHAR(cur) &&
   3327            ((cur != '>') ||
   3328 	    (r != '-') || (q != '-'))) {
   3329 	if (len + 5 >= size) {
   3330 	    xmlChar *tmp;
   3331 
   3332 	    size *= 2;
   3333 	    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
   3334 	    if (tmp == NULL) {
   3335 	        xmlFree(buf);
   3336 	        htmlErrMemory(ctxt, "growing buffer failed\n");
   3337 		ctxt->instate = state;
   3338 		return;
   3339 	    }
   3340 	    buf = tmp;
   3341 	}
   3342 	COPY_BUF(ql,buf,len,q);
   3343 	q = r;
   3344 	ql = rl;
   3345 	r = cur;
   3346 	rl = l;
   3347 	NEXTL(l);
   3348 	cur = CUR_CHAR(l);
   3349 	if (cur == 0) {
   3350 	    SHRINK;
   3351 	    GROW;
   3352 	    cur = CUR_CHAR(l);
   3353 	}
   3354     }
   3355     buf[len] = 0;
   3356     if (IS_CHAR(cur)) {
   3357         NEXT;
   3358 	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
   3359 	    (!ctxt->disableSAX))
   3360 	    ctxt->sax->comment(ctxt->userData, buf);
   3361 	xmlFree(buf);
   3362 	ctxt->instate = state;
   3363 	return;
   3364     }
   3365 
   3366 unfinished:
   3367     htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
   3368 		 "Comment not terminated \n<!--%.50s\n", buf, NULL);
   3369     xmlFree(buf);
   3370 }
   3371 
   3372 /**
   3373  * htmlParseCharRef:
   3374  * @ctxt:  an HTML parser context
   3375  *
   3376  * parse Reference declarations
   3377  *
   3378  * [66] CharRef ::= '&#' [0-9]+ ';' |
   3379  *                  '&#x' [0-9a-fA-F]+ ';'
   3380  *
   3381  * Returns the value parsed (as an int)
   3382  */
   3383 int
   3384 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
   3385     int val = 0;
   3386 
   3387     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   3388 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   3389 		     "htmlParseCharRef: context error\n",
   3390 		     NULL, NULL);
   3391         return(0);
   3392     }
   3393     if ((CUR == '&') && (NXT(1) == '#') &&
   3394         ((NXT(2) == 'x') || NXT(2) == 'X')) {
   3395 	SKIP(3);
   3396 	while (CUR != ';') {
   3397 	    if ((CUR >= '0') && (CUR <= '9'))
   3398 	        val = val * 16 + (CUR - '0');
   3399 	    else if ((CUR >= 'a') && (CUR <= 'f'))
   3400 	        val = val * 16 + (CUR - 'a') + 10;
   3401 	    else if ((CUR >= 'A') && (CUR <= 'F'))
   3402 	        val = val * 16 + (CUR - 'A') + 10;
   3403 	    else {
   3404 	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
   3405 		             "htmlParseCharRef: missing semicolon\n",
   3406 			     NULL, NULL);
   3407 		break;
   3408 	    }
   3409 	    NEXT;
   3410 	}
   3411 	if (CUR == ';')
   3412 	    NEXT;
   3413     } else if  ((CUR == '&') && (NXT(1) == '#')) {
   3414 	SKIP(2);
   3415 	while (CUR != ';') {
   3416 	    if ((CUR >= '0') && (CUR <= '9'))
   3417 	        val = val * 10 + (CUR - '0');
   3418 	    else {
   3419 	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
   3420 		             "htmlParseCharRef: missing semicolon\n",
   3421 			     NULL, NULL);
   3422 		break;
   3423 	    }
   3424 	    NEXT;
   3425 	}
   3426 	if (CUR == ';')
   3427 	    NEXT;
   3428     } else {
   3429 	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
   3430 	             "htmlParseCharRef: invalid value\n", NULL, NULL);
   3431     }
   3432     /*
   3433      * Check the value IS_CHAR ...
   3434      */
   3435     if (IS_CHAR(val)) {
   3436         return(val);
   3437     } else {
   3438 	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
   3439 			"htmlParseCharRef: invalid xmlChar value %d\n",
   3440 			val);
   3441     }
   3442     return(0);
   3443 }
   3444 
   3445 
   3446 /**
   3447  * htmlParseDocTypeDecl:
   3448  * @ctxt:  an HTML parser context
   3449  *
   3450  * parse a DOCTYPE declaration
   3451  *
   3452  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
   3453  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
   3454  */
   3455 
   3456 static void
   3457 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
   3458     const xmlChar *name;
   3459     xmlChar *ExternalID = NULL;
   3460     xmlChar *URI = NULL;
   3461 
   3462     /*
   3463      * We know that '<!DOCTYPE' has been detected.
   3464      */
   3465     SKIP(9);
   3466 
   3467     SKIP_BLANKS;
   3468 
   3469     /*
   3470      * Parse the DOCTYPE name.
   3471      */
   3472     name = htmlParseName(ctxt);
   3473     if (name == NULL) {
   3474 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   3475 	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
   3476 		     NULL, NULL);
   3477     }
   3478     /*
   3479      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
   3480      */
   3481 
   3482     SKIP_BLANKS;
   3483 
   3484     /*
   3485      * Check for SystemID and ExternalID
   3486      */
   3487     URI = htmlParseExternalID(ctxt, &ExternalID);
   3488     SKIP_BLANKS;
   3489 
   3490     /*
   3491      * We should be at the end of the DOCTYPE declaration.
   3492      */
   3493     if (CUR != '>') {
   3494 	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
   3495 	             "DOCTYPE improperly terminated\n", NULL, NULL);
   3496         /* We shouldn't try to resynchronize ... */
   3497     }
   3498     NEXT;
   3499 
   3500     /*
   3501      * Create or update the document accordingly to the DOCTYPE
   3502      */
   3503     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
   3504 	(!ctxt->disableSAX))
   3505 	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
   3506 
   3507     /*
   3508      * Cleanup, since we don't use all those identifiers
   3509      */
   3510     if (URI != NULL) xmlFree(URI);
   3511     if (ExternalID != NULL) xmlFree(ExternalID);
   3512 }
   3513 
   3514 /**
   3515  * htmlParseAttribute:
   3516  * @ctxt:  an HTML parser context
   3517  * @value:  a xmlChar ** used to store the value of the attribute
   3518  *
   3519  * parse an attribute
   3520  *
   3521  * [41] Attribute ::= Name Eq AttValue
   3522  *
   3523  * [25] Eq ::= S? '=' S?
   3524  *
   3525  * With namespace:
   3526  *
   3527  * [NS 11] Attribute ::= QName Eq AttValue
   3528  *
   3529  * Also the case QName == xmlns:??? is handled independently as a namespace
   3530  * definition.
   3531  *
   3532  * Returns the attribute name, and the value in *value.
   3533  */
   3534 
   3535 static const xmlChar *
   3536 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
   3537     const xmlChar *name;
   3538     xmlChar *val = NULL;
   3539 
   3540     *value = NULL;
   3541     name = htmlParseHTMLName(ctxt);
   3542     if (name == NULL) {
   3543 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   3544 	             "error parsing attribute name\n", NULL, NULL);
   3545         return(NULL);
   3546     }
   3547 
   3548     /*
   3549      * read the value
   3550      */
   3551     SKIP_BLANKS;
   3552     if (CUR == '=') {
   3553         NEXT;
   3554 	SKIP_BLANKS;
   3555 	val = htmlParseAttValue(ctxt);
   3556     }
   3557 
   3558     *value = val;
   3559     return(name);
   3560 }
   3561 
   3562 /**
   3563  * htmlCheckEncodingDirect:
   3564  * @ctxt:  an HTML parser context
   3565  * @attvalue: the attribute value
   3566  *
   3567  * Checks an attribute value to detect
   3568  * the encoding
   3569  * If a new encoding is detected the parser is switched to decode
   3570  * it and pass UTF8
   3571  */
   3572 static void
   3573 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
   3574 
   3575     if ((ctxt == NULL) || (encoding == NULL) ||
   3576         (ctxt->options & HTML_PARSE_IGNORE_ENC))
   3577 	return;
   3578 
   3579     /* do not change encoding */
   3580     if (ctxt->input->encoding != NULL)
   3581         return;
   3582 
   3583     if (encoding != NULL) {
   3584 	xmlCharEncoding enc;
   3585 	xmlCharEncodingHandlerPtr handler;
   3586 
   3587 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
   3588 
   3589 	if (ctxt->input->encoding != NULL)
   3590 	    xmlFree((xmlChar *) ctxt->input->encoding);
   3591 	ctxt->input->encoding = xmlStrdup(encoding);
   3592 
   3593 	enc = xmlParseCharEncoding((const char *) encoding);
   3594 	/*
   3595 	 * registered set of known encodings
   3596 	 */
   3597 	if (enc != XML_CHAR_ENCODING_ERROR) {
   3598 	    if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
   3599 	         (enc == XML_CHAR_ENCODING_UTF16BE) ||
   3600 		 (enc == XML_CHAR_ENCODING_UCS4LE) ||
   3601 		 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
   3602 		(ctxt->input->buf != NULL) &&
   3603 		(ctxt->input->buf->encoder == NULL)) {
   3604 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
   3605 		             "htmlCheckEncoding: wrong encoding meta\n",
   3606 			     NULL, NULL);
   3607 	    } else {
   3608 		xmlSwitchEncoding(ctxt, enc);
   3609 	    }
   3610 	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
   3611 	} else {
   3612 	    /*
   3613 	     * fallback for unknown encodings
   3614 	     */
   3615 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
   3616 	    if (handler != NULL) {
   3617 		xmlSwitchToEncoding(ctxt, handler);
   3618 		ctxt->charset = XML_CHAR_ENCODING_UTF8;
   3619 	    } else {
   3620 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
   3621 		             "htmlCheckEncoding: unknown encoding %s\n",
   3622 			     encoding, NULL);
   3623 	    }
   3624 	}
   3625 
   3626 	if ((ctxt->input->buf != NULL) &&
   3627 	    (ctxt->input->buf->encoder != NULL) &&
   3628 	    (ctxt->input->buf->raw != NULL) &&
   3629 	    (ctxt->input->buf->buffer != NULL)) {
   3630 	    int nbchars;
   3631 	    int processed;
   3632 
   3633 	    /*
   3634 	     * convert as much as possible to the parser reading buffer.
   3635 	     */
   3636 	    processed = ctxt->input->cur - ctxt->input->base;
   3637 	    xmlBufShrink(ctxt->input->buf->buffer, processed);
   3638 	    nbchars = xmlCharEncInput(ctxt->input->buf, 1);
   3639             xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
   3640 	    if (nbchars < 0) {
   3641 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
   3642 		             "htmlCheckEncoding: encoder error\n",
   3643 			     NULL, NULL);
   3644 	    }
   3645 	}
   3646     }
   3647 }
   3648 
   3649 /**
   3650  * htmlCheckEncoding:
   3651  * @ctxt:  an HTML parser context
   3652  * @attvalue: the attribute value
   3653  *
   3654  * Checks an http-equiv attribute from a Meta tag to detect
   3655  * the encoding
   3656  * If a new encoding is detected the parser is switched to decode
   3657  * it and pass UTF8
   3658  */
   3659 static void
   3660 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
   3661     const xmlChar *encoding;
   3662 
   3663     if (!attvalue)
   3664 	return;
   3665 
   3666     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
   3667     if (encoding != NULL) {
   3668 	encoding += 7;
   3669     }
   3670     /*
   3671      * skip blank
   3672      */
   3673     if (encoding && IS_BLANK_CH(*encoding))
   3674 	encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
   3675     if (encoding && *encoding == '=') {
   3676 	encoding ++;
   3677 	htmlCheckEncodingDirect(ctxt, encoding);
   3678     }
   3679 }
   3680 
   3681 /**
   3682  * htmlCheckMeta:
   3683  * @ctxt:  an HTML parser context
   3684  * @atts:  the attributes values
   3685  *
   3686  * Checks an attributes from a Meta tag
   3687  */
   3688 static void
   3689 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
   3690     int i;
   3691     const xmlChar *att, *value;
   3692     int http = 0;
   3693     const xmlChar *content = NULL;
   3694 
   3695     if ((ctxt == NULL) || (atts == NULL))
   3696 	return;
   3697 
   3698     i = 0;
   3699     att = atts[i++];
   3700     while (att != NULL) {
   3701 	value = atts[i++];
   3702 	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
   3703 	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
   3704 	    http = 1;
   3705 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
   3706 	    htmlCheckEncodingDirect(ctxt, value);
   3707 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
   3708 	    content = value;
   3709 	att = atts[i++];
   3710     }
   3711     if ((http) && (content != NULL))
   3712 	htmlCheckEncoding(ctxt, content);
   3713 
   3714 }
   3715 
   3716 /**
   3717  * htmlParseStartTag:
   3718  * @ctxt:  an HTML parser context
   3719  *
   3720  * parse a start of tag either for rule element or
   3721  * EmptyElement. In both case we don't parse the tag closing chars.
   3722  *
   3723  * [40] STag ::= '<' Name (S Attribute)* S? '>'
   3724  *
   3725  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
   3726  *
   3727  * With namespace:
   3728  *
   3729  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
   3730  *
   3731  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
   3732  *
   3733  * Returns 0 in case of success, -1 in case of error and 1 if discarded
   3734  */
   3735 
   3736 static int
   3737 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
   3738     const xmlChar *name;
   3739     const xmlChar *attname;
   3740     xmlChar *attvalue;
   3741     const xmlChar **atts;
   3742     int nbatts = 0;
   3743     int maxatts;
   3744     int meta = 0;
   3745     int i;
   3746     int discardtag = 0;
   3747 
   3748     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   3749 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   3750 		     "htmlParseStartTag: context error\n", NULL, NULL);
   3751 	return -1;
   3752     }
   3753     if (ctxt->instate == XML_PARSER_EOF)
   3754         return(-1);
   3755     if (CUR != '<') return -1;
   3756     NEXT;
   3757 
   3758     atts = ctxt->atts;
   3759     maxatts = ctxt->maxatts;
   3760 
   3761     GROW;
   3762     name = htmlParseHTMLName(ctxt);
   3763     if (name == NULL) {
   3764 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   3765 	             "htmlParseStartTag: invalid element name\n",
   3766 		     NULL, NULL);
   3767 	/* if recover preserve text on classic misconstructs */
   3768 	if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
   3769 	    (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
   3770 	    htmlParseCharDataInternal(ctxt, '<');
   3771 	    return(-1);
   3772 	}
   3773 
   3774 
   3775 	/* Dump the bogus tag like browsers do */
   3776 	while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
   3777                (ctxt->instate != XML_PARSER_EOF))
   3778 	    NEXT;
   3779         return -1;
   3780     }
   3781     if (xmlStrEqual(name, BAD_CAST"meta"))
   3782 	meta = 1;
   3783 
   3784     /*
   3785      * Check for auto-closure of HTML elements.
   3786      */
   3787     htmlAutoClose(ctxt, name);
   3788 
   3789     /*
   3790      * Check for implied HTML elements.
   3791      */
   3792     htmlCheckImplied(ctxt, name);
   3793 
   3794     /*
   3795      * Avoid html at any level > 0, head at any level != 1
   3796      * or any attempt to recurse body
   3797      */
   3798     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
   3799 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   3800 	             "htmlParseStartTag: misplaced <html> tag\n",
   3801 		     name, NULL);
   3802 	discardtag = 1;
   3803 	ctxt->depth++;
   3804     }
   3805     if ((ctxt->nameNr != 1) &&
   3806 	(xmlStrEqual(name, BAD_CAST"head"))) {
   3807 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   3808 	             "htmlParseStartTag: misplaced <head> tag\n",
   3809 		     name, NULL);
   3810 	discardtag = 1;
   3811 	ctxt->depth++;
   3812     }
   3813     if (xmlStrEqual(name, BAD_CAST"body")) {
   3814 	int indx;
   3815 	for (indx = 0;indx < ctxt->nameNr;indx++) {
   3816 	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
   3817 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   3818 		             "htmlParseStartTag: misplaced <body> tag\n",
   3819 			     name, NULL);
   3820 		discardtag = 1;
   3821 		ctxt->depth++;
   3822 	    }
   3823 	}
   3824     }
   3825 
   3826     /*
   3827      * Now parse the attributes, it ends up with the ending
   3828      *
   3829      * (S Attribute)* S?
   3830      */
   3831     SKIP_BLANKS;
   3832     while ((IS_CHAR_CH(CUR)) &&
   3833            (CUR != '>') &&
   3834 	   ((CUR != '/') || (NXT(1) != '>'))) {
   3835 	long cons = ctxt->nbChars;
   3836 
   3837 	GROW;
   3838 	attname = htmlParseAttribute(ctxt, &attvalue);
   3839         if (attname != NULL) {
   3840 
   3841 	    /*
   3842 	     * Well formedness requires at most one declaration of an attribute
   3843 	     */
   3844 	    for (i = 0; i < nbatts;i += 2) {
   3845 	        if (xmlStrEqual(atts[i], attname)) {
   3846 		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
   3847 		                 "Attribute %s redefined\n", attname, NULL);
   3848 		    if (attvalue != NULL)
   3849 			xmlFree(attvalue);
   3850 		    goto failed;
   3851 		}
   3852 	    }
   3853 
   3854 	    /*
   3855 	     * Add the pair to atts
   3856 	     */
   3857 	    if (atts == NULL) {
   3858 	        maxatts = 22; /* allow for 10 attrs by default */
   3859 	        atts = (const xmlChar **)
   3860 		       xmlMalloc(maxatts * sizeof(xmlChar *));
   3861 		if (atts == NULL) {
   3862 		    htmlErrMemory(ctxt, NULL);
   3863 		    if (attvalue != NULL)
   3864 			xmlFree(attvalue);
   3865 		    goto failed;
   3866 		}
   3867 		ctxt->atts = atts;
   3868 		ctxt->maxatts = maxatts;
   3869 	    } else if (nbatts + 4 > maxatts) {
   3870 	        const xmlChar **n;
   3871 
   3872 	        maxatts *= 2;
   3873 	        n = (const xmlChar **) xmlRealloc((void *) atts,
   3874 					     maxatts * sizeof(const xmlChar *));
   3875 		if (n == NULL) {
   3876 		    htmlErrMemory(ctxt, NULL);
   3877 		    if (attvalue != NULL)
   3878 			xmlFree(attvalue);
   3879 		    goto failed;
   3880 		}
   3881 		atts = n;
   3882 		ctxt->atts = atts;
   3883 		ctxt->maxatts = maxatts;
   3884 	    }
   3885 	    atts[nbatts++] = attname;
   3886 	    atts[nbatts++] = attvalue;
   3887 	    atts[nbatts] = NULL;
   3888 	    atts[nbatts + 1] = NULL;
   3889 	}
   3890 	else {
   3891 	    if (attvalue != NULL)
   3892 	        xmlFree(attvalue);
   3893 	    /* Dump the bogus attribute string up to the next blank or
   3894 	     * the end of the tag. */
   3895 	    while ((IS_CHAR_CH(CUR)) &&
   3896 	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
   3897 		   ((CUR != '/') || (NXT(1) != '>')))
   3898 		NEXT;
   3899 	}
   3900 
   3901 failed:
   3902 	SKIP_BLANKS;
   3903         if (cons == ctxt->nbChars) {
   3904 	    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   3905 	                 "htmlParseStartTag: problem parsing attributes\n",
   3906 			 NULL, NULL);
   3907 	    break;
   3908 	}
   3909     }
   3910 
   3911     /*
   3912      * Handle specific association to the META tag
   3913      */
   3914     if (meta && (nbatts != 0))
   3915 	htmlCheckMeta(ctxt, atts);
   3916 
   3917     /*
   3918      * SAX: Start of Element !
   3919      */
   3920     if (!discardtag) {
   3921 	htmlnamePush(ctxt, name);
   3922 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
   3923 	    if (nbatts != 0)
   3924 		ctxt->sax->startElement(ctxt->userData, name, atts);
   3925 	    else
   3926 		ctxt->sax->startElement(ctxt->userData, name, NULL);
   3927 	}
   3928     }
   3929 
   3930     if (atts != NULL) {
   3931         for (i = 1;i < nbatts;i += 2) {
   3932 	    if (atts[i] != NULL)
   3933 		xmlFree((xmlChar *) atts[i]);
   3934 	}
   3935     }
   3936 
   3937     return(discardtag);
   3938 }
   3939 
   3940 /**
   3941  * htmlParseEndTag:
   3942  * @ctxt:  an HTML parser context
   3943  *
   3944  * parse an end of tag
   3945  *
   3946  * [42] ETag ::= '</' Name S? '>'
   3947  *
   3948  * With namespace
   3949  *
   3950  * [NS 9] ETag ::= '</' QName S? '>'
   3951  *
   3952  * Returns 1 if the current level should be closed.
   3953  */
   3954 
   3955 static int
   3956 htmlParseEndTag(htmlParserCtxtPtr ctxt)
   3957 {
   3958     const xmlChar *name;
   3959     const xmlChar *oldname;
   3960     int i, ret;
   3961 
   3962     if ((CUR != '<') || (NXT(1) != '/')) {
   3963         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
   3964 	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
   3965         return (0);
   3966     }
   3967     SKIP(2);
   3968 
   3969     name = htmlParseHTMLName(ctxt);
   3970     if (name == NULL)
   3971         return (0);
   3972     /*
   3973      * We should definitely be at the ending "S? '>'" part
   3974      */
   3975     SKIP_BLANKS;
   3976     if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
   3977         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
   3978 	             "End tag : expected '>'\n", NULL, NULL);
   3979 	if (ctxt->recovery) {
   3980 	    /*
   3981 	     * We're not at the ending > !!
   3982 	     * Error, unless in recover mode where we search forwards
   3983 	     * until we find a >
   3984 	     */
   3985 	    while (CUR != '\0' && CUR != '>') NEXT;
   3986 	    NEXT;
   3987 	}
   3988     } else
   3989         NEXT;
   3990 
   3991     /*
   3992      * if we ignored misplaced tags in htmlParseStartTag don't pop them
   3993      * out now.
   3994      */
   3995     if ((ctxt->depth > 0) &&
   3996         (xmlStrEqual(name, BAD_CAST "html") ||
   3997          xmlStrEqual(name, BAD_CAST "body") ||
   3998 	 xmlStrEqual(name, BAD_CAST "head"))) {
   3999 	ctxt->depth--;
   4000 	return (0);
   4001     }
   4002 
   4003     /*
   4004      * If the name read is not one of the element in the parsing stack
   4005      * then return, it's just an error.
   4006      */
   4007     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
   4008         if (xmlStrEqual(name, ctxt->nameTab[i]))
   4009             break;
   4010     }
   4011     if (i < 0) {
   4012         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
   4013 	             "Unexpected end tag : %s\n", name, NULL);
   4014         return (0);
   4015     }
   4016 
   4017 
   4018     /*
   4019      * Check for auto-closure of HTML elements.
   4020      */
   4021 
   4022     htmlAutoCloseOnClose(ctxt, name);
   4023 
   4024     /*
   4025      * Well formedness constraints, opening and closing must match.
   4026      * With the exception that the autoclose may have popped stuff out
   4027      * of the stack.
   4028      */
   4029     if (!xmlStrEqual(name, ctxt->name)) {
   4030         if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
   4031             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
   4032 	                 "Opening and ending tag mismatch: %s and %s\n",
   4033 			 name, ctxt->name);
   4034         }
   4035     }
   4036 
   4037     /*
   4038      * SAX: End of Tag
   4039      */
   4040     oldname = ctxt->name;
   4041     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
   4042         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   4043             ctxt->sax->endElement(ctxt->userData, name);
   4044 	htmlNodeInfoPop(ctxt);
   4045         htmlnamePop(ctxt);
   4046         ret = 1;
   4047     } else {
   4048         ret = 0;
   4049     }
   4050 
   4051     return (ret);
   4052 }
   4053 
   4054 
   4055 /**
   4056  * htmlParseReference:
   4057  * @ctxt:  an HTML parser context
   4058  *
   4059  * parse and handle entity references in content,
   4060  * this will end-up in a call to character() since this is either a
   4061  * CharRef, or a predefined entity.
   4062  */
   4063 static void
   4064 htmlParseReference(htmlParserCtxtPtr ctxt) {
   4065     const htmlEntityDesc * ent;
   4066     xmlChar out[6];
   4067     const xmlChar *name;
   4068     if (CUR != '&') return;
   4069 
   4070     if (NXT(1) == '#') {
   4071 	unsigned int c;
   4072 	int bits, i = 0;
   4073 
   4074 	c = htmlParseCharRef(ctxt);
   4075 	if (c == 0)
   4076 	    return;
   4077 
   4078         if      (c <    0x80) { out[i++]= c;                bits= -6; }
   4079         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   4080         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   4081         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   4082 
   4083         for ( ; bits >= 0; bits-= 6) {
   4084             out[i++]= ((c >> bits) & 0x3F) | 0x80;
   4085         }
   4086 	out[i] = 0;
   4087 
   4088 	htmlCheckParagraph(ctxt);
   4089 	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
   4090 	    ctxt->sax->characters(ctxt->userData, out, i);
   4091     } else {
   4092 	ent = htmlParseEntityRef(ctxt, &name);
   4093 	if (name == NULL) {
   4094 	    htmlCheckParagraph(ctxt);
   4095 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
   4096 	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
   4097 	    return;
   4098 	}
   4099 	if ((ent == NULL) || !(ent->value > 0)) {
   4100 	    htmlCheckParagraph(ctxt);
   4101 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
   4102 		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
   4103 		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
   4104 		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
   4105 	    }
   4106 	} else {
   4107 	    unsigned int c;
   4108 	    int bits, i = 0;
   4109 
   4110 	    c = ent->value;
   4111 	    if      (c <    0x80)
   4112 	            { out[i++]= c;                bits= -6; }
   4113 	    else if (c <   0x800)
   4114 	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   4115 	    else if (c < 0x10000)
   4116 	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   4117 	    else
   4118 	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   4119 
   4120 	    for ( ; bits >= 0; bits-= 6) {
   4121 		out[i++]= ((c >> bits) & 0x3F) | 0x80;
   4122 	    }
   4123 	    out[i] = 0;
   4124 
   4125 	    htmlCheckParagraph(ctxt);
   4126 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
   4127 		ctxt->sax->characters(ctxt->userData, out, i);
   4128 	}
   4129     }
   4130 }
   4131 
   4132 /**
   4133  * htmlParseContent:
   4134  * @ctxt:  an HTML parser context
   4135  *
   4136  * Parse a content: comment, sub-element, reference or text.
   4137  * Kept for compatibility with old code
   4138  */
   4139 
   4140 static void
   4141 htmlParseContent(htmlParserCtxtPtr ctxt) {
   4142     xmlChar *currentNode;
   4143     int depth;
   4144     const xmlChar *name;
   4145 
   4146     currentNode = xmlStrdup(ctxt->name);
   4147     depth = ctxt->nameNr;
   4148     while (1) {
   4149 	long cons = ctxt->nbChars;
   4150 
   4151         GROW;
   4152 
   4153         if (ctxt->instate == XML_PARSER_EOF)
   4154             break;
   4155 
   4156 	/*
   4157 	 * Our tag or one of it's parent or children is ending.
   4158 	 */
   4159         if ((CUR == '<') && (NXT(1) == '/')) {
   4160 	    if (htmlParseEndTag(ctxt) &&
   4161 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
   4162 		if (currentNode != NULL)
   4163 		    xmlFree(currentNode);
   4164 		return;
   4165 	    }
   4166 	    continue; /* while */
   4167         }
   4168 
   4169 	else if ((CUR == '<') &&
   4170 	         ((IS_ASCII_LETTER(NXT(1))) ||
   4171 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
   4172 	    name = htmlParseHTMLName_nonInvasive(ctxt);
   4173 	    if (name == NULL) {
   4174 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   4175 			 "htmlParseStartTag: invalid element name\n",
   4176 			 NULL, NULL);
   4177 	        /* Dump the bogus tag like browsers do */
   4178         while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
   4179 	            NEXT;
   4180 
   4181 	        if (currentNode != NULL)
   4182 	            xmlFree(currentNode);
   4183 	        return;
   4184 	    }
   4185 
   4186 	    if (ctxt->name != NULL) {
   4187 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
   4188 	            htmlAutoClose(ctxt, name);
   4189 	            continue;
   4190 	        }
   4191 	    }
   4192 	}
   4193 
   4194 	/*
   4195 	 * Has this node been popped out during parsing of
   4196 	 * the next element
   4197 	 */
   4198         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
   4199 	    (!xmlStrEqual(currentNode, ctxt->name)))
   4200 	     {
   4201 	    if (currentNode != NULL) xmlFree(currentNode);
   4202 	    return;
   4203 	}
   4204 
   4205 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
   4206 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
   4207 	    /*
   4208 	     * Handle SCRIPT/STYLE separately
   4209 	     */
   4210 	    htmlParseScript(ctxt);
   4211 	} else {
   4212 	    /*
   4213 	     * Sometimes DOCTYPE arrives in the middle of the document
   4214 	     */
   4215 	    if ((CUR == '<') && (NXT(1) == '!') &&
   4216 		(UPP(2) == 'D') && (UPP(3) == 'O') &&
   4217 		(UPP(4) == 'C') && (UPP(5) == 'T') &&
   4218 		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
   4219 		(UPP(8) == 'E')) {
   4220 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   4221 		             "Misplaced DOCTYPE declaration\n",
   4222 			     BAD_CAST "DOCTYPE" , NULL);
   4223 		htmlParseDocTypeDecl(ctxt);
   4224 	    }
   4225 
   4226 	    /*
   4227 	     * First case :  a comment
   4228 	     */
   4229 	    if ((CUR == '<') && (NXT(1) == '!') &&
   4230 		(NXT(2) == '-') && (NXT(3) == '-')) {
   4231 		htmlParseComment(ctxt);
   4232 	    }
   4233 
   4234 	    /*
   4235 	     * Second case : a Processing Instruction.
   4236 	     */
   4237 	    else if ((CUR == '<') && (NXT(1) == '?')) {
   4238 		htmlParsePI(ctxt);
   4239 	    }
   4240 
   4241 	    /*
   4242 	     * Third case :  a sub-element.
   4243 	     */
   4244 	    else if (CUR == '<') {
   4245 		htmlParseElement(ctxt);
   4246 	    }
   4247 
   4248 	    /*
   4249 	     * Fourth case : a reference. If if has not been resolved,
   4250 	     *    parsing returns it's Name, create the node
   4251 	     */
   4252 	    else if (CUR == '&') {
   4253 		htmlParseReference(ctxt);
   4254 	    }
   4255 
   4256 	    /*
   4257 	     * Fifth case : end of the resource
   4258 	     */
   4259 	    else if (CUR == 0) {
   4260 		htmlAutoCloseOnEnd(ctxt);
   4261 		break;
   4262 	    }
   4263 
   4264 	    /*
   4265 	     * Last case, text. Note that References are handled directly.
   4266 	     */
   4267 	    else {
   4268 		htmlParseCharData(ctxt);
   4269 	    }
   4270 
   4271 	    if (cons == ctxt->nbChars) {
   4272 		if (ctxt->node != NULL) {
   4273 		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   4274 		                 "detected an error in element content\n",
   4275 				 NULL, NULL);
   4276 		}
   4277 		break;
   4278 	    }
   4279 	}
   4280         GROW;
   4281     }
   4282     if (currentNode != NULL) xmlFree(currentNode);
   4283 }
   4284 
   4285 /**
   4286  * htmlParseElement:
   4287  * @ctxt:  an HTML parser context
   4288  *
   4289  * parse an HTML element, this is highly recursive
   4290  * this is kept for compatibility with previous code versions
   4291  *
   4292  * [39] element ::= EmptyElemTag | STag content ETag
   4293  *
   4294  * [41] Attribute ::= Name Eq AttValue
   4295  */
   4296 
   4297 void
   4298 htmlParseElement(htmlParserCtxtPtr ctxt) {
   4299     const xmlChar *name;
   4300     xmlChar *currentNode = NULL;
   4301     const htmlElemDesc * info;
   4302     htmlParserNodeInfo node_info;
   4303     int failed;
   4304     int depth;
   4305     const xmlChar *oldptr;
   4306 
   4307     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   4308 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   4309 		     "htmlParseElement: context error\n", NULL, NULL);
   4310 	return;
   4311     }
   4312 
   4313     if (ctxt->instate == XML_PARSER_EOF)
   4314         return;
   4315 
   4316     /* Capture start position */
   4317     if (ctxt->record_info) {
   4318         node_info.begin_pos = ctxt->input->consumed +
   4319                           (CUR_PTR - ctxt->input->base);
   4320 	node_info.begin_line = ctxt->input->line;
   4321     }
   4322 
   4323     failed = htmlParseStartTag(ctxt);
   4324     name = ctxt->name;
   4325     if ((failed == -1) || (name == NULL)) {
   4326 	if (CUR == '>')
   4327 	    NEXT;
   4328         return;
   4329     }
   4330 
   4331     /*
   4332      * Lookup the info for that element.
   4333      */
   4334     info = htmlTagLookup(name);
   4335     if (info == NULL) {
   4336 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
   4337 	             "Tag %s invalid\n", name, NULL);
   4338     }
   4339 
   4340     /*
   4341      * Check for an Empty Element labeled the XML/SGML way
   4342      */
   4343     if ((CUR == '/') && (NXT(1) == '>')) {
   4344         SKIP(2);
   4345 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   4346 	    ctxt->sax->endElement(ctxt->userData, name);
   4347 	htmlnamePop(ctxt);
   4348 	return;
   4349     }
   4350 
   4351     if (CUR == '>') {
   4352         NEXT;
   4353     } else {
   4354 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
   4355 	             "Couldn't find end of Start Tag %s\n", name, NULL);
   4356 
   4357 	/*
   4358 	 * end of parsing of this node.
   4359 	 */
   4360 	if (xmlStrEqual(name, ctxt->name)) {
   4361 	    nodePop(ctxt);
   4362 	    htmlnamePop(ctxt);
   4363 	}
   4364 
   4365 	/*
   4366 	 * Capture end position and add node
   4367 	 */
   4368 	if (ctxt->record_info) {
   4369 	   node_info.end_pos = ctxt->input->consumed +
   4370 			      (CUR_PTR - ctxt->input->base);
   4371 	   node_info.end_line = ctxt->input->line;
   4372 	   node_info.node = ctxt->node;
   4373 	   xmlParserAddNodeInfo(ctxt, &node_info);
   4374 	}
   4375 	return;
   4376     }
   4377 
   4378     /*
   4379      * Check for an Empty Element from DTD definition
   4380      */
   4381     if ((info != NULL) && (info->empty)) {
   4382 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   4383 	    ctxt->sax->endElement(ctxt->userData, name);
   4384 	htmlnamePop(ctxt);
   4385 	return;
   4386     }
   4387 
   4388     /*
   4389      * Parse the content of the element:
   4390      */
   4391     currentNode = xmlStrdup(ctxt->name);
   4392     depth = ctxt->nameNr;
   4393     while (IS_CHAR_CH(CUR)) {
   4394 	oldptr = ctxt->input->cur;
   4395 	htmlParseContent(ctxt);
   4396 	if (oldptr==ctxt->input->cur) break;
   4397 	if (ctxt->nameNr < depth) break;
   4398     }
   4399 
   4400     /*
   4401      * Capture end position and add node
   4402      */
   4403     if ( currentNode != NULL && ctxt->record_info ) {
   4404        node_info.end_pos = ctxt->input->consumed +
   4405                           (CUR_PTR - ctxt->input->base);
   4406        node_info.end_line = ctxt->input->line;
   4407        node_info.node = ctxt->node;
   4408        xmlParserAddNodeInfo(ctxt, &node_info);
   4409     }
   4410     if (!IS_CHAR_CH(CUR)) {
   4411 	htmlAutoCloseOnEnd(ctxt);
   4412     }
   4413 
   4414     if (currentNode != NULL)
   4415 	xmlFree(currentNode);
   4416 }
   4417 
   4418 static void
   4419 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
   4420     /*
   4421      * Capture end position and add node
   4422      */
   4423     if ( ctxt->node != NULL && ctxt->record_info ) {
   4424        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
   4425                                 (CUR_PTR - ctxt->input->base);
   4426        ctxt->nodeInfo->end_line = ctxt->input->line;
   4427        ctxt->nodeInfo->node = ctxt->node;
   4428        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
   4429        htmlNodeInfoPop(ctxt);
   4430     }
   4431     if (!IS_CHAR_CH(CUR)) {
   4432        htmlAutoCloseOnEnd(ctxt);
   4433     }
   4434 }
   4435 
   4436 /**
   4437  * htmlParseElementInternal:
   4438  * @ctxt:  an HTML parser context
   4439  *
   4440  * parse an HTML element, new version, non recursive
   4441  *
   4442  * [39] element ::= EmptyElemTag | STag content ETag
   4443  *
   4444  * [41] Attribute ::= Name Eq AttValue
   4445  */
   4446 
   4447 static void
   4448 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
   4449     const xmlChar *name;
   4450     const htmlElemDesc * info;
   4451     htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
   4452     int failed;
   4453 
   4454     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   4455 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   4456 		     "htmlParseElementInternal: context error\n", NULL, NULL);
   4457 	return;
   4458     }
   4459 
   4460     if (ctxt->instate == XML_PARSER_EOF)
   4461         return;
   4462 
   4463     /* Capture start position */
   4464     if (ctxt->record_info) {
   4465         node_info.begin_pos = ctxt->input->consumed +
   4466                           (CUR_PTR - ctxt->input->base);
   4467 	node_info.begin_line = ctxt->input->line;
   4468     }
   4469 
   4470     failed = htmlParseStartTag(ctxt);
   4471     name = ctxt->name;
   4472     if ((failed == -1) || (name == NULL)) {
   4473 	if (CUR == '>')
   4474 	    NEXT;
   4475         return;
   4476     }
   4477 
   4478     /*
   4479      * Lookup the info for that element.
   4480      */
   4481     info = htmlTagLookup(name);
   4482     if (info == NULL) {
   4483 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
   4484 	             "Tag %s invalid\n", name, NULL);
   4485     }
   4486 
   4487     /*
   4488      * Check for an Empty Element labeled the XML/SGML way
   4489      */
   4490     if ((CUR == '/') && (NXT(1) == '>')) {
   4491         SKIP(2);
   4492 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   4493 	    ctxt->sax->endElement(ctxt->userData, name);
   4494 	htmlnamePop(ctxt);
   4495 	return;
   4496     }
   4497 
   4498     if (CUR == '>') {
   4499         NEXT;
   4500     } else {
   4501 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
   4502 	             "Couldn't find end of Start Tag %s\n", name, NULL);
   4503 
   4504 	/*
   4505 	 * end of parsing of this node.
   4506 	 */
   4507 	if (xmlStrEqual(name, ctxt->name)) {
   4508 	    nodePop(ctxt);
   4509 	    htmlnamePop(ctxt);
   4510 	}
   4511 
   4512         if (ctxt->record_info)
   4513             htmlNodeInfoPush(ctxt, &node_info);
   4514         htmlParserFinishElementParsing(ctxt);
   4515 	return;
   4516     }
   4517 
   4518     /*
   4519      * Check for an Empty Element from DTD definition
   4520      */
   4521     if ((info != NULL) && (info->empty)) {
   4522 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   4523 	    ctxt->sax->endElement(ctxt->userData, name);
   4524 	htmlnamePop(ctxt);
   4525 	return;
   4526     }
   4527 
   4528     if (ctxt->record_info)
   4529         htmlNodeInfoPush(ctxt, &node_info);
   4530 }
   4531 
   4532 /**
   4533  * htmlParseContentInternal:
   4534  * @ctxt:  an HTML parser context
   4535  *
   4536  * Parse a content: comment, sub-element, reference or text.
   4537  * New version for non recursive htmlParseElementInternal
   4538  */
   4539 
   4540 static void
   4541 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
   4542     xmlChar *currentNode;
   4543     int depth;
   4544     const xmlChar *name;
   4545 
   4546     currentNode = xmlStrdup(ctxt->name);
   4547     depth = ctxt->nameNr;
   4548     while (1) {
   4549 	long cons = ctxt->nbChars;
   4550 
   4551         GROW;
   4552 
   4553         if (ctxt->instate == XML_PARSER_EOF)
   4554             break;
   4555 
   4556 	/*
   4557 	 * Our tag or one of it's parent or children is ending.
   4558 	 */
   4559         if ((CUR == '<') && (NXT(1) == '/')) {
   4560 	    if (htmlParseEndTag(ctxt) &&
   4561 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
   4562 		if (currentNode != NULL)
   4563 		    xmlFree(currentNode);
   4564 
   4565 	        currentNode = xmlStrdup(ctxt->name);
   4566 	        depth = ctxt->nameNr;
   4567 	    }
   4568 	    continue; /* while */
   4569         }
   4570 
   4571 	else if ((CUR == '<') &&
   4572 	         ((IS_ASCII_LETTER(NXT(1))) ||
   4573 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
   4574 	    name = htmlParseHTMLName_nonInvasive(ctxt);
   4575 	    if (name == NULL) {
   4576 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   4577 			 "htmlParseStartTag: invalid element name\n",
   4578 			 NULL, NULL);
   4579 	        /* Dump the bogus tag like browsers do */
   4580 	        while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
   4581 	            NEXT;
   4582 
   4583 	        htmlParserFinishElementParsing(ctxt);
   4584 	        if (currentNode != NULL)
   4585 	            xmlFree(currentNode);
   4586 
   4587 	        currentNode = xmlStrdup(ctxt->name);
   4588 	        depth = ctxt->nameNr;
   4589 	        continue;
   4590 	    }
   4591 
   4592 	    if (ctxt->name != NULL) {
   4593 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
   4594 	            htmlAutoClose(ctxt, name);
   4595 	            continue;
   4596 	        }
   4597 	    }
   4598 	}
   4599 
   4600 	/*
   4601 	 * Has this node been popped out during parsing of
   4602 	 * the next element
   4603 	 */
   4604         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
   4605 	    (!xmlStrEqual(currentNode, ctxt->name)))
   4606 	     {
   4607 	    htmlParserFinishElementParsing(ctxt);
   4608 	    if (currentNode != NULL) xmlFree(currentNode);
   4609 
   4610 	    currentNode = xmlStrdup(ctxt->name);
   4611 	    depth = ctxt->nameNr;
   4612 	    continue;
   4613 	}
   4614 
   4615 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
   4616 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
   4617 	    /*
   4618 	     * Handle SCRIPT/STYLE separately
   4619 	     */
   4620 	    htmlParseScript(ctxt);
   4621 	} else {
   4622 	    /*
   4623 	     * Sometimes DOCTYPE arrives in the middle of the document
   4624 	     */
   4625 	    if ((CUR == '<') && (NXT(1) == '!') &&
   4626 		(UPP(2) == 'D') && (UPP(3) == 'O') &&
   4627 		(UPP(4) == 'C') && (UPP(5) == 'T') &&
   4628 		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
   4629 		(UPP(8) == 'E')) {
   4630 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   4631 		             "Misplaced DOCTYPE declaration\n",
   4632 			     BAD_CAST "DOCTYPE" , NULL);
   4633 		htmlParseDocTypeDecl(ctxt);
   4634 	    }
   4635 
   4636 	    /*
   4637 	     * First case :  a comment
   4638 	     */
   4639 	    if ((CUR == '<') && (NXT(1) == '!') &&
   4640 		(NXT(2) == '-') && (NXT(3) == '-')) {
   4641 		htmlParseComment(ctxt);
   4642 	    }
   4643 
   4644 	    /*
   4645 	     * Second case : a Processing Instruction.
   4646 	     */
   4647 	    else if ((CUR == '<') && (NXT(1) == '?')) {
   4648 		htmlParsePI(ctxt);
   4649 	    }
   4650 
   4651 	    /*
   4652 	     * Third case :  a sub-element.
   4653 	     */
   4654 	    else if (CUR == '<') {
   4655 		htmlParseElementInternal(ctxt);
   4656 		if (currentNode != NULL) xmlFree(currentNode);
   4657 
   4658 		currentNode = xmlStrdup(ctxt->name);
   4659 		depth = ctxt->nameNr;
   4660 	    }
   4661 
   4662 	    /*
   4663 	     * Fourth case : a reference. If if has not been resolved,
   4664 	     *    parsing returns it's Name, create the node
   4665 	     */
   4666 	    else if (CUR == '&') {
   4667 		htmlParseReference(ctxt);
   4668 	    }
   4669 
   4670 	    /*
   4671 	     * Fifth case : end of the resource
   4672 	     */
   4673 	    else if (CUR == 0) {
   4674 		htmlAutoCloseOnEnd(ctxt);
   4675 		break;
   4676 	    }
   4677 
   4678 	    /*
   4679 	     * Last case, text. Note that References are handled directly.
   4680 	     */
   4681 	    else {
   4682 		htmlParseCharData(ctxt);
   4683 	    }
   4684 
   4685 	    if (cons == ctxt->nbChars) {
   4686 		if (ctxt->node != NULL) {
   4687 		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   4688 		                 "detected an error in element content\n",
   4689 				 NULL, NULL);
   4690 		}
   4691 		break;
   4692 	    }
   4693 	}
   4694         GROW;
   4695     }
   4696     if (currentNode != NULL) xmlFree(currentNode);
   4697 }
   4698 
   4699 /**
   4700  * htmlParseContent:
   4701  * @ctxt:  an HTML parser context
   4702  *
   4703  * Parse a content: comment, sub-element, reference or text.
   4704  * This is the entry point when called from parser.c
   4705  */
   4706 
   4707 void
   4708 __htmlParseContent(void *ctxt) {
   4709     if (ctxt != NULL)
   4710 	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
   4711 }
   4712 
   4713 /**
   4714  * htmlParseDocument:
   4715  * @ctxt:  an HTML parser context
   4716  *
   4717  * parse an HTML document (and build a tree if using the standard SAX
   4718  * interface).
   4719  *
   4720  * Returns 0, -1 in case of error. the parser context is augmented
   4721  *                as a result of the parsing.
   4722  */
   4723 
   4724 int
   4725 htmlParseDocument(htmlParserCtxtPtr ctxt) {
   4726     xmlChar start[4];
   4727     xmlCharEncoding enc;
   4728     xmlDtdPtr dtd;
   4729 
   4730     xmlInitParser();
   4731 
   4732     htmlDefaultSAXHandlerInit();
   4733 
   4734     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   4735 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   4736 		     "htmlParseDocument: context error\n", NULL, NULL);
   4737 	return(XML_ERR_INTERNAL_ERROR);
   4738     }
   4739     ctxt->html = 1;
   4740     ctxt->linenumbers = 1;
   4741     GROW;
   4742     /*
   4743      * SAX: beginning of the document processing.
   4744      */
   4745     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
   4746         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
   4747 
   4748     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
   4749         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
   4750 	/*
   4751 	 * Get the 4 first bytes and decode the charset
   4752 	 * if enc != XML_CHAR_ENCODING_NONE
   4753 	 * plug some encoding conversion routines.
   4754 	 */
   4755 	start[0] = RAW;
   4756 	start[1] = NXT(1);
   4757 	start[2] = NXT(2);
   4758 	start[3] = NXT(3);
   4759 	enc = xmlDetectCharEncoding(&start[0], 4);
   4760 	if (enc != XML_CHAR_ENCODING_NONE) {
   4761 	    xmlSwitchEncoding(ctxt, enc);
   4762 	}
   4763     }
   4764 
   4765     /*
   4766      * Wipe out everything which is before the first '<'
   4767      */
   4768     SKIP_BLANKS;
   4769     if (CUR == 0) {
   4770 	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
   4771 	             "Document is empty\n", NULL, NULL);
   4772     }
   4773 
   4774     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
   4775 	ctxt->sax->startDocument(ctxt->userData);
   4776 
   4777 
   4778     /*
   4779      * Parse possible comments and PIs before any content
   4780      */
   4781     while (((CUR == '<') && (NXT(1) == '!') &&
   4782             (NXT(2) == '-') && (NXT(3) == '-')) ||
   4783 	   ((CUR == '<') && (NXT(1) == '?'))) {
   4784         htmlParseComment(ctxt);
   4785         htmlParsePI(ctxt);
   4786 	SKIP_BLANKS;
   4787     }
   4788 
   4789 
   4790     /*
   4791      * Then possibly doc type declaration(s) and more Misc
   4792      * (doctypedecl Misc*)?
   4793      */
   4794     if ((CUR == '<') && (NXT(1) == '!') &&
   4795 	(UPP(2) == 'D') && (UPP(3) == 'O') &&
   4796 	(UPP(4) == 'C') && (UPP(5) == 'T') &&
   4797 	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
   4798 	(UPP(8) == 'E')) {
   4799 	htmlParseDocTypeDecl(ctxt);
   4800     }
   4801     SKIP_BLANKS;
   4802 
   4803     /*
   4804      * Parse possible comments and PIs before any content
   4805      */
   4806     while (((CUR == '<') && (NXT(1) == '!') &&
   4807             (NXT(2) == '-') && (NXT(3) == '-')) ||
   4808 	   ((CUR == '<') && (NXT(1) == '?'))) {
   4809         htmlParseComment(ctxt);
   4810         htmlParsePI(ctxt);
   4811 	SKIP_BLANKS;
   4812     }
   4813 
   4814     /*
   4815      * Time to start parsing the tree itself
   4816      */
   4817     htmlParseContentInternal(ctxt);
   4818 
   4819     /*
   4820      * autoclose
   4821      */
   4822     if (CUR == 0)
   4823 	htmlAutoCloseOnEnd(ctxt);
   4824 
   4825 
   4826     /*
   4827      * SAX: end of the document processing.
   4828      */
   4829     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
   4830         ctxt->sax->endDocument(ctxt->userData);
   4831 
   4832     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
   4833 	dtd = xmlGetIntSubset(ctxt->myDoc);
   4834 	if (dtd == NULL)
   4835 	    ctxt->myDoc->intSubset =
   4836 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
   4837 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
   4838 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
   4839     }
   4840     if (! ctxt->wellFormed) return(-1);
   4841     return(0);
   4842 }
   4843 
   4844 
   4845 /************************************************************************
   4846  *									*
   4847  *			Parser contexts handling			*
   4848  *									*
   4849  ************************************************************************/
   4850 
   4851 /**
   4852  * htmlInitParserCtxt:
   4853  * @ctxt:  an HTML parser context
   4854  *
   4855  * Initialize a parser context
   4856  *
   4857  * Returns 0 in case of success and -1 in case of error
   4858  */
   4859 
   4860 static int
   4861 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
   4862 {
   4863     htmlSAXHandler *sax;
   4864 
   4865     if (ctxt == NULL) return(-1);
   4866     memset(ctxt, 0, sizeof(htmlParserCtxt));
   4867 
   4868     ctxt->dict = xmlDictCreate();
   4869     if (ctxt->dict == NULL) {
   4870         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
   4871 	return(-1);
   4872     }
   4873     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
   4874     if (sax == NULL) {
   4875         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
   4876 	return(-1);
   4877     }
   4878     else
   4879         memset(sax, 0, sizeof(htmlSAXHandler));
   4880 
   4881     /* Allocate the Input stack */
   4882     ctxt->inputTab = (htmlParserInputPtr *)
   4883                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
   4884     if (ctxt->inputTab == NULL) {
   4885         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
   4886 	ctxt->inputNr = 0;
   4887 	ctxt->inputMax = 0;
   4888 	ctxt->input = NULL;
   4889 	return(-1);
   4890     }
   4891     ctxt->inputNr = 0;
   4892     ctxt->inputMax = 5;
   4893     ctxt->input = NULL;
   4894     ctxt->version = NULL;
   4895     ctxt->encoding = NULL;
   4896     ctxt->standalone = -1;
   4897     ctxt->instate = XML_PARSER_START;
   4898 
   4899     /* Allocate the Node stack */
   4900     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
   4901     if (ctxt->nodeTab == NULL) {
   4902         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
   4903 	ctxt->nodeNr = 0;
   4904 	ctxt->nodeMax = 0;
   4905 	ctxt->node = NULL;
   4906 	ctxt->inputNr = 0;
   4907 	ctxt->inputMax = 0;
   4908 	ctxt->input = NULL;
   4909 	return(-1);
   4910     }
   4911     ctxt->nodeNr = 0;
   4912     ctxt->nodeMax = 10;
   4913     ctxt->node = NULL;
   4914 
   4915     /* Allocate the Name stack */
   4916     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
   4917     if (ctxt->nameTab == NULL) {
   4918         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
   4919 	ctxt->nameNr = 0;
   4920 	ctxt->nameMax = 0;
   4921 	ctxt->name = NULL;
   4922 	ctxt->nodeNr = 0;
   4923 	ctxt->nodeMax = 0;
   4924 	ctxt->node = NULL;
   4925 	ctxt->inputNr = 0;
   4926 	ctxt->inputMax = 0;
   4927 	ctxt->input = NULL;
   4928 	return(-1);
   4929     }
   4930     ctxt->nameNr = 0;
   4931     ctxt->nameMax = 10;
   4932     ctxt->name = NULL;
   4933 
   4934     ctxt->nodeInfoTab = NULL;
   4935     ctxt->nodeInfoNr  = 0;
   4936     ctxt->nodeInfoMax = 0;
   4937 
   4938     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
   4939     else {
   4940         ctxt->sax = sax;
   4941 	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
   4942     }
   4943     ctxt->userData = ctxt;
   4944     ctxt->myDoc = NULL;
   4945     ctxt->wellFormed = 1;
   4946     ctxt->replaceEntities = 0;
   4947     ctxt->linenumbers = xmlLineNumbersDefaultValue;
   4948     ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
   4949     ctxt->html = 1;
   4950     ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
   4951     ctxt->vctxt.userData = ctxt;
   4952     ctxt->vctxt.error = xmlParserValidityError;
   4953     ctxt->vctxt.warning = xmlParserValidityWarning;
   4954     ctxt->record_info = 0;
   4955     ctxt->validate = 0;
   4956     ctxt->nbChars = 0;
   4957     ctxt->checkIndex = 0;
   4958     ctxt->catalogs = NULL;
   4959     xmlInitNodeInfoSeq(&ctxt->node_seq);
   4960     return(0);
   4961 }
   4962 
   4963 /**
   4964  * htmlFreeParserCtxt:
   4965  * @ctxt:  an HTML parser context
   4966  *
   4967  * Free all the memory used by a parser context. However the parsed
   4968  * document in ctxt->myDoc is not freed.
   4969  */
   4970 
   4971 void
   4972 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
   4973 {
   4974     xmlFreeParserCtxt(ctxt);
   4975 }
   4976 
   4977 /**
   4978  * htmlNewParserCtxt:
   4979  *
   4980  * Allocate and initialize a new parser context.
   4981  *
   4982  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
   4983  */
   4984 
   4985 htmlParserCtxtPtr
   4986 htmlNewParserCtxt(void)
   4987 {
   4988     xmlParserCtxtPtr ctxt;
   4989 
   4990     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
   4991     if (ctxt == NULL) {
   4992         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
   4993 	return(NULL);
   4994     }
   4995     memset(ctxt, 0, sizeof(xmlParserCtxt));
   4996     if (htmlInitParserCtxt(ctxt) < 0) {
   4997         htmlFreeParserCtxt(ctxt);
   4998 	return(NULL);
   4999     }
   5000     return(ctxt);
   5001 }
   5002 
   5003 /**
   5004  * htmlCreateMemoryParserCtxt:
   5005  * @buffer:  a pointer to a char array
   5006  * @size:  the size of the array
   5007  *
   5008  * Create a parser context for an HTML in-memory document.
   5009  *
   5010  * Returns the new parser context or NULL
   5011  */
   5012 htmlParserCtxtPtr
   5013 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
   5014     xmlParserCtxtPtr ctxt;
   5015     xmlParserInputPtr input;
   5016     xmlParserInputBufferPtr buf;
   5017 
   5018     if (buffer == NULL)
   5019 	return(NULL);
   5020     if (size <= 0)
   5021 	return(NULL);
   5022 
   5023     ctxt = htmlNewParserCtxt();
   5024     if (ctxt == NULL)
   5025 	return(NULL);
   5026 
   5027     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
   5028     if (buf == NULL) return(NULL);
   5029 
   5030     input = xmlNewInputStream(ctxt);
   5031     if (input == NULL) {
   5032 	xmlFreeParserCtxt(ctxt);
   5033 	return(NULL);
   5034     }
   5035 
   5036     input->filename = NULL;
   5037     input->buf = buf;
   5038     xmlBufResetInput(buf->buffer, input);
   5039 
   5040     inputPush(ctxt, input);
   5041     return(ctxt);
   5042 }
   5043 
   5044 /**
   5045  * htmlCreateDocParserCtxt:
   5046  * @cur:  a pointer to an array of xmlChar
   5047  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   5048  *
   5049  * Create a parser context for an HTML document.
   5050  *
   5051  * TODO: check the need to add encoding handling there
   5052  *
   5053  * Returns the new parser context or NULL
   5054  */
   5055 static htmlParserCtxtPtr
   5056 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
   5057     int len;
   5058     htmlParserCtxtPtr ctxt;
   5059 
   5060     if (cur == NULL)
   5061 	return(NULL);
   5062     len = xmlStrlen(cur);
   5063     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
   5064     if (ctxt == NULL)
   5065 	return(NULL);
   5066 
   5067     if (encoding != NULL) {
   5068 	xmlCharEncoding enc;
   5069 	xmlCharEncodingHandlerPtr handler;
   5070 
   5071 	if (ctxt->input->encoding != NULL)
   5072 	    xmlFree((xmlChar *) ctxt->input->encoding);
   5073 	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
   5074 
   5075 	enc = xmlParseCharEncoding(encoding);
   5076 	/*
   5077 	 * registered set of known encodings
   5078 	 */
   5079 	if (enc != XML_CHAR_ENCODING_ERROR) {
   5080 	    xmlSwitchEncoding(ctxt, enc);
   5081 	    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
   5082 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
   5083 		             "Unsupported encoding %s\n",
   5084 			     (const xmlChar *) encoding, NULL);
   5085 	    }
   5086 	} else {
   5087 	    /*
   5088 	     * fallback for unknown encodings
   5089 	     */
   5090 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
   5091 	    if (handler != NULL) {
   5092 		xmlSwitchToEncoding(ctxt, handler);
   5093 	    } else {
   5094 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
   5095 		             "Unsupported encoding %s\n",
   5096 			     (const xmlChar *) encoding, NULL);
   5097 	    }
   5098 	}
   5099     }
   5100     return(ctxt);
   5101 }
   5102 
   5103 #ifdef LIBXML_PUSH_ENABLED
   5104 /************************************************************************
   5105  *									*
   5106  *	Progressive parsing interfaces				*
   5107  *									*
   5108  ************************************************************************/
   5109 
   5110 /**
   5111  * htmlParseLookupSequence:
   5112  * @ctxt:  an HTML parser context
   5113  * @first:  the first char to lookup
   5114  * @next:  the next char to lookup or zero
   5115  * @third:  the next char to lookup or zero
   5116  * @comment: flag to force checking inside comments
   5117  *
   5118  * Try to find if a sequence (first, next, third) or  just (first next) or
   5119  * (first) is available in the input stream.
   5120  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
   5121  * to avoid rescanning sequences of bytes, it DOES change the state of the
   5122  * parser, do not use liberally.
   5123  * This is basically similar to xmlParseLookupSequence()
   5124  *
   5125  * Returns the index to the current parsing point if the full sequence
   5126  *      is available, -1 otherwise.
   5127  */
   5128 static int
   5129 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
   5130                         xmlChar next, xmlChar third, int iscomment,
   5131                         int ignoreattrval)
   5132 {
   5133     int base, len;
   5134     htmlParserInputPtr in;
   5135     const xmlChar *buf;
   5136     int incomment = 0;
   5137     int invalue = 0;
   5138     char valdellim = 0x0;
   5139 
   5140     in = ctxt->input;
   5141     if (in == NULL)
   5142         return (-1);
   5143 
   5144     base = in->cur - in->base;
   5145     if (base < 0)
   5146         return (-1);
   5147 
   5148     if (ctxt->checkIndex > base)
   5149         base = ctxt->checkIndex;
   5150 
   5151     if (in->buf == NULL) {
   5152         buf = in->base;
   5153         len = in->length;
   5154     } else {
   5155         buf = xmlBufContent(in->buf->buffer);
   5156         len = xmlBufUse(in->buf->buffer);
   5157     }
   5158 
   5159     /* take into account the sequence length */
   5160     if (third)
   5161         len -= 2;
   5162     else if (next)
   5163         len--;
   5164     for (; base < len; base++) {
   5165         if ((!incomment) && (base + 4 < len) && (!iscomment)) {
   5166             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
   5167                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
   5168                 incomment = 1;
   5169                 /* do not increment past <! - some people use <!--> */
   5170                 base += 2;
   5171             }
   5172         }
   5173         if (ignoreattrval) {
   5174             if (buf[base] == '"' || buf[base] == '\'') {
   5175                 if (invalue) {
   5176                     if (buf[base] == valdellim) {
   5177                         invalue = 0;
   5178                         continue;
   5179                     }
   5180                 } else {
   5181                     valdellim = buf[base];
   5182                     invalue = 1;
   5183                     continue;
   5184                 }
   5185             } else if (invalue) {
   5186                 continue;
   5187             }
   5188         }
   5189         if (incomment) {
   5190             if (base + 3 > len)
   5191                 return (-1);
   5192             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
   5193                 (buf[base + 2] == '>')) {
   5194                 incomment = 0;
   5195                 base += 2;
   5196             }
   5197             continue;
   5198         }
   5199         if (buf[base] == first) {
   5200             if (third != 0) {
   5201                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
   5202                     continue;
   5203             } else if (next != 0) {
   5204                 if (buf[base + 1] != next)
   5205                     continue;
   5206             }
   5207             ctxt->checkIndex = 0;
   5208 #ifdef DEBUG_PUSH
   5209             if (next == 0)
   5210                 xmlGenericError(xmlGenericErrorContext,
   5211                                 "HPP: lookup '%c' found at %d\n",
   5212                                 first, base);
   5213             else if (third == 0)
   5214                 xmlGenericError(xmlGenericErrorContext,
   5215                                 "HPP: lookup '%c%c' found at %d\n",
   5216                                 first, next, base);
   5217             else
   5218                 xmlGenericError(xmlGenericErrorContext,
   5219                                 "HPP: lookup '%c%c%c' found at %d\n",
   5220                                 first, next, third, base);
   5221 #endif
   5222             return (base - (in->cur - in->base));
   5223         }
   5224     }
   5225     if ((!incomment) && (!invalue))
   5226         ctxt->checkIndex = base;
   5227 #ifdef DEBUG_PUSH
   5228     if (next == 0)
   5229         xmlGenericError(xmlGenericErrorContext,
   5230                         "HPP: lookup '%c' failed\n", first);
   5231     else if (third == 0)
   5232         xmlGenericError(xmlGenericErrorContext,
   5233                         "HPP: lookup '%c%c' failed\n", first, next);
   5234     else
   5235         xmlGenericError(xmlGenericErrorContext,
   5236                         "HPP: lookup '%c%c%c' failed\n", first, next,
   5237                         third);
   5238 #endif
   5239     return (-1);
   5240 }
   5241 
   5242 /**
   5243  * htmlParseLookupChars:
   5244  * @ctxt: an HTML parser context
   5245  * @stop: Array of chars, which stop the lookup.
   5246  * @stopLen: Length of stop-Array
   5247  *
   5248  * Try to find if any char of the stop-Array is available in the input
   5249  * stream.
   5250  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
   5251  * to avoid rescanning sequences of bytes, it DOES change the state of the
   5252  * parser, do not use liberally.
   5253  *
   5254  * Returns the index to the current parsing point if a stopChar
   5255  *      is available, -1 otherwise.
   5256  */
   5257 static int
   5258 htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
   5259                      int stopLen)
   5260 {
   5261     int base, len;
   5262     htmlParserInputPtr in;
   5263     const xmlChar *buf;
   5264     int incomment = 0;
   5265     int i;
   5266 
   5267     in = ctxt->input;
   5268     if (in == NULL)
   5269         return (-1);
   5270 
   5271     base = in->cur - in->base;
   5272     if (base < 0)
   5273         return (-1);
   5274 
   5275     if (ctxt->checkIndex > base)
   5276         base = ctxt->checkIndex;
   5277 
   5278     if (in->buf == NULL) {
   5279         buf = in->base;
   5280         len = in->length;
   5281     } else {
   5282         buf = xmlBufContent(in->buf->buffer);
   5283         len = xmlBufUse(in->buf->buffer);
   5284     }
   5285 
   5286     for (; base < len; base++) {
   5287         if (!incomment && (base + 4 < len)) {
   5288             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
   5289                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
   5290                 incomment = 1;
   5291                 /* do not increment past <! - some people use <!--> */
   5292                 base += 2;
   5293             }
   5294         }
   5295         if (incomment) {
   5296             if (base + 3 > len)
   5297                 return (-1);
   5298             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
   5299                 (buf[base + 2] == '>')) {
   5300                 incomment = 0;
   5301                 base += 2;
   5302             }
   5303             continue;
   5304         }
   5305         for (i = 0; i < stopLen; ++i) {
   5306             if (buf[base] == stop[i]) {
   5307                 ctxt->checkIndex = 0;
   5308                 return (base - (in->cur - in->base));
   5309             }
   5310         }
   5311     }
   5312     ctxt->checkIndex = base;
   5313     return (-1);
   5314 }
   5315 
   5316 /**
   5317  * htmlParseTryOrFinish:
   5318  * @ctxt:  an HTML parser context
   5319  * @terminate:  last chunk indicator
   5320  *
   5321  * Try to progress on parsing
   5322  *
   5323  * Returns zero if no parsing was possible
   5324  */
   5325 static int
   5326 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
   5327     int ret = 0;
   5328     htmlParserInputPtr in;
   5329     int avail = 0;
   5330     xmlChar cur, next;
   5331 
   5332     htmlParserNodeInfo node_info;
   5333 
   5334 #ifdef DEBUG_PUSH
   5335     switch (ctxt->instate) {
   5336 	case XML_PARSER_EOF:
   5337 	    xmlGenericError(xmlGenericErrorContext,
   5338 		    "HPP: try EOF\n"); break;
   5339 	case XML_PARSER_START:
   5340 	    xmlGenericError(xmlGenericErrorContext,
   5341 		    "HPP: try START\n"); break;
   5342 	case XML_PARSER_MISC:
   5343 	    xmlGenericError(xmlGenericErrorContext,
   5344 		    "HPP: try MISC\n");break;
   5345 	case XML_PARSER_COMMENT:
   5346 	    xmlGenericError(xmlGenericErrorContext,
   5347 		    "HPP: try COMMENT\n");break;
   5348 	case XML_PARSER_PROLOG:
   5349 	    xmlGenericError(xmlGenericErrorContext,
   5350 		    "HPP: try PROLOG\n");break;
   5351 	case XML_PARSER_START_TAG:
   5352 	    xmlGenericError(xmlGenericErrorContext,
   5353 		    "HPP: try START_TAG\n");break;
   5354 	case XML_PARSER_CONTENT:
   5355 	    xmlGenericError(xmlGenericErrorContext,
   5356 		    "HPP: try CONTENT\n");break;
   5357 	case XML_PARSER_CDATA_SECTION:
   5358 	    xmlGenericError(xmlGenericErrorContext,
   5359 		    "HPP: try CDATA_SECTION\n");break;
   5360 	case XML_PARSER_END_TAG:
   5361 	    xmlGenericError(xmlGenericErrorContext,
   5362 		    "HPP: try END_TAG\n");break;
   5363 	case XML_PARSER_ENTITY_DECL:
   5364 	    xmlGenericError(xmlGenericErrorContext,
   5365 		    "HPP: try ENTITY_DECL\n");break;
   5366 	case XML_PARSER_ENTITY_VALUE:
   5367 	    xmlGenericError(xmlGenericErrorContext,
   5368 		    "HPP: try ENTITY_VALUE\n");break;
   5369 	case XML_PARSER_ATTRIBUTE_VALUE:
   5370 	    xmlGenericError(xmlGenericErrorContext,
   5371 		    "HPP: try ATTRIBUTE_VALUE\n");break;
   5372 	case XML_PARSER_DTD:
   5373 	    xmlGenericError(xmlGenericErrorContext,
   5374 		    "HPP: try DTD\n");break;
   5375 	case XML_PARSER_EPILOG:
   5376 	    xmlGenericError(xmlGenericErrorContext,
   5377 		    "HPP: try EPILOG\n");break;
   5378 	case XML_PARSER_PI:
   5379 	    xmlGenericError(xmlGenericErrorContext,
   5380 		    "HPP: try PI\n");break;
   5381 	case XML_PARSER_SYSTEM_LITERAL:
   5382 	    xmlGenericError(xmlGenericErrorContext,
   5383 		    "HPP: try SYSTEM_LITERAL\n");break;
   5384     }
   5385 #endif
   5386 
   5387     while (1) {
   5388 
   5389 	in = ctxt->input;
   5390 	if (in == NULL) break;
   5391 	if (in->buf == NULL)
   5392 	    avail = in->length - (in->cur - in->base);
   5393 	else
   5394 	    avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
   5395 	if ((avail == 0) && (terminate)) {
   5396 	    htmlAutoCloseOnEnd(ctxt);
   5397 	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
   5398 		/*
   5399 		 * SAX: end of the document processing.
   5400 		 */
   5401 		ctxt->instate = XML_PARSER_EOF;
   5402 		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
   5403 		    ctxt->sax->endDocument(ctxt->userData);
   5404 	    }
   5405 	}
   5406         if (avail < 1)
   5407 	    goto done;
   5408 	cur = in->cur[0];
   5409 	if (cur == 0) {
   5410 	    SKIP(1);
   5411 	    continue;
   5412 	}
   5413 
   5414         switch (ctxt->instate) {
   5415             case XML_PARSER_EOF:
   5416 	        /*
   5417 		 * Document parsing is done !
   5418 		 */
   5419 	        goto done;
   5420             case XML_PARSER_START:
   5421 	        /*
   5422 		 * Very first chars read from the document flow.
   5423 		 */
   5424 		cur = in->cur[0];
   5425 		if (IS_BLANK_CH(cur)) {
   5426 		    SKIP_BLANKS;
   5427 		    if (in->buf == NULL)
   5428 			avail = in->length - (in->cur - in->base);
   5429 		    else
   5430 			avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
   5431 		}
   5432 		if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
   5433 		    ctxt->sax->setDocumentLocator(ctxt->userData,
   5434 						  &xmlDefaultSAXLocator);
   5435 		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
   5436 	            (!ctxt->disableSAX))
   5437 		    ctxt->sax->startDocument(ctxt->userData);
   5438 
   5439 		cur = in->cur[0];
   5440 		next = in->cur[1];
   5441 		if ((cur == '<') && (next == '!') &&
   5442 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
   5443 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
   5444 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
   5445 		    (UPP(8) == 'E')) {
   5446 		    if ((!terminate) &&
   5447 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5448 			goto done;
   5449 #ifdef DEBUG_PUSH
   5450 		    xmlGenericError(xmlGenericErrorContext,
   5451 			    "HPP: Parsing internal subset\n");
   5452 #endif
   5453 		    htmlParseDocTypeDecl(ctxt);
   5454 		    ctxt->instate = XML_PARSER_PROLOG;
   5455 #ifdef DEBUG_PUSH
   5456 		    xmlGenericError(xmlGenericErrorContext,
   5457 			    "HPP: entering PROLOG\n");
   5458 #endif
   5459                 } else {
   5460 		    ctxt->instate = XML_PARSER_MISC;
   5461 #ifdef DEBUG_PUSH
   5462 		    xmlGenericError(xmlGenericErrorContext,
   5463 			    "HPP: entering MISC\n");
   5464 #endif
   5465 		}
   5466 		break;
   5467             case XML_PARSER_MISC:
   5468 		SKIP_BLANKS;
   5469 		if (in->buf == NULL)
   5470 		    avail = in->length - (in->cur - in->base);
   5471 		else
   5472 		    avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
   5473 		/*
   5474 		 * no chars in buffer
   5475 		 */
   5476 		if (avail < 1)
   5477 		    goto done;
   5478 		/*
   5479 		 * not enouth chars in buffer
   5480 		 */
   5481 		if (avail < 2) {
   5482 		    if (!terminate)
   5483 			goto done;
   5484 		    else
   5485 			next = ' ';
   5486 		} else {
   5487 		    next = in->cur[1];
   5488 		}
   5489 		cur = in->cur[0];
   5490 	        if ((cur == '<') && (next == '!') &&
   5491 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
   5492 		    if ((!terminate) &&
   5493 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
   5494 			goto done;
   5495 #ifdef DEBUG_PUSH
   5496 		    xmlGenericError(xmlGenericErrorContext,
   5497 			    "HPP: Parsing Comment\n");
   5498 #endif
   5499 		    htmlParseComment(ctxt);
   5500 		    ctxt->instate = XML_PARSER_MISC;
   5501 	        } else if ((cur == '<') && (next == '?')) {
   5502 		    if ((!terminate) &&
   5503 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5504 			goto done;
   5505 #ifdef DEBUG_PUSH
   5506 		    xmlGenericError(xmlGenericErrorContext,
   5507 			    "HPP: Parsing PI\n");
   5508 #endif
   5509 		    htmlParsePI(ctxt);
   5510 		    ctxt->instate = XML_PARSER_MISC;
   5511 		} else if ((cur == '<') && (next == '!') &&
   5512 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
   5513 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
   5514 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
   5515 		    (UPP(8) == 'E')) {
   5516 		    if ((!terminate) &&
   5517 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5518 			goto done;
   5519 #ifdef DEBUG_PUSH
   5520 		    xmlGenericError(xmlGenericErrorContext,
   5521 			    "HPP: Parsing internal subset\n");
   5522 #endif
   5523 		    htmlParseDocTypeDecl(ctxt);
   5524 		    ctxt->instate = XML_PARSER_PROLOG;
   5525 #ifdef DEBUG_PUSH
   5526 		    xmlGenericError(xmlGenericErrorContext,
   5527 			    "HPP: entering PROLOG\n");
   5528 #endif
   5529 		} else if ((cur == '<') && (next == '!') &&
   5530 		           (avail < 9)) {
   5531 		    goto done;
   5532 		} else {
   5533 		    ctxt->instate = XML_PARSER_START_TAG;
   5534 #ifdef DEBUG_PUSH
   5535 		    xmlGenericError(xmlGenericErrorContext,
   5536 			    "HPP: entering START_TAG\n");
   5537 #endif
   5538 		}
   5539 		break;
   5540             case XML_PARSER_PROLOG:
   5541 		SKIP_BLANKS;
   5542 		if (in->buf == NULL)
   5543 		    avail = in->length - (in->cur - in->base);
   5544 		else
   5545 		    avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
   5546 		if (avail < 2)
   5547 		    goto done;
   5548 		cur = in->cur[0];
   5549 		next = in->cur[1];
   5550 		if ((cur == '<') && (next == '!') &&
   5551 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
   5552 		    if ((!terminate) &&
   5553 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
   5554 			goto done;
   5555 #ifdef DEBUG_PUSH
   5556 		    xmlGenericError(xmlGenericErrorContext,
   5557 			    "HPP: Parsing Comment\n");
   5558 #endif
   5559 		    htmlParseComment(ctxt);
   5560 		    ctxt->instate = XML_PARSER_PROLOG;
   5561 	        } else if ((cur == '<') && (next == '?')) {
   5562 		    if ((!terminate) &&
   5563 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5564 			goto done;
   5565 #ifdef DEBUG_PUSH
   5566 		    xmlGenericError(xmlGenericErrorContext,
   5567 			    "HPP: Parsing PI\n");
   5568 #endif
   5569 		    htmlParsePI(ctxt);
   5570 		    ctxt->instate = XML_PARSER_PROLOG;
   5571 		} else if ((cur == '<') && (next == '!') &&
   5572 		           (avail < 4)) {
   5573 		    goto done;
   5574 		} else {
   5575 		    ctxt->instate = XML_PARSER_START_TAG;
   5576 #ifdef DEBUG_PUSH
   5577 		    xmlGenericError(xmlGenericErrorContext,
   5578 			    "HPP: entering START_TAG\n");
   5579 #endif
   5580 		}
   5581 		break;
   5582             case XML_PARSER_EPILOG:
   5583 		if (in->buf == NULL)
   5584 		    avail = in->length - (in->cur - in->base);
   5585 		else
   5586 		    avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
   5587 		if (avail < 1)
   5588 		    goto done;
   5589 		cur = in->cur[0];
   5590 		if (IS_BLANK_CH(cur)) {
   5591 		    htmlParseCharData(ctxt);
   5592 		    goto done;
   5593 		}
   5594 		if (avail < 2)
   5595 		    goto done;
   5596 		next = in->cur[1];
   5597 	        if ((cur == '<') && (next == '!') &&
   5598 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
   5599 		    if ((!terminate) &&
   5600 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
   5601 			goto done;
   5602 #ifdef DEBUG_PUSH
   5603 		    xmlGenericError(xmlGenericErrorContext,
   5604 			    "HPP: Parsing Comment\n");
   5605 #endif
   5606 		    htmlParseComment(ctxt);
   5607 		    ctxt->instate = XML_PARSER_EPILOG;
   5608 	        } else if ((cur == '<') && (next == '?')) {
   5609 		    if ((!terminate) &&
   5610 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5611 			goto done;
   5612 #ifdef DEBUG_PUSH
   5613 		    xmlGenericError(xmlGenericErrorContext,
   5614 			    "HPP: Parsing PI\n");
   5615 #endif
   5616 		    htmlParsePI(ctxt);
   5617 		    ctxt->instate = XML_PARSER_EPILOG;
   5618 		} else if ((cur == '<') && (next == '!') &&
   5619 		           (avail < 4)) {
   5620 		    goto done;
   5621 		} else {
   5622 		    ctxt->errNo = XML_ERR_DOCUMENT_END;
   5623 		    ctxt->wellFormed = 0;
   5624 		    ctxt->instate = XML_PARSER_EOF;
   5625 #ifdef DEBUG_PUSH
   5626 		    xmlGenericError(xmlGenericErrorContext,
   5627 			    "HPP: entering EOF\n");
   5628 #endif
   5629 		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
   5630 			ctxt->sax->endDocument(ctxt->userData);
   5631 		    goto done;
   5632 		}
   5633 		break;
   5634             case XML_PARSER_START_TAG: {
   5635 	        const xmlChar *name;
   5636 		int failed;
   5637 		const htmlElemDesc * info;
   5638 
   5639 		/*
   5640 		 * no chars in buffer
   5641 		 */
   5642 		if (avail < 1)
   5643 		    goto done;
   5644 		/*
   5645 		 * not enouth chars in buffer
   5646 		 */
   5647 		if (avail < 2) {
   5648 		    if (!terminate)
   5649 			goto done;
   5650 		    else
   5651 			next = ' ';
   5652 		} else {
   5653 		    next = in->cur[1];
   5654 		}
   5655 		cur = in->cur[0];
   5656 	        if (cur != '<') {
   5657 		    ctxt->instate = XML_PARSER_CONTENT;
   5658 #ifdef DEBUG_PUSH
   5659 		    xmlGenericError(xmlGenericErrorContext,
   5660 			    "HPP: entering CONTENT\n");
   5661 #endif
   5662 		    break;
   5663 		}
   5664 		if (next == '/') {
   5665 		    ctxt->instate = XML_PARSER_END_TAG;
   5666 		    ctxt->checkIndex = 0;
   5667 #ifdef DEBUG_PUSH
   5668 		    xmlGenericError(xmlGenericErrorContext,
   5669 			    "HPP: entering END_TAG\n");
   5670 #endif
   5671 		    break;
   5672 		}
   5673 		if ((!terminate) &&
   5674 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5675 		    goto done;
   5676 
   5677                 /* Capture start position */
   5678 	        if (ctxt->record_info) {
   5679 	             node_info.begin_pos = ctxt->input->consumed +
   5680 	                                (CUR_PTR - ctxt->input->base);
   5681 	             node_info.begin_line = ctxt->input->line;
   5682 	        }
   5683 
   5684 
   5685 		failed = htmlParseStartTag(ctxt);
   5686 		name = ctxt->name;
   5687 		if ((failed == -1) ||
   5688 		    (name == NULL)) {
   5689 		    if (CUR == '>')
   5690 			NEXT;
   5691 		    break;
   5692 		}
   5693 
   5694 		/*
   5695 		 * Lookup the info for that element.
   5696 		 */
   5697 		info = htmlTagLookup(name);
   5698 		if (info == NULL) {
   5699 		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
   5700 		                 "Tag %s invalid\n", name, NULL);
   5701 		}
   5702 
   5703 		/*
   5704 		 * Check for an Empty Element labeled the XML/SGML way
   5705 		 */
   5706 		if ((CUR == '/') && (NXT(1) == '>')) {
   5707 		    SKIP(2);
   5708 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   5709 			ctxt->sax->endElement(ctxt->userData, name);
   5710 		    htmlnamePop(ctxt);
   5711 		    ctxt->instate = XML_PARSER_CONTENT;
   5712 #ifdef DEBUG_PUSH
   5713 		    xmlGenericError(xmlGenericErrorContext,
   5714 			    "HPP: entering CONTENT\n");
   5715 #endif
   5716 		    break;
   5717 		}
   5718 
   5719 		if (CUR == '>') {
   5720 		    NEXT;
   5721 		} else {
   5722 		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
   5723 		                 "Couldn't find end of Start Tag %s\n",
   5724 				 name, NULL);
   5725 
   5726 		    /*
   5727 		     * end of parsing of this node.
   5728 		     */
   5729 		    if (xmlStrEqual(name, ctxt->name)) {
   5730 			nodePop(ctxt);
   5731 			htmlnamePop(ctxt);
   5732 		    }
   5733 
   5734 		    if (ctxt->record_info)
   5735 		        htmlNodeInfoPush(ctxt, &node_info);
   5736 
   5737 		    ctxt->instate = XML_PARSER_CONTENT;
   5738 #ifdef DEBUG_PUSH
   5739 		    xmlGenericError(xmlGenericErrorContext,
   5740 			    "HPP: entering CONTENT\n");
   5741 #endif
   5742 		    break;
   5743 		}
   5744 
   5745 		/*
   5746 		 * Check for an Empty Element from DTD definition
   5747 		 */
   5748 		if ((info != NULL) && (info->empty)) {
   5749 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   5750 			ctxt->sax->endElement(ctxt->userData, name);
   5751 		    htmlnamePop(ctxt);
   5752 		}
   5753 
   5754                 if (ctxt->record_info)
   5755 	            htmlNodeInfoPush(ctxt, &node_info);
   5756 
   5757 		ctxt->instate = XML_PARSER_CONTENT;
   5758 #ifdef DEBUG_PUSH
   5759 		xmlGenericError(xmlGenericErrorContext,
   5760 			"HPP: entering CONTENT\n");
   5761 #endif
   5762                 break;
   5763 	    }
   5764             case XML_PARSER_CONTENT: {
   5765 		long cons;
   5766                 /*
   5767 		 * Handle preparsed entities and charRef
   5768 		 */
   5769 		if (ctxt->token != 0) {
   5770 		    xmlChar chr[2] = { 0 , 0 } ;
   5771 
   5772 		    chr[0] = (xmlChar) ctxt->token;
   5773 		    htmlCheckParagraph(ctxt);
   5774 		    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
   5775 			ctxt->sax->characters(ctxt->userData, chr, 1);
   5776 		    ctxt->token = 0;
   5777 		    ctxt->checkIndex = 0;
   5778 		}
   5779 		if ((avail == 1) && (terminate)) {
   5780 		    cur = in->cur[0];
   5781 		    if ((cur != '<') && (cur != '&')) {
   5782 			if (ctxt->sax != NULL) {
   5783 			    if (IS_BLANK_CH(cur)) {
   5784 				if (ctxt->keepBlanks) {
   5785 				    if (ctxt->sax->characters != NULL)
   5786 					ctxt->sax->characters(
   5787 						ctxt->userData, &in->cur[0], 1);
   5788 				} else {
   5789 				    if (ctxt->sax->ignorableWhitespace != NULL)
   5790 					ctxt->sax->ignorableWhitespace(
   5791 						ctxt->userData, &in->cur[0], 1);
   5792 				}
   5793 			    } else {
   5794 				htmlCheckParagraph(ctxt);
   5795 				if (ctxt->sax->characters != NULL)
   5796 				    ctxt->sax->characters(
   5797 					    ctxt->userData, &in->cur[0], 1);
   5798 			    }
   5799 			}
   5800 			ctxt->token = 0;
   5801 			ctxt->checkIndex = 0;
   5802 			in->cur++;
   5803 			break;
   5804 		    }
   5805 		}
   5806 		if (avail < 2)
   5807 		    goto done;
   5808 		cur = in->cur[0];
   5809 		next = in->cur[1];
   5810 		cons = ctxt->nbChars;
   5811 		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
   5812 		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
   5813 		    /*
   5814 		     * Handle SCRIPT/STYLE separately
   5815 		     */
   5816 		    if (!terminate) {
   5817 		        int idx;
   5818 			xmlChar val;
   5819 
   5820 			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
   5821 			if (idx < 0)
   5822 			    goto done;
   5823 		        val = in->cur[idx + 2];
   5824 			if (val == 0) /* bad cut of input */
   5825 			    goto done;
   5826 		    }
   5827 		    htmlParseScript(ctxt);
   5828 		    if ((cur == '<') && (next == '/')) {
   5829 			ctxt->instate = XML_PARSER_END_TAG;
   5830 			ctxt->checkIndex = 0;
   5831 #ifdef DEBUG_PUSH
   5832 			xmlGenericError(xmlGenericErrorContext,
   5833 				"HPP: entering END_TAG\n");
   5834 #endif
   5835 			break;
   5836 		    }
   5837 		} else {
   5838 		    /*
   5839 		     * Sometimes DOCTYPE arrives in the middle of the document
   5840 		     */
   5841 		    if ((cur == '<') && (next == '!') &&
   5842 			(UPP(2) == 'D') && (UPP(3) == 'O') &&
   5843 			(UPP(4) == 'C') && (UPP(5) == 'T') &&
   5844 			(UPP(6) == 'Y') && (UPP(7) == 'P') &&
   5845 			(UPP(8) == 'E')) {
   5846 			if ((!terminate) &&
   5847 			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5848 			    goto done;
   5849 			htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   5850 			             "Misplaced DOCTYPE declaration\n",
   5851 				     BAD_CAST "DOCTYPE" , NULL);
   5852 			htmlParseDocTypeDecl(ctxt);
   5853 		    } else if ((cur == '<') && (next == '!') &&
   5854 			(in->cur[2] == '-') && (in->cur[3] == '-')) {
   5855 			if ((!terminate) &&
   5856 			    (htmlParseLookupSequence(
   5857 				ctxt, '-', '-', '>', 1, 1) < 0))
   5858 			    goto done;
   5859 #ifdef DEBUG_PUSH
   5860 			xmlGenericError(xmlGenericErrorContext,
   5861 				"HPP: Parsing Comment\n");
   5862 #endif
   5863 			htmlParseComment(ctxt);
   5864 			ctxt->instate = XML_PARSER_CONTENT;
   5865 		    } else if ((cur == '<') && (next == '?')) {
   5866 			if ((!terminate) &&
   5867 			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5868 			    goto done;
   5869 #ifdef DEBUG_PUSH
   5870 			xmlGenericError(xmlGenericErrorContext,
   5871 				"HPP: Parsing PI\n");
   5872 #endif
   5873 			htmlParsePI(ctxt);
   5874 			ctxt->instate = XML_PARSER_CONTENT;
   5875 		    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
   5876 			goto done;
   5877 		    } else if ((cur == '<') && (next == '/')) {
   5878 			ctxt->instate = XML_PARSER_END_TAG;
   5879 			ctxt->checkIndex = 0;
   5880 #ifdef DEBUG_PUSH
   5881 			xmlGenericError(xmlGenericErrorContext,
   5882 				"HPP: entering END_TAG\n");
   5883 #endif
   5884 			break;
   5885 		    } else if (cur == '<') {
   5886 			ctxt->instate = XML_PARSER_START_TAG;
   5887 			ctxt->checkIndex = 0;
   5888 #ifdef DEBUG_PUSH
   5889 			xmlGenericError(xmlGenericErrorContext,
   5890 				"HPP: entering START_TAG\n");
   5891 #endif
   5892 			break;
   5893 		    } else if (cur == '&') {
   5894 			if ((!terminate) &&
   5895 			    (htmlParseLookupChars(ctxt,
   5896                                                   BAD_CAST "; >/", 4) < 0))
   5897 			    goto done;
   5898 #ifdef DEBUG_PUSH
   5899 			xmlGenericError(xmlGenericErrorContext,
   5900 				"HPP: Parsing Reference\n");
   5901 #endif
   5902 			/* TODO: check generation of subtrees if noent !!! */
   5903 			htmlParseReference(ctxt);
   5904 		    } else {
   5905 		        /*
   5906 			 * check that the text sequence is complete
   5907 			 * before handing out the data to the parser
   5908 			 * to avoid problems with erroneous end of
   5909 			 * data detection.
   5910 			 */
   5911 			if ((!terminate) &&
   5912                             (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
   5913 			    goto done;
   5914 			ctxt->checkIndex = 0;
   5915 #ifdef DEBUG_PUSH
   5916 			xmlGenericError(xmlGenericErrorContext,
   5917 				"HPP: Parsing char data\n");
   5918 #endif
   5919 			htmlParseCharData(ctxt);
   5920 		    }
   5921 		}
   5922 		if (cons == ctxt->nbChars) {
   5923 		    if (ctxt->node != NULL) {
   5924 			htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5925 			             "detected an error in element content\n",
   5926 				     NULL, NULL);
   5927 		    }
   5928 		    NEXT;
   5929 		    break;
   5930 		}
   5931 
   5932 		break;
   5933 	    }
   5934             case XML_PARSER_END_TAG:
   5935 		if (avail < 2)
   5936 		    goto done;
   5937 		if ((!terminate) &&
   5938 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5939 		    goto done;
   5940 		htmlParseEndTag(ctxt);
   5941 		if (ctxt->nameNr == 0) {
   5942 		    ctxt->instate = XML_PARSER_EPILOG;
   5943 		} else {
   5944 		    ctxt->instate = XML_PARSER_CONTENT;
   5945 		}
   5946 		ctxt->checkIndex = 0;
   5947 #ifdef DEBUG_PUSH
   5948 		xmlGenericError(xmlGenericErrorContext,
   5949 			"HPP: entering CONTENT\n");
   5950 #endif
   5951 	        break;
   5952             case XML_PARSER_CDATA_SECTION:
   5953 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5954 			"HPP: internal error, state == CDATA\n",
   5955 			     NULL, NULL);
   5956 		ctxt->instate = XML_PARSER_CONTENT;
   5957 		ctxt->checkIndex = 0;
   5958 #ifdef DEBUG_PUSH
   5959 		xmlGenericError(xmlGenericErrorContext,
   5960 			"HPP: entering CONTENT\n");
   5961 #endif
   5962 		break;
   5963             case XML_PARSER_DTD:
   5964 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5965 			"HPP: internal error, state == DTD\n",
   5966 			     NULL, NULL);
   5967 		ctxt->instate = XML_PARSER_CONTENT;
   5968 		ctxt->checkIndex = 0;
   5969 #ifdef DEBUG_PUSH
   5970 		xmlGenericError(xmlGenericErrorContext,
   5971 			"HPP: entering CONTENT\n");
   5972 #endif
   5973 		break;
   5974             case XML_PARSER_COMMENT:
   5975 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5976 			"HPP: internal error, state == COMMENT\n",
   5977 			     NULL, NULL);
   5978 		ctxt->instate = XML_PARSER_CONTENT;
   5979 		ctxt->checkIndex = 0;
   5980 #ifdef DEBUG_PUSH
   5981 		xmlGenericError(xmlGenericErrorContext,
   5982 			"HPP: entering CONTENT\n");
   5983 #endif
   5984 		break;
   5985             case XML_PARSER_PI:
   5986 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5987 			"HPP: internal error, state == PI\n",
   5988 			     NULL, NULL);
   5989 		ctxt->instate = XML_PARSER_CONTENT;
   5990 		ctxt->checkIndex = 0;
   5991 #ifdef DEBUG_PUSH
   5992 		xmlGenericError(xmlGenericErrorContext,
   5993 			"HPP: entering CONTENT\n");
   5994 #endif
   5995 		break;
   5996             case XML_PARSER_ENTITY_DECL:
   5997 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5998 			"HPP: internal error, state == ENTITY_DECL\n",
   5999 			     NULL, NULL);
   6000 		ctxt->instate = XML_PARSER_CONTENT;
   6001 		ctxt->checkIndex = 0;
   6002 #ifdef DEBUG_PUSH
   6003 		xmlGenericError(xmlGenericErrorContext,
   6004 			"HPP: entering CONTENT\n");
   6005 #endif
   6006 		break;
   6007             case XML_PARSER_ENTITY_VALUE:
   6008 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   6009 			"HPP: internal error, state == ENTITY_VALUE\n",
   6010 			     NULL, NULL);
   6011 		ctxt->instate = XML_PARSER_CONTENT;
   6012 		ctxt->checkIndex = 0;
   6013 #ifdef DEBUG_PUSH
   6014 		xmlGenericError(xmlGenericErrorContext,
   6015 			"HPP: entering DTD\n");
   6016 #endif
   6017 		break;
   6018             case XML_PARSER_ATTRIBUTE_VALUE:
   6019 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   6020 			"HPP: internal error, state == ATTRIBUTE_VALUE\n",
   6021 			     NULL, NULL);
   6022 		ctxt->instate = XML_PARSER_START_TAG;
   6023 		ctxt->checkIndex = 0;
   6024 #ifdef DEBUG_PUSH
   6025 		xmlGenericError(xmlGenericErrorContext,
   6026 			"HPP: entering START_TAG\n");
   6027 #endif
   6028 		break;
   6029 	    case XML_PARSER_SYSTEM_LITERAL:
   6030 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   6031 		    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
   6032 			     NULL, NULL);
   6033 		ctxt->instate = XML_PARSER_CONTENT;
   6034 		ctxt->checkIndex = 0;
   6035 #ifdef DEBUG_PUSH
   6036 		xmlGenericError(xmlGenericErrorContext,
   6037 			"HPP: entering CONTENT\n");
   6038 #endif
   6039 		break;
   6040 	    case XML_PARSER_IGNORE:
   6041 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   6042 			"HPP: internal error, state == XML_PARSER_IGNORE\n",
   6043 			     NULL, NULL);
   6044 		ctxt->instate = XML_PARSER_CONTENT;
   6045 		ctxt->checkIndex = 0;
   6046 #ifdef DEBUG_PUSH
   6047 		xmlGenericError(xmlGenericErrorContext,
   6048 			"HPP: entering CONTENT\n");
   6049 #endif
   6050 		break;
   6051 	    case XML_PARSER_PUBLIC_LITERAL:
   6052 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   6053 			"HPP: internal error, state == XML_PARSER_LITERAL\n",
   6054 			     NULL, NULL);
   6055 		ctxt->instate = XML_PARSER_CONTENT;
   6056 		ctxt->checkIndex = 0;
   6057 #ifdef DEBUG_PUSH
   6058 		xmlGenericError(xmlGenericErrorContext,
   6059 			"HPP: entering CONTENT\n");
   6060 #endif
   6061 		break;
   6062 
   6063 	}
   6064     }
   6065 done:
   6066     if ((avail == 0) && (terminate)) {
   6067 	htmlAutoCloseOnEnd(ctxt);
   6068 	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
   6069 	    /*
   6070 	     * SAX: end of the document processing.
   6071 	     */
   6072 	    ctxt->instate = XML_PARSER_EOF;
   6073 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
   6074 		ctxt->sax->endDocument(ctxt->userData);
   6075 	}
   6076     }
   6077     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
   6078 	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
   6079 	 (ctxt->instate == XML_PARSER_EPILOG))) {
   6080 	xmlDtdPtr dtd;
   6081 	dtd = xmlGetIntSubset(ctxt->myDoc);
   6082 	if (dtd == NULL)
   6083 	    ctxt->myDoc->intSubset =
   6084 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
   6085 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
   6086 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
   6087     }
   6088 #ifdef DEBUG_PUSH
   6089     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
   6090 #endif
   6091     return(ret);
   6092 }
   6093 
   6094 /**
   6095  * htmlParseChunk:
   6096  * @ctxt:  an HTML parser context
   6097  * @chunk:  an char array
   6098  * @size:  the size in byte of the chunk
   6099  * @terminate:  last chunk indicator
   6100  *
   6101  * Parse a Chunk of memory
   6102  *
   6103  * Returns zero if no error, the xmlParserErrors otherwise.
   6104  */
   6105 int
   6106 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
   6107               int terminate) {
   6108     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   6109 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   6110 		     "htmlParseChunk: context error\n", NULL, NULL);
   6111 	return(XML_ERR_INTERNAL_ERROR);
   6112     }
   6113     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
   6114         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
   6115 	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
   6116 	size_t cur = ctxt->input->cur - ctxt->input->base;
   6117 	int res;
   6118 
   6119 	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
   6120 	if (res < 0) {
   6121 	    ctxt->errNo = XML_PARSER_EOF;
   6122 	    ctxt->disableSAX = 1;
   6123 	    return (XML_PARSER_EOF);
   6124 	}
   6125         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
   6126 #ifdef DEBUG_PUSH
   6127 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
   6128 #endif
   6129 
   6130 #if 0
   6131 	if ((terminate) || (ctxt->input->buf->buffer->use > 80))
   6132 	    htmlParseTryOrFinish(ctxt, terminate);
   6133 #endif
   6134     } else if (ctxt->instate != XML_PARSER_EOF) {
   6135 	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
   6136 	    xmlParserInputBufferPtr in = ctxt->input->buf;
   6137 	    if ((in->encoder != NULL) && (in->buffer != NULL) &&
   6138 		    (in->raw != NULL)) {
   6139 		int nbchars;
   6140 		size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
   6141 		size_t current = ctxt->input->cur - ctxt->input->base;
   6142 
   6143 		nbchars = xmlCharEncInput(in, terminate);
   6144 		if (nbchars < 0) {
   6145 		    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
   6146 			         "encoder error\n", NULL, NULL);
   6147 		    return(XML_ERR_INVALID_ENCODING);
   6148 		}
   6149 		xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
   6150 	    }
   6151 	}
   6152     }
   6153     htmlParseTryOrFinish(ctxt, terminate);
   6154     if (terminate) {
   6155 	if ((ctxt->instate != XML_PARSER_EOF) &&
   6156 	    (ctxt->instate != XML_PARSER_EPILOG) &&
   6157 	    (ctxt->instate != XML_PARSER_MISC)) {
   6158 	    ctxt->errNo = XML_ERR_DOCUMENT_END;
   6159 	    ctxt->wellFormed = 0;
   6160 	}
   6161 	if (ctxt->instate != XML_PARSER_EOF) {
   6162 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
   6163 		ctxt->sax->endDocument(ctxt->userData);
   6164 	}
   6165 	ctxt->instate = XML_PARSER_EOF;
   6166     }
   6167     return((xmlParserErrors) ctxt->errNo);
   6168 }
   6169 
   6170 /************************************************************************
   6171  *									*
   6172  *			User entry points				*
   6173  *									*
   6174  ************************************************************************/
   6175 
   6176 /**
   6177  * htmlCreatePushParserCtxt:
   6178  * @sax:  a SAX handler
   6179  * @user_data:  The user data returned on SAX callbacks
   6180  * @chunk:  a pointer to an array of chars
   6181  * @size:  number of chars in the array
   6182  * @filename:  an optional file name or URI
   6183  * @enc:  an optional encoding
   6184  *
   6185  * Create a parser context for using the HTML parser in push mode
   6186  * The value of @filename is used for fetching external entities
   6187  * and error/warning reports.
   6188  *
   6189  * Returns the new parser context or NULL
   6190  */
   6191 htmlParserCtxtPtr
   6192 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
   6193                          const char *chunk, int size, const char *filename,
   6194 			 xmlCharEncoding enc) {
   6195     htmlParserCtxtPtr ctxt;
   6196     htmlParserInputPtr inputStream;
   6197     xmlParserInputBufferPtr buf;
   6198 
   6199     xmlInitParser();
   6200 
   6201     buf = xmlAllocParserInputBuffer(enc);
   6202     if (buf == NULL) return(NULL);
   6203 
   6204     ctxt = htmlNewParserCtxt();
   6205     if (ctxt == NULL) {
   6206 	xmlFreeParserInputBuffer(buf);
   6207 	return(NULL);
   6208     }
   6209     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
   6210 	ctxt->charset=XML_CHAR_ENCODING_UTF8;
   6211     if (sax != NULL) {
   6212 	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
   6213 	    xmlFree(ctxt->sax);
   6214 	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
   6215 	if (ctxt->sax == NULL) {
   6216 	    xmlFree(buf);
   6217 	    xmlFree(ctxt);
   6218 	    return(NULL);
   6219 	}
   6220 	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
   6221 	if (user_data != NULL)
   6222 	    ctxt->userData = user_data;
   6223     }
   6224     if (filename == NULL) {
   6225 	ctxt->directory = NULL;
   6226     } else {
   6227         ctxt->directory = xmlParserGetDirectory(filename);
   6228     }
   6229 
   6230     inputStream = htmlNewInputStream(ctxt);
   6231     if (inputStream == NULL) {
   6232 	xmlFreeParserCtxt(ctxt);
   6233 	xmlFree(buf);
   6234 	return(NULL);
   6235     }
   6236 
   6237     if (filename == NULL)
   6238 	inputStream->filename = NULL;
   6239     else
   6240 	inputStream->filename = (char *)
   6241 	    xmlCanonicPath((const xmlChar *) filename);
   6242     inputStream->buf = buf;
   6243     xmlBufResetInput(buf->buffer, inputStream);
   6244 
   6245     inputPush(ctxt, inputStream);
   6246 
   6247     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
   6248         (ctxt->input->buf != NULL))  {
   6249 	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
   6250 	size_t cur = ctxt->input->cur - ctxt->input->base;
   6251 
   6252 	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
   6253 
   6254         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
   6255 #ifdef DEBUG_PUSH
   6256 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
   6257 #endif
   6258     }
   6259     ctxt->progressive = 1;
   6260 
   6261     return(ctxt);
   6262 }
   6263 #endif /* LIBXML_PUSH_ENABLED */
   6264 
   6265 /**
   6266  * htmlSAXParseDoc:
   6267  * @cur:  a pointer to an array of xmlChar
   6268  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   6269  * @sax:  the SAX handler block
   6270  * @userData: if using SAX, this pointer will be provided on callbacks.
   6271  *
   6272  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
   6273  * to handle parse events. If sax is NULL, fallback to the default DOM
   6274  * behavior and return a tree.
   6275  *
   6276  * Returns the resulting document tree unless SAX is NULL or the document is
   6277  *     not well formed.
   6278  */
   6279 
   6280 htmlDocPtr
   6281 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
   6282                 htmlSAXHandlerPtr sax, void *userData) {
   6283     htmlDocPtr ret;
   6284     htmlParserCtxtPtr ctxt;
   6285 
   6286     xmlInitParser();
   6287 
   6288     if (cur == NULL) return(NULL);
   6289 
   6290 
   6291     ctxt = htmlCreateDocParserCtxt(cur, encoding);
   6292     if (ctxt == NULL) return(NULL);
   6293     if (sax != NULL) {
   6294         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
   6295         ctxt->sax = sax;
   6296         ctxt->userData = userData;
   6297     }
   6298 
   6299     htmlParseDocument(ctxt);
   6300     ret = ctxt->myDoc;
   6301     if (sax != NULL) {
   6302 	ctxt->sax = NULL;
   6303 	ctxt->userData = NULL;
   6304     }
   6305     htmlFreeParserCtxt(ctxt);
   6306 
   6307     return(ret);
   6308 }
   6309 
   6310 /**
   6311  * htmlParseDoc:
   6312  * @cur:  a pointer to an array of xmlChar
   6313  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   6314  *
   6315  * parse an HTML in-memory document and build a tree.
   6316  *
   6317  * Returns the resulting document tree
   6318  */
   6319 
   6320 htmlDocPtr
   6321 htmlParseDoc(const xmlChar *cur, const char *encoding) {
   6322     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
   6323 }
   6324 
   6325 
   6326 /**
   6327  * htmlCreateFileParserCtxt:
   6328  * @filename:  the filename
   6329  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   6330  *
   6331  * Create a parser context for a file content.
   6332  * Automatic support for ZLIB/Compress compressed document is provided
   6333  * by default if found at compile-time.
   6334  *
   6335  * Returns the new parser context or NULL
   6336  */
   6337 htmlParserCtxtPtr
   6338 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
   6339 {
   6340     htmlParserCtxtPtr ctxt;
   6341     htmlParserInputPtr inputStream;
   6342     char *canonicFilename;
   6343     /* htmlCharEncoding enc; */
   6344     xmlChar *content, *content_line = (xmlChar *) "charset=";
   6345 
   6346     if (filename == NULL)
   6347         return(NULL);
   6348 
   6349     ctxt = htmlNewParserCtxt();
   6350     if (ctxt == NULL) {
   6351 	return(NULL);
   6352     }
   6353     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
   6354     if (canonicFilename == NULL) {
   6355 #ifdef LIBXML_SAX1_ENABLED
   6356 	if (xmlDefaultSAXHandler.error != NULL) {
   6357 	    xmlDefaultSAXHandler.error(NULL, "out of memory\n");
   6358 	}
   6359 #endif
   6360 	xmlFreeParserCtxt(ctxt);
   6361 	return(NULL);
   6362     }
   6363 
   6364     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
   6365     xmlFree(canonicFilename);
   6366     if (inputStream == NULL) {
   6367 	xmlFreeParserCtxt(ctxt);
   6368 	return(NULL);
   6369     }
   6370 
   6371     inputPush(ctxt, inputStream);
   6372 
   6373     /* set encoding */
   6374     if (encoding) {
   6375         size_t l = strlen(encoding);
   6376 
   6377 	if (l < 1000) {
   6378 	    content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
   6379 	    if (content) {
   6380 		strcpy ((char *)content, (char *)content_line);
   6381 		strcat ((char *)content, (char *)encoding);
   6382 		htmlCheckEncoding (ctxt, content);
   6383 		xmlFree (content);
   6384 	    }
   6385 	}
   6386     }
   6387 
   6388     return(ctxt);
   6389 }
   6390 
   6391 /**
   6392  * htmlSAXParseFile:
   6393  * @filename:  the filename
   6394  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   6395  * @sax:  the SAX handler block
   6396  * @userData: if using SAX, this pointer will be provided on callbacks.
   6397  *
   6398  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
   6399  * compressed document is provided by default if found at compile-time.
   6400  * It use the given SAX function block to handle the parsing callback.
   6401  * If sax is NULL, fallback to the default DOM tree building routines.
   6402  *
   6403  * Returns the resulting document tree unless SAX is NULL or the document is
   6404  *     not well formed.
   6405  */
   6406 
   6407 htmlDocPtr
   6408 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
   6409                  void *userData) {
   6410     htmlDocPtr ret;
   6411     htmlParserCtxtPtr ctxt;
   6412     htmlSAXHandlerPtr oldsax = NULL;
   6413 
   6414     xmlInitParser();
   6415 
   6416     ctxt = htmlCreateFileParserCtxt(filename, encoding);
   6417     if (ctxt == NULL) return(NULL);
   6418     if (sax != NULL) {
   6419 	oldsax = ctxt->sax;
   6420         ctxt->sax = sax;
   6421         ctxt->userData = userData;
   6422     }
   6423 
   6424     htmlParseDocument(ctxt);
   6425 
   6426     ret = ctxt->myDoc;
   6427     if (sax != NULL) {
   6428         ctxt->sax = oldsax;
   6429         ctxt->userData = NULL;
   6430     }
   6431     htmlFreeParserCtxt(ctxt);
   6432 
   6433     return(ret);
   6434 }
   6435 
   6436 /**
   6437  * htmlParseFile:
   6438  * @filename:  the filename
   6439  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   6440  *
   6441  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
   6442  * compressed document is provided by default if found at compile-time.
   6443  *
   6444  * Returns the resulting document tree
   6445  */
   6446 
   6447 htmlDocPtr
   6448 htmlParseFile(const char *filename, const char *encoding) {
   6449     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
   6450 }
   6451 
   6452 /**
   6453  * htmlHandleOmittedElem:
   6454  * @val:  int 0 or 1
   6455  *
   6456  * Set and return the previous value for handling HTML omitted tags.
   6457  *
   6458  * Returns the last value for 0 for no handling, 1 for auto insertion.
   6459  */
   6460 
   6461 int
   6462 htmlHandleOmittedElem(int val) {
   6463     int old = htmlOmittedDefaultValue;
   6464 
   6465     htmlOmittedDefaultValue = val;
   6466     return(old);
   6467 }
   6468 
   6469 /**
   6470  * htmlElementAllowedHere:
   6471  * @parent: HTML parent element
   6472  * @elt: HTML element
   6473  *
   6474  * Checks whether an HTML element may be a direct child of a parent element.
   6475  * Note - doesn't check for deprecated elements
   6476  *
   6477  * Returns 1 if allowed; 0 otherwise.
   6478  */
   6479 int
   6480 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
   6481   const char** p ;
   6482 
   6483   if ( ! elt || ! parent || ! parent->subelts )
   6484 	return 0 ;
   6485 
   6486   for ( p = parent->subelts; *p; ++p )
   6487     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
   6488       return 1 ;
   6489 
   6490   return 0 ;
   6491 }
   6492 /**
   6493  * htmlElementStatusHere:
   6494  * @parent: HTML parent element
   6495  * @elt: HTML element
   6496  *
   6497  * Checks whether an HTML element may be a direct child of a parent element.
   6498  * and if so whether it is valid or deprecated.
   6499  *
   6500  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
   6501  */
   6502 htmlStatus
   6503 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
   6504   if ( ! parent || ! elt )
   6505     return HTML_INVALID ;
   6506   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
   6507     return HTML_INVALID ;
   6508 
   6509   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
   6510 }
   6511 /**
   6512  * htmlAttrAllowed:
   6513  * @elt: HTML element
   6514  * @attr: HTML attribute
   6515  * @legacy: whether to allow deprecated attributes
   6516  *
   6517  * Checks whether an attribute is valid for an element
   6518  * Has full knowledge of Required and Deprecated attributes
   6519  *
   6520  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
   6521  */
   6522 htmlStatus
   6523 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
   6524   const char** p ;
   6525 
   6526   if ( !elt || ! attr )
   6527 	return HTML_INVALID ;
   6528 
   6529   if ( elt->attrs_req )
   6530     for ( p = elt->attrs_req; *p; ++p)
   6531       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
   6532         return HTML_REQUIRED ;
   6533 
   6534   if ( elt->attrs_opt )
   6535     for ( p = elt->attrs_opt; *p; ++p)
   6536       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
   6537         return HTML_VALID ;
   6538 
   6539   if ( legacy && elt->attrs_depr )
   6540     for ( p = elt->attrs_depr; *p; ++p)
   6541       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
   6542         return HTML_DEPRECATED ;
   6543 
   6544   return HTML_INVALID ;
   6545 }
   6546 /**
   6547  * htmlNodeStatus:
   6548  * @node: an htmlNodePtr in a tree
   6549  * @legacy: whether to allow deprecated elements (YES is faster here
   6550  *	for Element nodes)
   6551  *
   6552  * Checks whether the tree node is valid.  Experimental (the author
   6553  *     only uses the HTML enhancements in a SAX parser)
   6554  *
   6555  * Return: for Element nodes, a return from htmlElementAllowedHere (if
   6556  *	legacy allowed) or htmlElementStatusHere (otherwise).
   6557  *	for Attribute nodes, a return from htmlAttrAllowed
   6558  *	for other nodes, HTML_NA (no checks performed)
   6559  */
   6560 htmlStatus
   6561 htmlNodeStatus(const htmlNodePtr node, int legacy) {
   6562   if ( ! node )
   6563     return HTML_INVALID ;
   6564 
   6565   switch ( node->type ) {
   6566     case XML_ELEMENT_NODE:
   6567       return legacy
   6568 	? ( htmlElementAllowedHere (
   6569 		htmlTagLookup(node->parent->name) , node->name
   6570 		) ? HTML_VALID : HTML_INVALID )
   6571 	: htmlElementStatusHere(
   6572 		htmlTagLookup(node->parent->name) ,
   6573 		htmlTagLookup(node->name) )
   6574 	;
   6575     case XML_ATTRIBUTE_NODE:
   6576       return htmlAttrAllowed(
   6577 	htmlTagLookup(node->parent->name) , node->name, legacy) ;
   6578     default: return HTML_NA ;
   6579   }
   6580 }
   6581 /************************************************************************
   6582  *									*
   6583  *	New set (2.6.0) of simpler and more flexible APIs		*
   6584  *									*
   6585  ************************************************************************/
   /**
    * DICT_FREE:
    * @str:  a string
    *
    * Free a string if it is not owned by the "dict" dictionary in the
    * current scope.  A variable named dict must be visible where the
    * macro is expanded.  The expansion is a single statement, so the
    * macro can safely be used as the body of an if/else; it must be
    * followed by a semicolon.
    */
   #define DICT_FREE(str)                                              \
       do {                                                            \
           if ((str) && ((!dict) ||                                    \
               (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))      \
               xmlFree((char *)(str));                                 \
       } while (0)
   6597 
   6598 /**
   6599  * htmlCtxtReset:
   6600  * @ctxt: an HTML parser context
   6601  *
   6602  * Reset a parser context
   6603  */
   6604 void
   6605 htmlCtxtReset(htmlParserCtxtPtr ctxt)
   6606 {
   6607     xmlParserInputPtr input;
   6608     xmlDictPtr dict;
   6609 
   6610     if (ctxt == NULL)
   6611         return;
   6612 
   6613     xmlInitParser();
   6614     dict = ctxt->dict;
   6615 
   6616     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
   6617         xmlFreeInputStream(input);
   6618     }
   6619     ctxt->inputNr = 0;
   6620     ctxt->input = NULL;
   6621 
   6622     ctxt->spaceNr = 0;
   6623     if (ctxt->spaceTab != NULL) {
   6624 	ctxt->spaceTab[0] = -1;
   6625 	ctxt->space = &ctxt->spaceTab[0];
   6626     } else {
   6627 	ctxt->space = NULL;
   6628     }
   6629 
   6630 
   6631     ctxt->nodeNr = 0;
   6632     ctxt->node = NULL;
   6633 
   6634     ctxt->nameNr = 0;
   6635     ctxt->name = NULL;
   6636 
   6637     DICT_FREE(ctxt->version);
   6638     ctxt->version = NULL;
   6639     DICT_FREE(ctxt->encoding);
   6640     ctxt->encoding = NULL;
   6641     DICT_FREE(ctxt->directory);
   6642     ctxt->directory = NULL;
   6643     DICT_FREE(ctxt->extSubURI);
   6644     ctxt->extSubURI = NULL;
   6645     DICT_FREE(ctxt->extSubSystem);
   6646     ctxt->extSubSystem = NULL;
   6647     if (ctxt->myDoc != NULL)
   6648         xmlFreeDoc(ctxt->myDoc);
   6649     ctxt->myDoc = NULL;
   6650 
   6651     ctxt->standalone = -1;
   6652     ctxt->hasExternalSubset = 0;
   6653     ctxt->hasPErefs = 0;
   6654     ctxt->html = 1;
   6655     ctxt->external = 0;
   6656     ctxt->instate = XML_PARSER_START;
   6657     ctxt->token = 0;
   6658 
   6659     ctxt->wellFormed = 1;
   6660     ctxt->nsWellFormed = 1;
   6661     ctxt->disableSAX = 0;
   6662     ctxt->valid = 1;
   6663     ctxt->vctxt.userData = ctxt;
   6664     ctxt->vctxt.error = xmlParserValidityError;
   6665     ctxt->vctxt.warning = xmlParserValidityWarning;
   6666     ctxt->record_info = 0;
   6667     ctxt->nbChars = 0;
   6668     ctxt->checkIndex = 0;
   6669     ctxt->inSubset = 0;
   6670     ctxt->errNo = XML_ERR_OK;
   6671     ctxt->depth = 0;
   6672     ctxt->charset = XML_CHAR_ENCODING_NONE;
   6673     ctxt->catalogs = NULL;
   6674     xmlInitNodeInfoSeq(&ctxt->node_seq);
   6675 
   6676     if (ctxt->attsDefault != NULL) {
   6677         xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
   6678         ctxt->attsDefault = NULL;
   6679     }
   6680     if (ctxt->attsSpecial != NULL) {
   6681         xmlHashFree(ctxt->attsSpecial, NULL);
   6682         ctxt->attsSpecial = NULL;
   6683     }
   6684 }
   6685 
   6686 /**
   6687  * htmlCtxtUseOptions:
   6688  * @ctxt: an HTML parser context
   6689  * @options:  a combination of htmlParserOption(s)
   6690  *
   6691  * Applies the options to the parser context
   6692  *
   6693  * Returns 0 in case of success, the set of unknown or unimplemented options
   6694  *         in case of error.
   6695  */
   6696 int
   6697 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
   6698 {
   6699     if (ctxt == NULL)
   6700         return(-1);
   6701 
   6702     if (options & HTML_PARSE_NOWARNING) {
   6703         ctxt->sax->warning = NULL;
   6704         ctxt->vctxt.warning = NULL;
   6705         options -= XML_PARSE_NOWARNING;
   6706 	ctxt->options |= XML_PARSE_NOWARNING;
   6707     }
   6708     if (options & HTML_PARSE_NOERROR) {
   6709         ctxt->sax->error = NULL;
   6710         ctxt->vctxt.error = NULL;
   6711         ctxt->sax->fatalError = NULL;
   6712         options -= XML_PARSE_NOERROR;
   6713 	ctxt->options |= XML_PARSE_NOERROR;
   6714     }
   6715     if (options & HTML_PARSE_PEDANTIC) {
   6716         ctxt->pedantic = 1;
   6717         options -= XML_PARSE_PEDANTIC;
   6718 	ctxt->options |= XML_PARSE_PEDANTIC;
   6719     } else
   6720         ctxt->pedantic = 0;
   6721     if (options & XML_PARSE_NOBLANKS) {
   6722         ctxt->keepBlanks = 0;
   6723         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
   6724         options -= XML_PARSE_NOBLANKS;
   6725 	ctxt->options |= XML_PARSE_NOBLANKS;
   6726     } else
   6727         ctxt->keepBlanks = 1;
   6728     if (options & HTML_PARSE_RECOVER) {
   6729         ctxt->recovery = 1;
   6730 	options -= HTML_PARSE_RECOVER;
   6731     } else
   6732         ctxt->recovery = 0;
   6733     if (options & HTML_PARSE_COMPACT) {
   6734 	ctxt->options |= HTML_PARSE_COMPACT;
   6735         options -= HTML_PARSE_COMPACT;
   6736     }
   6737     if (options & XML_PARSE_HUGE) {
   6738 	ctxt->options |= XML_PARSE_HUGE;
   6739         options -= XML_PARSE_HUGE;
   6740     }
   6741     if (options & HTML_PARSE_NODEFDTD) {
   6742 	ctxt->options |= HTML_PARSE_NODEFDTD;
   6743         options -= HTML_PARSE_NODEFDTD;
   6744     }
   6745     if (options & HTML_PARSE_IGNORE_ENC) {
   6746 	ctxt->options |= HTML_PARSE_IGNORE_ENC;
   6747         options -= HTML_PARSE_IGNORE_ENC;
   6748     }
   6749     if (options & HTML_PARSE_NOIMPLIED) {
   6750         ctxt->options |= HTML_PARSE_NOIMPLIED;
   6751         options -= HTML_PARSE_NOIMPLIED;
   6752     }
   6753     ctxt->dictNames = 0;
   6754     return (options);
   6755 }
   6756 
   6757 /**
   6758  * htmlDoRead:
   6759  * @ctxt:  an HTML parser context
   6760  * @URL:  the base URL to use for the document
   6761  * @encoding:  the document encoding, or NULL
   6762  * @options:  a combination of htmlParserOption(s)
   6763  * @reuse:  keep the context for reuse
   6764  *
   6765  * Common front-end for the htmlRead functions
   6766  *
   6767  * Returns the resulting document tree or NULL
   6768  */
   6769 static htmlDocPtr
   6770 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
   6771           int options, int reuse)
   6772 {
   6773     htmlDocPtr ret;
   6774 
   6775     htmlCtxtUseOptions(ctxt, options);
   6776     ctxt->html = 1;
   6777     if (encoding != NULL) {
   6778         xmlCharEncodingHandlerPtr hdlr;
   6779 
   6780 	hdlr = xmlFindCharEncodingHandler(encoding);
   6781 	if (hdlr != NULL) {
   6782 	    xmlSwitchToEncoding(ctxt, hdlr);
   6783 	    if (ctxt->input->encoding != NULL)
   6784 	      xmlFree((xmlChar *) ctxt->input->encoding);
   6785             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
   6786         }
   6787     }
   6788     if ((URL != NULL) && (ctxt->input != NULL) &&
   6789         (ctxt->input->filename == NULL))
   6790         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
   6791     htmlParseDocument(ctxt);
   6792     ret = ctxt->myDoc;
   6793     ctxt->myDoc = NULL;
   6794     if (!reuse) {
   6795         if ((ctxt->dictNames) &&
   6796 	    (ret != NULL) &&
   6797 	    (ret->dict == ctxt->dict))
   6798 	    ctxt->dict = NULL;
   6799 	xmlFreeParserCtxt(ctxt);
   6800     }
   6801     return (ret);
   6802 }
   6803 
   6804 /**
   6805  * htmlReadDoc:
   6806  * @cur:  a pointer to a zero terminated string
   6807  * @URL:  the base URL to use for the document
   6808  * @encoding:  the document encoding, or NULL
   6809  * @options:  a combination of htmlParserOption(s)
   6810  *
   6811  * parse an XML in-memory document and build a tree.
   6812  *
   6813  * Returns the resulting document tree
   6814  */
   6815 htmlDocPtr
   6816 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
   6817 {
   6818     htmlParserCtxtPtr ctxt;
   6819 
   6820     if (cur == NULL)
   6821         return (NULL);
   6822 
   6823     xmlInitParser();
   6824     ctxt = htmlCreateDocParserCtxt(cur, NULL);
   6825     if (ctxt == NULL)
   6826         return (NULL);
   6827     return (htmlDoRead(ctxt, URL, encoding, options, 0));
   6828 }
   6829 
   6830 /**
   6831  * htmlReadFile:
   6832  * @filename:  a file or URL
   6833  * @encoding:  the document encoding, or NULL
   6834  * @options:  a combination of htmlParserOption(s)
   6835  *
   6836  * parse an XML file from the filesystem or the network.
   6837  *
   6838  * Returns the resulting document tree
   6839  */
   6840 htmlDocPtr
   6841 htmlReadFile(const char *filename, const char *encoding, int options)
   6842 {
   6843     htmlParserCtxtPtr ctxt;
   6844 
   6845     xmlInitParser();
   6846     ctxt = htmlCreateFileParserCtxt(filename, encoding);
   6847     if (ctxt == NULL)
   6848         return (NULL);
   6849     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
   6850 }
   6851 
   6852 /**
   6853  * htmlReadMemory:
   6854  * @buffer:  a pointer to a char array
   6855  * @size:  the size of the array
   6856  * @URL:  the base URL to use for the document
   6857  * @encoding:  the document encoding, or NULL
   6858  * @options:  a combination of htmlParserOption(s)
   6859  *
   6860  * parse an XML in-memory document and build a tree.
   6861  *
   6862  * Returns the resulting document tree
   6863  */
   6864 htmlDocPtr
   6865 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
   6866 {
   6867     htmlParserCtxtPtr ctxt;
   6868 
   6869     xmlInitParser();
   6870     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
   6871     if (ctxt == NULL)
   6872         return (NULL);
   6873     htmlDefaultSAXHandlerInit();
   6874     if (ctxt->sax != NULL)
   6875         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
   6876     return (htmlDoRead(ctxt, URL, encoding, options, 0));
   6877 }
   6878 
   6879 /**
   6880  * htmlReadFd:
   6881  * @fd:  an open file descriptor
   6882  * @URL:  the base URL to use for the document
   6883  * @encoding:  the document encoding, or NULL
   6884  * @options:  a combination of htmlParserOption(s)
   6885  *
   6886  * parse an XML from a file descriptor and build a tree.
   6887  *
   6888  * Returns the resulting document tree
   6889  */
   6890 htmlDocPtr
   6891 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
   6892 {
   6893     htmlParserCtxtPtr ctxt;
   6894     xmlParserInputBufferPtr input;
   6895     xmlParserInputPtr stream;
   6896 
   6897     if (fd < 0)
   6898         return (NULL);
   6899     xmlInitParser();
   6900 
   6901     xmlInitParser();
   6902     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
   6903     if (input == NULL)
   6904         return (NULL);
   6905     ctxt = xmlNewParserCtxt();
   6906     if (ctxt == NULL) {
   6907         xmlFreeParserInputBuffer(input);
   6908         return (NULL);
   6909     }
   6910     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
   6911     if (stream == NULL) {
   6912         xmlFreeParserInputBuffer(input);
   6913 	xmlFreeParserCtxt(ctxt);
   6914         return (NULL);
   6915     }
   6916     inputPush(ctxt, stream);
   6917     return (htmlDoRead(ctxt, URL, encoding, options, 0));
   6918 }
   6919 
   6920 /**
   6921  * htmlReadIO:
   6922  * @ioread:  an I/O read function
   6923  * @ioclose:  an I/O close function
   6924  * @ioctx:  an I/O handler
   6925  * @URL:  the base URL to use for the document
   6926  * @encoding:  the document encoding, or NULL
   6927  * @options:  a combination of htmlParserOption(s)
   6928  *
   6929  * parse an HTML document from I/O functions and source and build a tree.
   6930  *
   6931  * Returns the resulting document tree
   6932  */
   6933 htmlDocPtr
   6934 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
   6935           void *ioctx, const char *URL, const char *encoding, int options)
   6936 {
   6937     htmlParserCtxtPtr ctxt;
   6938     xmlParserInputBufferPtr input;
   6939     xmlParserInputPtr stream;
   6940 
   6941     if (ioread == NULL)
   6942         return (NULL);
   6943     xmlInitParser();
   6944 
   6945     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
   6946                                          XML_CHAR_ENCODING_NONE);
   6947     if (input == NULL) {
   6948         if (ioclose != NULL)
   6949             ioclose(ioctx);
   6950         return (NULL);
   6951     }
   6952     ctxt = htmlNewParserCtxt();
   6953     if (ctxt == NULL) {
   6954         xmlFreeParserInputBuffer(input);
   6955         return (NULL);
   6956     }
   6957     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
   6958     if (stream == NULL) {
   6959         xmlFreeParserInputBuffer(input);
   6960 	xmlFreeParserCtxt(ctxt);
   6961         return (NULL);
   6962     }
   6963     inputPush(ctxt, stream);
   6964     return (htmlDoRead(ctxt, URL, encoding, options, 0));
   6965 }
   6966 
   6967 /**
   6968  * htmlCtxtReadDoc:
   6969  * @ctxt:  an HTML parser context
   6970  * @cur:  a pointer to a zero terminated string
   6971  * @URL:  the base URL to use for the document
   6972  * @encoding:  the document encoding, or NULL
   6973  * @options:  a combination of htmlParserOption(s)
   6974  *
   6975  * parse an XML in-memory document and build a tree.
   6976  * This reuses the existing @ctxt parser context
   6977  *
   6978  * Returns the resulting document tree
   6979  */
   6980 htmlDocPtr
   6981 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
   6982                const char *URL, const char *encoding, int options)
   6983 {
   6984     xmlParserInputPtr stream;
   6985 
   6986     if (cur == NULL)
   6987         return (NULL);
   6988     if (ctxt == NULL)
   6989         return (NULL);
   6990     xmlInitParser();
   6991 
   6992     htmlCtxtReset(ctxt);
   6993 
   6994     stream = xmlNewStringInputStream(ctxt, cur);
   6995     if (stream == NULL) {
   6996         return (NULL);
   6997     }
   6998     inputPush(ctxt, stream);
   6999     return (htmlDoRead(ctxt, URL, encoding, options, 1));
   7000 }
   7001 
   7002 /**
   7003  * htmlCtxtReadFile:
   7004  * @ctxt:  an HTML parser context
   7005  * @filename:  a file or URL
   7006  * @encoding:  the document encoding, or NULL
   7007  * @options:  a combination of htmlParserOption(s)
   7008  *
   7009  * parse an XML file from the filesystem or the network.
   7010  * This reuses the existing @ctxt parser context
   7011  *
   7012  * Returns the resulting document tree
   7013  */
   7014 htmlDocPtr
   7015 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
   7016                 const char *encoding, int options)
   7017 {
   7018     xmlParserInputPtr stream;
   7019 
   7020     if (filename == NULL)
   7021         return (NULL);
   7022     if (ctxt == NULL)
   7023         return (NULL);
   7024     xmlInitParser();
   7025 
   7026     htmlCtxtReset(ctxt);
   7027 
   7028     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
   7029     if (stream == NULL) {
   7030         return (NULL);
   7031     }
   7032     inputPush(ctxt, stream);
   7033     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
   7034 }
   7035 
   7036 /**
   7037  * htmlCtxtReadMemory:
   7038  * @ctxt:  an HTML parser context
   7039  * @buffer:  a pointer to a char array
   7040  * @size:  the size of the array
   7041  * @URL:  the base URL to use for the document
   7042  * @encoding:  the document encoding, or NULL
   7043  * @options:  a combination of htmlParserOption(s)
   7044  *
   7045  * parse an XML in-memory document and build a tree.
   7046  * This reuses the existing @ctxt parser context
   7047  *
   7048  * Returns the resulting document tree
   7049  */
   7050 htmlDocPtr
   7051 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
   7052                   const char *URL, const char *encoding, int options)
   7053 {
   7054     xmlParserInputBufferPtr input;
   7055     xmlParserInputPtr stream;
   7056 
   7057     if (ctxt == NULL)
   7058         return (NULL);
   7059     if (buffer == NULL)
   7060         return (NULL);
   7061     xmlInitParser();
   7062 
   7063     htmlCtxtReset(ctxt);
   7064 
   7065     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
   7066     if (input == NULL) {
   7067 	return(NULL);
   7068     }
   7069 
   7070     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
   7071     if (stream == NULL) {
   7072 	xmlFreeParserInputBuffer(input);
   7073 	return(NULL);
   7074     }
   7075 
   7076     inputPush(ctxt, stream);
   7077     return (htmlDoRead(ctxt, URL, encoding, options, 1));
   7078 }
   7079 
   7080 /**
   7081  * htmlCtxtReadFd:
   7082  * @ctxt:  an HTML parser context
   7083  * @fd:  an open file descriptor
   7084  * @URL:  the base URL to use for the document
   7085  * @encoding:  the document encoding, or NULL
   7086  * @options:  a combination of htmlParserOption(s)
   7087  *
   7088  * parse an XML from a file descriptor and build a tree.
   7089  * This reuses the existing @ctxt parser context
   7090  *
   7091  * Returns the resulting document tree
   7092  */
   7093 htmlDocPtr
   7094 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
   7095               const char *URL, const char *encoding, int options)
   7096 {
   7097     xmlParserInputBufferPtr input;
   7098     xmlParserInputPtr stream;
   7099 
   7100     if (fd < 0)
   7101         return (NULL);
   7102     if (ctxt == NULL)
   7103         return (NULL);
   7104     xmlInitParser();
   7105 
   7106     htmlCtxtReset(ctxt);
   7107 
   7108 
   7109     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
   7110     if (input == NULL)
   7111         return (NULL);
   7112     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
   7113     if (stream == NULL) {
   7114         xmlFreeParserInputBuffer(input);
   7115         return (NULL);
   7116     }
   7117     inputPush(ctxt, stream);
   7118     return (htmlDoRead(ctxt, URL, encoding, options, 1));
   7119 }
   7120 
   7121 /**
   7122  * htmlCtxtReadIO:
   7123  * @ctxt:  an HTML parser context
   7124  * @ioread:  an I/O read function
   7125  * @ioclose:  an I/O close function
   7126  * @ioctx:  an I/O handler
   7127  * @URL:  the base URL to use for the document
   7128  * @encoding:  the document encoding, or NULL
   7129  * @options:  a combination of htmlParserOption(s)
   7130  *
   7131  * parse an HTML document from I/O functions and source and build a tree.
   7132  * This reuses the existing @ctxt parser context
   7133  *
   7134  * Returns the resulting document tree
   7135  */
   7136 htmlDocPtr
   7137 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
   7138               xmlInputCloseCallback ioclose, void *ioctx,
   7139 	      const char *URL,
   7140               const char *encoding, int options)
   7141 {
   7142     xmlParserInputBufferPtr input;
   7143     xmlParserInputPtr stream;
   7144 
   7145     if (ioread == NULL)
   7146         return (NULL);
   7147     if (ctxt == NULL)
   7148         return (NULL);
   7149     xmlInitParser();
   7150 
   7151     htmlCtxtReset(ctxt);
   7152 
   7153     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
   7154                                          XML_CHAR_ENCODING_NONE);
   7155     if (input == NULL) {
   7156         if (ioclose != NULL)
   7157             ioclose(ioctx);
   7158         return (NULL);
   7159     }
   7160     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
   7161     if (stream == NULL) {
   7162         xmlFreeParserInputBuffer(input);
   7163         return (NULL);
   7164     }
   7165     inputPush(ctxt, stream);
   7166     return (htmlDoRead(ctxt, URL, encoding, options, 1));
   7167 }
   7168 
   7169 #define bottom_HTMLparser
   7170 #include "elfgcchack.h"
   7171 #endif /* LIBXML_HTML_ENABLED */
   7172