Home | History | Annotate | Download | only in src
      1 /*
      2  * HTMLparser.c : an HTML 4.0 non-verifying parser
      3  *
      4  * See Copyright for the status of this software.
      5  *
      6  * daniel (at) veillard.com
      7  */
      8 
      9 #define IN_LIBXML
     10 #include "libxml.h"
     11 #ifdef LIBXML_HTML_ENABLED
     12 
     13 #include <string.h>
     14 #ifdef HAVE_CTYPE_H
     15 #include <ctype.h>
     16 #endif
     17 #ifdef HAVE_STDLIB_H
     18 #include <stdlib.h>
     19 #endif
     20 #ifdef HAVE_SYS_STAT_H
     21 #include <sys/stat.h>
     22 #endif
     23 #ifdef HAVE_FCNTL_H
     24 #include <fcntl.h>
     25 #endif
     26 #ifdef HAVE_UNISTD_H
     27 #include <unistd.h>
     28 #endif
     29 #ifdef HAVE_ZLIB_H
     30 #include <zlib.h>
     31 #endif
     32 
     33 #include <libxml/xmlmemory.h>
     34 #include <libxml/tree.h>
     35 #include <libxml/parser.h>
     36 #include <libxml/parserInternals.h>
     37 #include <libxml/xmlerror.h>
     38 #include <libxml/HTMLparser.h>
     39 #include <libxml/HTMLtree.h>
     40 #include <libxml/entities.h>
     41 #include <libxml/encoding.h>
     42 #include <libxml/valid.h>
     43 #include <libxml/xmlIO.h>
     44 #include <libxml/globals.h>
     45 #include <libxml/uri.h>
     46 
/*
 * Size constants for the HTML parser. Their use sites are not part of this
 * section of the file.
 * NOTE(review): the names suggest a maximum element/attribute name length
 * and two scratch-buffer sizes - confirm at the use sites.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* Compile-time debugging switches, disabled by default. */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/*
 * File-local flag, initialised to 1.
 * NOTE(review): presumably controls how omitted/implied elements are
 * handled - confirm where it is read.
 */
static int htmlOmittedDefaultValue = 1;

/* Forward declarations for routines defined later in the file. */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
			     xmlChar end, xmlChar  end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);
     59 
     60 /************************************************************************
     61  *									*
     62  *		Some factorized error routines				*
     63  *									*
     64  ************************************************************************/
     65 
     66 /**
     67  * htmlErrMemory:
     68  * @ctxt:  an HTML parser context
     69  * @extra:  extra informations
     70  *
     71  * Handle a redefinition of attribute error
     72  */
     73 static void
     74 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
     75 {
     76     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
     77         (ctxt->instate == XML_PARSER_EOF))
     78 	return;
     79     if (ctxt != NULL) {
     80         ctxt->errNo = XML_ERR_NO_MEMORY;
     81         ctxt->instate = XML_PARSER_EOF;
     82         ctxt->disableSAX = 1;
     83     }
     84     if (extra)
     85         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
     86                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
     87                         NULL, NULL, 0, 0,
     88                         "Memory allocation failed : %s\n", extra);
     89     else
     90         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
     91                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
     92                         NULL, NULL, 0, 0, "Memory allocation failed\n");
     93 }
     94 
     95 /**
     96  * htmlParseErr:
     97  * @ctxt:  an HTML parser context
     98  * @error:  the error number
     99  * @msg:  the error message
    100  * @str1:  string infor
    101  * @str2:  string infor
    102  *
    103  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
    104  */
    105 static void
    106 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
    107              const char *msg, const xmlChar *str1, const xmlChar *str2)
    108 {
    109     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
    110         (ctxt->instate == XML_PARSER_EOF))
    111 	return;
    112     if (ctxt != NULL)
    113 	ctxt->errNo = error;
    114     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
    115                     XML_ERR_ERROR, NULL, 0,
    116 		    (const char *) str1, (const char *) str2,
    117 		    NULL, 0, 0,
    118 		    msg, str1, str2);
    119     if (ctxt != NULL)
    120 	ctxt->wellFormed = 0;
    121 }
    122 
    123 /**
    124  * htmlParseErrInt:
    125  * @ctxt:  an HTML parser context
    126  * @error:  the error number
    127  * @msg:  the error message
    128  * @val:  integer info
    129  *
    130  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
    131  */
    132 static void
    133 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
    134              const char *msg, int val)
    135 {
    136     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
    137         (ctxt->instate == XML_PARSER_EOF))
    138 	return;
    139     if (ctxt != NULL)
    140 	ctxt->errNo = error;
    141     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
    142                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
    143 		    NULL, val, 0, msg, val);
    144     if (ctxt != NULL)
    145 	ctxt->wellFormed = 0;
    146 }
    147 
    148 /************************************************************************
    149  *									*
    150  *	Parser stacks related functions and macros		*
    151  *									*
    152  ************************************************************************/
    153 
    154 /**
    155  * htmlnamePush:
    156  * @ctxt:  an HTML parser context
    157  * @value:  the element name
    158  *
    159  * Pushes a new element name on top of the name stack
    160  *
    161  * Returns 0 in case of error, the index in the stack otherwise
    162  */
    163 static int
    164 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
    165 {
    166     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
    167         ctxt->html = 3;
    168     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
    169         ctxt->html = 10;
    170     if (ctxt->nameNr >= ctxt->nameMax) {
    171         ctxt->nameMax *= 2;
    172         ctxt->nameTab = (const xmlChar * *)
    173                          xmlRealloc((xmlChar * *)ctxt->nameTab,
    174                                     ctxt->nameMax *
    175                                     sizeof(ctxt->nameTab[0]));
    176         if (ctxt->nameTab == NULL) {
    177             htmlErrMemory(ctxt, NULL);
    178             return (0);
    179         }
    180     }
    181     ctxt->nameTab[ctxt->nameNr] = value;
    182     ctxt->name = value;
    183     return (ctxt->nameNr++);
    184 }
    185 /**
    186  * htmlnamePop:
    187  * @ctxt: an HTML parser context
    188  *
    189  * Pops the top element name from the name stack
    190  *
    191  * Returns the name just removed
    192  */
    193 static const xmlChar *
    194 htmlnamePop(htmlParserCtxtPtr ctxt)
    195 {
    196     const xmlChar *ret;
    197 
    198     if (ctxt->nameNr <= 0)
    199         return (NULL);
    200     ctxt->nameNr--;
    201     if (ctxt->nameNr < 0)
    202         return (NULL);
    203     if (ctxt->nameNr > 0)
    204         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
    205     else
    206         ctxt->name = NULL;
    207     ret = ctxt->nameTab[ctxt->nameNr];
    208     ctxt->nameTab[ctxt->nameNr] = NULL;
    209     return (ret);
    210 }
    211 
    212 /**
    213  * htmlNodeInfoPush:
    214  * @ctxt:  an HTML parser context
    215  * @value:  the node info
    216  *
    217  * Pushes a new element name on top of the node info stack
    218  *
    219  * Returns 0 in case of error, the index in the stack otherwise
    220  */
    221 static int
    222 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
    223 {
    224     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
    225         if (ctxt->nodeInfoMax == 0)
    226                 ctxt->nodeInfoMax = 5;
    227         ctxt->nodeInfoMax *= 2;
    228         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
    229                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
    230                                     ctxt->nodeInfoMax *
    231                                     sizeof(ctxt->nodeInfoTab[0]));
    232         if (ctxt->nodeInfoTab == NULL) {
    233             htmlErrMemory(ctxt, NULL);
    234             return (0);
    235         }
    236     }
    237     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
    238     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
    239     return (ctxt->nodeInfoNr++);
    240 }
    241 
    242 /**
    243  * htmlNodeInfoPop:
    244  * @ctxt:  an HTML parser context
    245  *
    246  * Pops the top element name from the node info stack
    247  *
    248  * Returns 0 in case of error, the pointer to NodeInfo otherwise
    249  */
    250 static htmlParserNodeInfo *
    251 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
    252 {
    253     if (ctxt->nodeInfoNr <= 0)
    254         return (NULL);
    255     ctxt->nodeInfoNr--;
    256     if (ctxt->nodeInfoNr < 0)
    257         return (NULL);
    258     if (ctxt->nodeInfoNr > 0)
    259         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
    260     else
    261         ctxt->nodeInfo = NULL;
    262     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
    263 }
    264 
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named `ctxt' (the parser
 * context) to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
 *           to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *           NOTE(review): as defined below, CURRENT reads a single
 *           xmlChar from the input and casts it to int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 *           NOTE(review): no COPY macro is defined in this section; see
 *           COPY_BUF below.
 */

/* Current input byte converted with toupper(). */
#define UPPER (toupper(*ctxt->input->cur))

/*
 * Advance nbChars, the input cursor and the column counter by val.
 * Expands to a comma expression that is not parenthesized, and the line
 * counter is not touched.
 */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

/* The xmlChar val positions after the cursor; no bounds check is done. */
#define NXT(val) ctxt->input->cur[(val)]

/* Same as NXT(val), passed through toupper(). */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* The input cursor itself (usable as an lvalue). */
#define CUR_PTR ctxt->input->cur

/*
 * Call xmlParserInputShrink() once more than 2 * INPUT_CHUNK bytes lie
 * before the cursor and fewer than 2 * INPUT_CHUNK bytes remain after it.
 * Expands to a bare `if' statement.
 */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

/*
 * When ctxt->progressive is 0 and fewer than INPUT_CHUNK bytes remain
 * after the cursor, call xmlParserInputGrow(). Expands to a bare `if'
 * statement.
 */
#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current input xmlChar as an int. */
#define CURRENT ((int) (*ctxt->input->cur))

/* Skip blank characters at the cursor; yields the number skipped. */
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* -1 while ctxt->token is non-zero, otherwise the current input xmlChar. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/*
 * Advance the cursor by l xmlChars as one character: the line/column
 * bookkeeping looks only at the xmlChar under the cursor, ctxt->token is
 * reset to 0 and nbChars is incremented by one.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* Current character; its length is stored into the variable l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
/* Same through xmlStringCurrentChar() for the string s. */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * Append the character v of encoded length l to buffer b at index i and
 * advance i accordingly (direct store when l is 1, xmlCopyChar() otherwise).
 * NOTE(review): expands to a bare if/else, not a do { } while (0) block, so
 * it must not be used as the unbraced body of another if/else.
 */
#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v)
    344 
    345 /**
    346  * htmlFindEncoding:
    347  * @the HTML parser context
    348  *
    349  * Ty to find and encoding in the current data available in the input
    350  * buffer this is needed to try to switch to the proper encoding when
    351  * one face a character error.
    352  * That's an heuristic, since it's operating outside of parsing it could
    353  * try to use a meta which had been commented out, that's the reason it
    354  * should only be used in case of error, not as a default.
    355  *
    356  * Returns an encoding string or NULL if not found, the string need to
    357  *   be freed
    358  */
    359 static xmlChar *
    360 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
    361     const xmlChar *start, *cur, *end;
    362 
    363     if ((ctxt == NULL) || (ctxt->input == NULL) ||
    364         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
    365         (ctxt->input->buf->encoder != NULL))
    366         return(NULL);
    367     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
    368         return(NULL);
    369 
    370     start = ctxt->input->cur;
    371     end = ctxt->input->end;
    372     /* we also expect the input buffer to be zero terminated */
    373     if (*end != 0)
    374         return(NULL);
    375 
    376     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
    377     if (cur == NULL)
    378         return(NULL);
    379     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
    380     if (cur == NULL)
    381         return(NULL);
    382     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
    383     if (cur == NULL)
    384         return(NULL);
    385     cur += 8;
    386     start = cur;
    387     while (((*cur >= 'A') && (*cur <= 'Z')) ||
    388            ((*cur >= 'a') && (*cur <= 'z')) ||
    389            ((*cur >= '0') && (*cur <= '9')) ||
    390            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
    391            cur++;
    392     if (cur == start)
    393         return(NULL);
    394     return(xmlStrndup(start, cur - start));
    395 }
    396 
/**
 * htmlCurrentChar:
 * @ctxt:  the HTML parser context
 * @len:  pointer to the length of the char read
 *
 * The current char value, if using UTF-8 this may actually span multiple
 * bytes in the input buffer. Implement the end of line normalization:
 * 2.11 End-of-Line Handling
 * If the encoding is unspecified, in the case we find an ISO-Latin-1
 * char, then the encoding converter is plugged in automatically.
 *
 * Three paths exist:
 *  - ctxt->charset is UTF-8: the sequence at the cursor is decoded and
 *    checked; a malformed sequence jumps to encoding_error.
 *  - otherwise, a byte below 0x80 is returned as is.
 *  - otherwise (byte >= 0x80, charset not UTF-8): an encoding is guessed
 *    with htmlFindEncoding() (ISO-8859-1 when nothing is found), the input
 *    is switched to it, the charset is marked UTF-8 and the result of
 *    xmlCurrentChar() is returned.
 *
 * Returns the current char value and stores its length in bytes in *@len.
 * When the parser is in the XML_PARSER_EOF state, 0 is returned and *@len
 * is left untouched.
 */

static int
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
    if (ctxt->instate == XML_PARSER_EOF)
	return(0);

    /* A pending token takes precedence and consumes no input byte. */
    if (ctxt->token != 0) {
	*len = 0;
	return(ctxt->token);
    }
    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
	/*
	 * We are supposed to handle UTF8, check it's valid
	 * From rfc2044: encoding of the Unicode values on UTF-8:
	 *
	 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
	 * 0000 0000-0000 007F   0xxxxxxx
	 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
	 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
	 *
	 * Check for the 0x110000 limit too
	 */
	const unsigned char *cur = ctxt->input->cur;
	unsigned char c;
	unsigned int val;

	c = *cur;
	if (c & 0x80) {
	    /*
	     * A zero byte where a continuation byte is expected: ask for
	     * more input and reload the cursor before looking again.
	     * NOTE(review): cur is presumably reloaded because growing may
	     * move the buffer - confirm in xmlParserInputGrow().
	     */
	    if (cur[1] == 0) {
		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                cur = ctxt->input->cur;
            }
	    if ((cur[1] & 0xc0) != 0x80)
		goto encoding_error;
	    /* Lead byte 111xxxxx: at least a 3-byte sequence. */
	    if ((c & 0xe0) == 0xe0) {

		if (cur[2] == 0) {
		    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                    cur = ctxt->input->cur;
                }
		if ((cur[2] & 0xc0) != 0x80)
		    goto encoding_error;
		/* Lead byte 1111xxxx: must be exactly 11110xxx (4 bytes). */
		if ((c & 0xf0) == 0xf0) {
		    if (cur[3] == 0) {
			xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                        cur = ctxt->input->cur;
                    }
		    if (((c & 0xf8) != 0xf0) ||
			((cur[3] & 0xc0) != 0x80))
			goto encoding_error;
		    /* 4-byte code */
		    *len = 4;
		    val = (cur[0] & 0x7) << 18;
		    val |= (cur[1] & 0x3f) << 12;
		    val |= (cur[2] & 0x3f) << 6;
		    val |= cur[3] & 0x3f;
		} else {
		  /* 3-byte code */
		    *len = 3;
		    val = (cur[0] & 0xf) << 12;
		    val |= (cur[1] & 0x3f) << 6;
		    val |= cur[2] & 0x3f;
		}
	    } else {
	      /*
	       * 2-byte code
	       * NOTE(review): every lead byte with the high bit set that
	       * is not 111xxxxx ends up here, including 10xxxxxx values.
	       */
		*len = 2;
		val = (cur[0] & 0x1f) << 6;
		val |= cur[1] & 0x3f;
	    }
	    /* An out-of-range value is reported but still returned. */
	    if (!IS_CHAR(val)) {
	        htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				"Char 0x%X out of allowed range\n", val);
	    }
	    return(val);
	} else {
            /*
             * A zero byte located before the end of the input is reported
             * and handed back to the caller as a space.
             */
            if ((*ctxt->input->cur == 0) &&
                (ctxt->input->cur < ctxt->input->end)) {
                    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				"Char 0x%X out of allowed range\n", 0);
                *len = 1;
                return(' ');
            }
	    /* 1-byte code */
	    *len = 1;
	    return((int) *ctxt->input->cur);
	}
    }
    /*
     * Assume it's a fixed length encoding (1) with
     * a compatible encoding for the ASCII set, since
     * XML constructs only use < 128 chars
     */
    *len = 1;
    if ((int) *ctxt->input->cur < 0x80)
	return((int) *ctxt->input->cur);

    /*
     * Humm this is bad, do an automatic flow conversion
     */
    {
        xmlChar * guess;
        xmlCharEncodingHandlerPtr handler;

        guess = htmlFindEncoding(ctxt);
        if (guess == NULL) {
            xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
        } else {
            /*
             * The guessed name becomes the input's encoding string.
             * NOTE(review): htmlFindEncoding() returns NULL whenever
             * input->encoding is already set, so the xmlFree() below
             * looks unreachable.
             */
            if (ctxt->input->encoding != NULL)
                xmlFree((xmlChar *) ctxt->input->encoding);
            ctxt->input->encoding = guess;
            handler = xmlFindCharEncodingHandler((const char *) guess);
            if (handler != NULL) {
                xmlSwitchToEncoding(ctxt, handler);
            } else {
                htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
                             "Unsupported encoding %s", guess, NULL);
            }
        }
        /* Set in every case, including when no handler was found. */
        ctxt->charset = XML_CHAR_ENCODING_UTF8;
    }

    return(xmlCurrentChar(ctxt, len));

encoding_error:
    /*
     * If we detect an UTF8 error that probably mean that the
     * input encoding didn't get properly advertized in the
     * declaration header. Report the error and switch the encoding
     * to ISO-Latin-1 (if you don't like this policy, just declare the
     * encoding !)
     */
    {
        char buffer[150];

	/* Dump the four next bytes when available, else only the first. */
	if (ctxt->input->end - ctxt->input->cur >= 4) {
	    snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
			    ctxt->input->cur[0], ctxt->input->cur[1],
			    ctxt->input->cur[2], ctxt->input->cur[3]);
	} else {
	    snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
	}
	/*
	 * The message has no conversion specifier: the byte dump is only
	 * passed along as the first string argument of the error.
	 */
	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
		     "Input is not proper UTF-8, indicate encoding !\n",
		     BAD_CAST buffer, NULL);
    }

    /* From now on treat the input as ISO-8859-1, one byte per character. */
    ctxt->charset = XML_CHAR_ENCODING_8859_1;
    *len = 1;
    return((int) *ctxt->input->cur);
}
    560 
    561 /**
    562  * htmlSkipBlankChars:
    563  * @ctxt:  the HTML parser context
    564  *
    565  * skip all blanks character found at that point in the input streams.
    566  *
    567  * Returns the number of space chars skipped
    568  */
    569 
    570 static int
    571 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
    572     int res = 0;
    573 
    574     while (IS_BLANK_CH(*(ctxt->input->cur))) {
    575 	if ((*ctxt->input->cur == 0) &&
    576 	    (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
    577 		xmlPopInput(ctxt);
    578 	} else {
    579 	    if (*(ctxt->input->cur) == '\n') {
    580 		ctxt->input->line++; ctxt->input->col = 1;
    581 	    } else ctxt->input->col++;
    582 	    ctxt->input->cur++;
    583 	    ctxt->nbChars++;
    584 	    if (*ctxt->input->cur == 0)
    585 		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
    586 	}
    587 	res++;
    588     }
    589     return(res);
    590 }
    591 
    592 
    593 
    594 /************************************************************************
    595  *									*
    596  *	The list of HTML elements and their properties		*
    597  *									*
    598  ************************************************************************/
    599 
    600 /*
    601  *  Start Tag: 1 means the start tag can be omitted
    602  *  End Tag:   1 means the end tag can be omitted
    603  *             2 means it's forbidden (empty elements)
    604  *             3 means the tag is stylistic and should be closed easily
    605  *  Depr:      this element is deprecated
    606  *  DTD:       1 means that this element is valid only in the Loose DTD
    607  *             2 means that this element is valid only in the Frameset DTD
    608  *
    609  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
    610 	, subElements , impliedsubelt , Attributes, userdata
    611  */
    612 
    613 /* Definitions and a couple of vars for HTML Elements */
    614 
    615 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
    616 #define NB_FONTSTYLE 8
    617 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
    618 #define NB_PHRASE 10
    619 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
    620 #define NB_SPECIAL 16
    621 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
    622 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
    623 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
    624 #define NB_BLOCK NB_HEADING + NB_LIST + 14
    625 #define FORMCTRL "input", "select", "textarea", "label", "button"
    626 #define NB_FORMCTRL 5
    627 #define PCDATA
    628 #define NB_PCDATA 0
    629 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
    630 #define NB_HEADING 6
    631 #define LIST "ul", "ol", "dir", "menu"
    632 #define NB_LIST 4
    633 #define MODIFIER
    634 #define NB_MODIFIER 0
    635 #define FLOW BLOCK,INLINE
    636 #define NB_FLOW NB_BLOCK + NB_INLINE
    637 #define EMPTY NULL
    638 
    639 
    640 static const char* const html_flow[] = { FLOW, NULL } ;
    641 static const char* const html_inline[] = { INLINE, NULL } ;
    642 
    643 /* placeholders: elts with content but no subelements */
    644 static const char* const html_pcdata[] = { NULL } ;
    645 #define html_cdata html_pcdata
    646 
    647 
    648 /* ... and for HTML Attributes */
    649 
    650 #define COREATTRS "id", "class", "style", "title"
    651 #define NB_COREATTRS 4
    652 #define I18N "lang", "dir"
    653 #define NB_I18N 2
    654 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
    655 #define NB_EVENTS 9
    656 #define ATTRS COREATTRS,I18N,EVENTS
    657 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
    658 #define CELLHALIGN "align", "char", "charoff"
    659 #define NB_CELLHALIGN 3
    660 #define CELLVALIGN "valign"
    661 #define NB_CELLVALIGN 1
    662 
    663 static const char* const html_attrs[] = { ATTRS, NULL } ;
    664 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
    665 static const char* const core_attrs[] = { COREATTRS, NULL } ;
    666 static const char* const i18n_attrs[] = { I18N, NULL } ;
    667 
    668 
/* Other declarations that should go inline ... */
/*
 * Per-element lists of attribute names (*_attrs, *_attr, *_depr) and of
 * allowed child element names (*_contents, *_elt, inline_p, flow_param).
 * Every array in this group ends with a NULL entry.
 * NOTE(review): presumably these are referenced from the element
 * description table defined later in the file, and the *_depr lists hold
 * deprecated attributes - confirm there.
 */
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
	"tabindex", "onfocus", "onblur", NULL } ;
static const char* const target_attr[] = { "target", NULL } ;
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* const alt_attr[] = { "alt", NULL } ;
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* const href_attrs[] = { "href", NULL } ;
static const char* const clear_attrs[] = { "clear", NULL } ;
static const char* const inline_p[] = { INLINE, "p", NULL } ;

static const char* const flow_param[] = { FLOW, "param", NULL } ;
static const char* const applet_attrs[] = { COREATTRS , "codebase",
		"archive", "alt", "name", "height", "width", "align",
		"hspace", "vspace", NULL } ;
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const basefont_attrs[] =
	{ "id", "size", "color", "face", NULL } ;
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* const body_depr[] = { "background", "bgcolor", "text",
	"link", "vlink", "alink", NULL } ;
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const col_elt[] = { "col", NULL } ;
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
static const char* const compact_attr[] = { "compact", NULL } ;
static const char* const label_attr[] = { "label", NULL } ;
    705 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
/*
 * More per-element lists of attribute names (*_attrs, *_attr, *_depr) and
 * of allowed child element names (*_contents, *_content, *_elt).
 * Every array in this group ends with a NULL entry.
 * NOTE(review): presumably these are referenced from the element
 * description table defined later in the file, and the *_depr lists hold
 * deprecated attributes - confirm there.
 */
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
/* MODIFIER expands to nothing, so "FLOW MODIFIER" is just the FLOW list. */
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* const tr_elt[] = { "tr", NULL } ;
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* const tr_contents[] = { "th", "td", NULL } ;
    758 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
    759 static const char* const li_elt[] = { "li", NULL } ;
    760 static const char* const ul_depr[] = { "type", "compact", NULL} ;
    761 static const char* const dir_attr[] = { "dir", NULL} ;
    762 
/*
 * DECL casts one of the string lists declared above (they are
 * "const char* const []" arrays) to plain "const char**" so it can be
 * used as an initializer for the list members of the entries below.
 */
#define DECL (const char**)

/*
 * html40ElementTable:
 *
 * One entry per known HTML element, listed alphabetically by tag name.
 * htmlTagLookup() scans this table linearly with a case-insensitive
 * comparison on the name member, so the ordering is a convention and
 * not something the lookup depends on.
 *
 * Every entry is a positional initializer for htmlElemDesc, whose
 * declaration is not part of this file.
 * NOTE(review): the member names below are assumed from the public
 * htmlElemDesc declaration - confirm them against the header in use.
 *
 *   first line : name,
 *                startTag, endTag, saveEndTag, empty, depr, dtd, isinline,
 *                desc
 *   second line: subelts, defaultsubelt, attrs_opt, attrs_depr, attrs_req
 *
 * What the code in this file shows directly: the name member is what
 * htmlTagLookup() compares against, and htmlAutoCloseOnClose() reports
 * a tag mismatch when it has to close an element whose endTag is 3.
 */
static const htmlElemDesc
html40ElementTable[] = {
{ "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
},
{ "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "acronym",	0, 0, 0, 0, 0, 0, 1, "",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
},
{ "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
},
{ "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
},
{ "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
},
{ "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
},
{ "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
},
{ "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
},
{ "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
},
{ "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
},
{ "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
},
{ "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
},
{ "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
	EMPTY , NULL , DECL col_attrs , NULL, NULL
},
{ "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
},
{ "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
},
{ "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
},
{ "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
},
{ "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
},
{ "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
	EMPTY, NULL, DECL embed_attrs, NULL, NULL
},
{ "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
},
{ "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
},
{ "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
},
{ "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
	EMPTY, NULL, NULL, DECL frame_attrs, NULL
},
{ "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
},
{ "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
},
{ "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
},
{ "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
},
{ "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
},
{ "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
},
{ "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
},
{ "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
},
{ "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
},
{ "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
},
{ "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
},
{ "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
},
{ "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
},
{ "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
},
{ "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
},
{ "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
},
{ "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
},
{ "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
	DECL html_flow, "div", DECL html_attrs, NULL, NULL
},
{ "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
},
{ "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
},
{ "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
},
{ "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
},
{ "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
},
{ "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
},
{ "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
},
{ "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
},
{ "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
	DECL select_content, NULL, DECL select_attrs, NULL, NULL
},
{ "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
},
{ "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "table",	0, 0, 0, 0, 0, 0, 0, "",
	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
},
{ "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
},
{ "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
},
{ "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
},
{ "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
},
{ "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
}
};
   1044 
/*
 * start tags that imply the end of current element
 *
 * Layout: a flat sequence of NULL-terminated groups. Within a group the
 * first string is a start tag and the strings after it are the open
 * elements which that start tag closes implicitly. A NULL found where
 * the next group would begin marks the end of the whole table.
 *
 * htmlInitAutoClose() stores the address of the first string of each
 * group in htmlStartCloseIndex (it indexes at most 99 groups), and
 * htmlCheckAutoClose() walks the strings following that address.
 * The "p" group also pulls in the FONTSTYLE macro, which is defined
 * outside this part of the file.
 */
static const char * const htmlStartClose[] = {
"form",		"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
		"dl", "ul", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", "head", NULL,
"head",		"p", NULL,
"title",	"p", NULL,
"body",		"head", "style", "link", "title", "p", NULL,
"frameset",	"head", "style", "link", "title", "p", NULL,
"li",		"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
		"pre", "listing", "xmp", "head", "li", NULL,
"hr",		"p", "head", NULL,
"h1",		"p", "head", NULL,
"h2",		"p", "head", NULL,
"h3",		"p", "head", NULL,
"h4",		"p", "head", NULL,
"h5",		"p", "head", NULL,
"h6",		"p", "head", NULL,
"dir",		"p", "head", NULL,
"address",	"p", "head", "ul", NULL,
"pre",		"p", "head", "ul", NULL,
"listing",	"p", "head", NULL,
"xmp",		"p", "head", NULL,
"blockquote",	"p", "head", NULL,
"dl",		"p", "dt", "menu", "dir", "address", "pre", "listing",
		"xmp", "head", NULL,
"dt",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dd", NULL,
"dd",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dt", NULL,
"ul",		"p", "head", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", NULL,
"ol",		"p", "head", "ul", NULL,
"menu",		"p", "head", "ul", NULL,
"p",		"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
"div",		"p", "head", NULL,
"noscript",	"p", "head", NULL,
"center",	"font", "b", "i", "p", "head", NULL,
"a",		"a", NULL,
"caption",	"p", NULL,
"colgroup",	"caption", "colgroup", "col", "p", NULL,
"col",		"caption", "col", "p", NULL,
"table",	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
		"listing", "xmp", "a", NULL,
"th",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",		"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",	"caption", "col", "colgroup", NULL,
"tfoot",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tbody", "p", NULL,
"tbody",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tfoot", "tbody", "p", NULL,
"optgroup",	"option", NULL,
"option",	"option", NULL,
"fieldset",	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
		"pre", "listing", "xmp", "a", NULL,
NULL
};
   1105 
   1106 /*
   1107  * The list of HTML elements which are supposed not to have
   1108  * CDATA content and where a p element will be implied
   1109  *
   1110  * TODO: extend that list by reading the HTML SGML DTD on
   1111  *       implied paragraph
   1112  */
   1113 static const char *const htmlNoContentElements[] = {
   1114     "html",
   1115     "head",
   1116     NULL
   1117 };
   1118 
   1119 /*
   1120  * The list of HTML attributes which are of content %Script;
   1121  * NOTE: when adding ones, check htmlIsScriptAttribute() since
   1122  *       it assumes the name starts with 'on'
   1123  */
   1124 static const char *const htmlScriptAttributes[] = {
   1125     "onclick",
   1126     "ondblclick",
   1127     "onmousedown",
   1128     "onmouseup",
   1129     "onmouseover",
   1130     "onmousemove",
   1131     "onmouseout",
   1132     "onkeypress",
   1133     "onkeydown",
   1134     "onkeyup",
   1135     "onload",
   1136     "onunload",
   1137     "onfocus",
   1138     "onblur",
   1139     "onsubmit",
   1140     "onrest",
   1141     "onchange",
   1142     "onselect"
   1143 };
   1144 
   1145 /*
   1146  * This table is used by the htmlparser to know what to do with
   1147  * broken html pages. By assigning different priorities to different
   1148  * elements the parser can decide how to handle extra endtags.
   1149  * Endtags are only allowed to close elements with lower or equal
   1150  * priority.
   1151  */
   1152 
   1153 typedef struct {
   1154     const char *name;
   1155     int priority;
   1156 } elementPriority;
   1157 
   1158 static const elementPriority htmlEndPriority[] = {
   1159     {"div",   150},
   1160     {"td",    160},
   1161     {"th",    160},
   1162     {"tr",    170},
   1163     {"thead", 180},
   1164     {"tbody", 180},
   1165     {"tfoot", 180},
   1166     {"table", 190},
   1167     {"head",  200},
   1168     {"body",  200},
   1169     {"html",  220},
   1170     {NULL,    100} /* Default priority */
   1171 };
   1172 
   1173 static const char** htmlStartCloseIndex[100];
   1174 static int htmlStartCloseIndexinitialized = 0;
   1175 
   1176 /************************************************************************
   1177  *									*
   1178  *	functions to handle HTML specific data			*
   1179  *									*
   1180  ************************************************************************/
   1181 
   1182 /**
   1183  * htmlInitAutoClose:
   1184  *
   1185  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
   1186  * This is not reentrant. Call xmlInitParser() once before processing in
   1187  * case of use in multithreaded programs.
   1188  */
   1189 void
   1190 htmlInitAutoClose(void) {
   1191     int indx, i = 0;
   1192 
   1193     if (htmlStartCloseIndexinitialized) return;
   1194 
   1195     for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
   1196     indx = 0;
   1197     while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
   1198         htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
   1199 	while (htmlStartClose[i] != NULL) i++;
   1200 	i++;
   1201     }
   1202     htmlStartCloseIndexinitialized = 1;
   1203 }
   1204 
   1205 /**
   1206  * htmlTagLookup:
   1207  * @tag:  The tag name in lowercase
   1208  *
   1209  * Lookup the HTML tag in the ElementTable
   1210  *
   1211  * Returns the related htmlElemDescPtr or NULL if not found.
   1212  */
   1213 const htmlElemDesc *
   1214 htmlTagLookup(const xmlChar *tag) {
   1215     unsigned int i;
   1216 
   1217     for (i = 0; i < (sizeof(html40ElementTable) /
   1218                      sizeof(html40ElementTable[0]));i++) {
   1219         if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
   1220 	    return((htmlElemDescPtr) &html40ElementTable[i]);
   1221     }
   1222     return(NULL);
   1223 }
   1224 
   1225 /**
   1226  * htmlGetEndPriority:
   1227  * @name: The name of the element to look up the priority for.
   1228  *
   1229  * Return value: The "endtag" priority.
   1230  **/
   1231 static int
   1232 htmlGetEndPriority (const xmlChar *name) {
   1233     int i = 0;
   1234 
   1235     while ((htmlEndPriority[i].name != NULL) &&
   1236 	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
   1237 	i++;
   1238 
   1239     return(htmlEndPriority[i].priority);
   1240 }
   1241 
   1242 
   1243 /**
   1244  * htmlCheckAutoClose:
   1245  * @newtag:  The new tag name
   1246  * @oldtag:  The old tag name
   1247  *
   1248  * Checks whether the new tag is one of the registered valid tags for
   1249  * closing old.
   1250  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
   1251  *
   1252  * Returns 0 if no, 1 if yes.
   1253  */
   1254 static int
   1255 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
   1256 {
   1257     int i, indx;
   1258     const char **closed = NULL;
   1259 
   1260     if (htmlStartCloseIndexinitialized == 0)
   1261         htmlInitAutoClose();
   1262 
   1263     /* inefficient, but not a big deal */
   1264     for (indx = 0; indx < 100; indx++) {
   1265         closed = htmlStartCloseIndex[indx];
   1266         if (closed == NULL)
   1267             return (0);
   1268         if (xmlStrEqual(BAD_CAST * closed, newtag))
   1269             break;
   1270     }
   1271 
   1272     i = closed - htmlStartClose;
   1273     i++;
   1274     while (htmlStartClose[i] != NULL) {
   1275         if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
   1276             return (1);
   1277         }
   1278         i++;
   1279     }
   1280     return (0);
   1281 }
   1282 
   1283 /**
   1284  * htmlAutoCloseOnClose:
   1285  * @ctxt:  an HTML parser context
   1286  * @newtag:  The new tag name
   1287  * @force:  force the tag closure
   1288  *
   1289  * The HTML DTD allows an ending tag to implicitly close other tags.
   1290  */
   1291 static void
   1292 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
   1293 {
   1294     const htmlElemDesc *info;
   1295     int i, priority;
   1296 
   1297     priority = htmlGetEndPriority(newtag);
   1298 
   1299     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
   1300 
   1301         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
   1302             break;
   1303         /*
   1304          * A missplaced endtag can only close elements with lower
   1305          * or equal priority, so if we find an element with higher
   1306          * priority before we find an element with
   1307          * matching name, we just ignore this endtag
   1308          */
   1309         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
   1310             return;
   1311     }
   1312     if (i < 0)
   1313         return;
   1314 
   1315     while (!xmlStrEqual(newtag, ctxt->name)) {
   1316         info = htmlTagLookup(ctxt->name);
   1317         if ((info != NULL) && (info->endTag == 3)) {
   1318             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
   1319 	                 "Opening and ending tag mismatch: %s and %s\n",
   1320 			 newtag, ctxt->name);
   1321         }
   1322         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   1323             ctxt->sax->endElement(ctxt->userData, ctxt->name);
   1324 	htmlnamePop(ctxt);
   1325     }
   1326 }
   1327 
   1328 /**
   1329  * htmlAutoCloseOnEnd:
   1330  * @ctxt:  an HTML parser context
   1331  *
   1332  * Close all remaining tags at the end of the stream
   1333  */
   1334 static void
   1335 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
   1336 {
   1337     int i;
   1338 
   1339     if (ctxt->nameNr == 0)
   1340         return;
   1341     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
   1342         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   1343             ctxt->sax->endElement(ctxt->userData, ctxt->name);
   1344 	htmlnamePop(ctxt);
   1345     }
   1346 }
   1347 
   1348 /**
   1349  * htmlAutoClose:
   1350  * @ctxt:  an HTML parser context
   1351  * @newtag:  The new tag name or NULL
   1352  *
   1353  * The HTML DTD allows a tag to implicitly close other tags.
   1354  * The list is kept in htmlStartClose array. This function is
   1355  * called when a new tag has been detected and generates the
   1356  * appropriates closes if possible/needed.
   1357  * If newtag is NULL this mean we are at the end of the resource
   1358  * and we should check
   1359  */
   1360 static void
   1361 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
   1362 {
   1363     while ((newtag != NULL) && (ctxt->name != NULL) &&
   1364            (htmlCheckAutoClose(newtag, ctxt->name))) {
   1365         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   1366             ctxt->sax->endElement(ctxt->userData, ctxt->name);
   1367 	htmlnamePop(ctxt);
   1368     }
   1369     if (newtag == NULL) {
   1370         htmlAutoCloseOnEnd(ctxt);
   1371         return;
   1372     }
   1373     while ((newtag == NULL) && (ctxt->name != NULL) &&
   1374            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
   1375             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
   1376             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
   1377         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   1378             ctxt->sax->endElement(ctxt->userData, ctxt->name);
   1379 	htmlnamePop(ctxt);
   1380     }
   1381 }
   1382 
   1383 /**
   1384  * htmlAutoCloseTag:
   1385  * @doc:  the HTML document
   1386  * @name:  The tag name
   1387  * @elem:  the HTML element
   1388  *
   1389  * The HTML DTD allows a tag to implicitly close other tags.
   1390  * The list is kept in htmlStartClose array. This function checks
   1391  * if the element or one of it's children would autoclose the
   1392  * given tag.
   1393  *
   1394  * Returns 1 if autoclose, 0 otherwise
   1395  */
   1396 int
   1397 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
   1398     htmlNodePtr child;
   1399 
   1400     if (elem == NULL) return(1);
   1401     if (xmlStrEqual(name, elem->name)) return(0);
   1402     if (htmlCheckAutoClose(elem->name, name)) return(1);
   1403     child = elem->children;
   1404     while (child != NULL) {
   1405         if (htmlAutoCloseTag(doc, name, child)) return(1);
   1406 	child = child->next;
   1407     }
   1408     return(0);
   1409 }
   1410 
   1411 /**
   1412  * htmlIsAutoClosed:
   1413  * @doc:  the HTML document
   1414  * @elem:  the HTML element
   1415  *
   1416  * The HTML DTD allows a tag to implicitly close other tags.
   1417  * The list is kept in htmlStartClose array. This function checks
   1418  * if a tag is autoclosed by one of it's child
   1419  *
   1420  * Returns 1 if autoclosed, 0 otherwise
   1421  */
   1422 int
   1423 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
   1424     htmlNodePtr child;
   1425 
   1426     if (elem == NULL) return(1);
   1427     child = elem->children;
   1428     while (child != NULL) {
   1429 	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
   1430 	child = child->next;
   1431     }
   1432     return(0);
   1433 }
   1434 
   1435 /**
   1436  * htmlCheckImplied:
   1437  * @ctxt:  an HTML parser context
   1438  * @newtag:  The new tag name
   1439  *
   1440  * The HTML DTD allows a tag to exists only implicitly
   1441  * called when a new tag has been detected and generates the
   1442  * appropriates implicit tags if missing
   1443  */
   1444 static void
   1445 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
   1446     int i;
   1447 
   1448     if (ctxt->options & HTML_PARSE_NOIMPLIED)
   1449         return;
   1450     if (!htmlOmittedDefaultValue)
   1451 	return;
   1452     if (xmlStrEqual(newtag, BAD_CAST"html"))
   1453 	return;
   1454     if (ctxt->nameNr <= 0) {
   1455 	htmlnamePush(ctxt, BAD_CAST"html");
   1456 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1457 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
   1458     }
   1459     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
   1460         return;
   1461     if ((ctxt->nameNr <= 1) &&
   1462         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
   1463 	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
   1464 	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
   1465 	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
   1466 	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
   1467 	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
   1468         if (ctxt->html >= 3) {
   1469             /* we already saw or generated an <head> before */
   1470             return;
   1471         }
   1472         /*
   1473          * dropped OBJECT ... i you put it first BODY will be
   1474          * assumed !
   1475          */
   1476         htmlnamePush(ctxt, BAD_CAST"head");
   1477         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1478             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
   1479     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
   1480 	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
   1481 	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
   1482         if (ctxt->html >= 10) {
   1483             /* we already saw or generated a <body> before */
   1484             return;
   1485         }
   1486 	for (i = 0;i < ctxt->nameNr;i++) {
   1487 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
   1488 		return;
   1489 	    }
   1490 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
   1491 		return;
   1492 	    }
   1493 	}
   1494 
   1495 	htmlnamePush(ctxt, BAD_CAST"body");
   1496 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1497 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
   1498     }
   1499 }
   1500 
   1501 /**
   1502  * htmlCheckParagraph
   1503  * @ctxt:  an HTML parser context
   1504  *
   1505  * Check whether a p element need to be implied before inserting
   1506  * characters in the current element.
   1507  *
   1508  * Returns 1 if a paragraph has been inserted, 0 if not and -1
   1509  *         in case of error.
   1510  */
   1511 
   1512 static int
   1513 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
   1514     const xmlChar *tag;
   1515     int i;
   1516 
   1517     if (ctxt == NULL)
   1518 	return(-1);
   1519     tag = ctxt->name;
   1520     if (tag == NULL) {
   1521 	htmlAutoClose(ctxt, BAD_CAST"p");
   1522 	htmlCheckImplied(ctxt, BAD_CAST"p");
   1523 	htmlnamePush(ctxt, BAD_CAST"p");
   1524 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1525 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
   1526 	return(1);
   1527     }
   1528     if (!htmlOmittedDefaultValue)
   1529 	return(0);
   1530     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
   1531 	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
   1532 	    htmlAutoClose(ctxt, BAD_CAST"p");
   1533 	    htmlCheckImplied(ctxt, BAD_CAST"p");
   1534 	    htmlnamePush(ctxt, BAD_CAST"p");
   1535 	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1536 		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
   1537 	    return(1);
   1538 	}
   1539     }
   1540     return(0);
   1541 }
   1542 
   1543 /**
   1544  * htmlIsScriptAttribute:
   1545  * @name:  an attribute name
   1546  *
   1547  * Check if an attribute is of content type Script
   1548  *
   1549  * Returns 1 is the attribute is a script 0 otherwise
   1550  */
   1551 int
   1552 htmlIsScriptAttribute(const xmlChar *name) {
   1553     unsigned int i;
   1554 
   1555     if (name == NULL)
   1556       return(0);
   1557     /*
   1558      * all script attributes start with 'on'
   1559      */
   1560     if ((name[0] != 'o') || (name[1] != 'n'))
   1561       return(0);
   1562     for (i = 0;
   1563 	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
   1564 	 i++) {
   1565 	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
   1566 	    return(1);
   1567     }
   1568     return(0);
   1569 }
   1570 
   1571 /************************************************************************
   1572  *									*
   1573  *	The list of HTML predefined entities			*
   1574  *									*
   1575  ************************************************************************/
   1576 
   1577 
   1578 static const htmlEntityDesc  html40EntitiesTable[] = {
   1579 /*
   1580  * the 4 absolute ones, plus apostrophe (keep entries sorted by value).
   1581  */
   1582 { 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
   1583 { 38,	"amp",	"ampersand, U+0026 ISOnum" },
   1584 { 39,	"apos",	"single quote" },
   1585 { 60,	"lt",	"less-than sign, U+003C ISOnum" },
   1586 { 62,	"gt",	"greater-than sign, U+003E ISOnum" },
   1587 
   1588 /*
   1589  * A bunch still in the 128-255 range
   1590  * Replacing them really depends on the charset used.
   1591  */
   1592 { 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
   1593 { 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
   1594 { 162,	"cent",	"cent sign, U+00A2 ISOnum" },
   1595 { 163,	"pound","pound sign, U+00A3 ISOnum" },
   1596 { 164,	"curren","currency sign, U+00A4 ISOnum" },
   1597 { 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
   1598 { 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
   1599 { 167,	"sect",	"section sign, U+00A7 ISOnum" },
   1600 { 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
   1601 { 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
   1602 { 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
   1603 { 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
   1604 { 172,	"not",	"not sign, U+00AC ISOnum" },
   1605 { 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
   1606 { 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
   1607 { 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
   1608 { 176,	"deg",	"degree sign, U+00B0 ISOnum" },
   1609 { 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
   1610 { 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
   1611 { 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
   1612 { 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
   1613 { 181,	"micro","micro sign, U+00B5 ISOnum" },
   1614 { 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
   1615 { 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
   1616 { 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
   1617 { 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
   1618 { 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
   1619 { 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
   1620 { 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
   1621 { 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
   1622 { 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
   1623 { 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
   1624 { 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
   1625 { 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
   1626 { 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
   1627 { 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
   1628 { 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
   1629 { 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
   1630 { 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
   1631 { 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
   1632 { 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
   1633 { 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
   1634 { 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
   1635 { 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
   1636 { 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
   1637 { 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
   1638 { 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
   1639 { 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
   1640 { 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
   1641 { 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
   1642 { 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
   1643 { 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
   1644 { 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
   1645 { 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
   1646 { 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
   1647 { 215,	"times","multiplication sign, U+00D7 ISOnum" },
   1648 { 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
   1649 { 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
   1650 { 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
   1651 { 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
   1652 { 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
   1653 { 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
   1654 { 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
   1655 { 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
   1656 { 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
   1657 { 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
   1658 { 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
   1659 { 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
   1660 { 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
   1661 { 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
   1662 { 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
   1663 { 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
   1664 { 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
   1665 { 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
   1666 { 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
   1667 { 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
   1668 { 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
   1669 { 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
   1670 { 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
   1671 { 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
   1672 { 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
   1673 { 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
   1674 { 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
   1675 { 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
   1676 { 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
   1677 { 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
   1678 { 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
   1679 { 247,	"divide","division sign, U+00F7 ISOnum" },
   1680 { 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
   1681 { 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
   1682 { 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
   1683 { 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
   1684 { 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
   1685 { 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
   1686 { 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
   1687 { 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },
   1688 
   1689 { 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
   1690 { 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
   1691 { 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
   1692 { 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
   1693 { 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },
   1694 
   1695 /*
   1696  * Anything below should really be kept as entity references
   1697  */
   1698 { 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },
   1699 
   1700 { 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
   1701 { 732,	"tilde","small tilde, U+02DC ISOdia" },
   1702 
   1703 { 913,	"Alpha","greek capital letter alpha, U+0391" },
   1704 { 914,	"Beta",	"greek capital letter beta, U+0392" },
   1705 { 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
   1706 { 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
   1707 { 917,	"Epsilon","greek capital letter epsilon, U+0395" },
   1708 { 918,	"Zeta",	"greek capital letter zeta, U+0396" },
   1709 { 919,	"Eta",	"greek capital letter eta, U+0397" },
   1710 { 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
   1711 { 921,	"Iota",	"greek capital letter iota, U+0399" },
   1712 { 922,	"Kappa","greek capital letter kappa, U+039A" },
   1713 { 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
   1714 { 924,	"Mu",	"greek capital letter mu, U+039C" },
   1715 { 925,	"Nu",	"greek capital letter nu, U+039D" },
   1716 { 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
   1717 { 927,	"Omicron","greek capital letter omicron, U+039F" },
   1718 { 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
   1719 { 929,	"Rho",	"greek capital letter rho, U+03A1" },
   1720 { 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
   1721 { 932,	"Tau",	"greek capital letter tau, U+03A4" },
   1722 { 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
   1723 { 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
   1724 { 935,	"Chi",	"greek capital letter chi, U+03A7" },
   1725 { 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
   1726 { 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },
   1727 
   1728 { 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
   1729 { 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
   1730 { 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
   1731 { 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
   1732 { 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
   1733 { 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
   1734 { 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
   1735 { 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
   1736 { 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
   1737 { 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
   1738 { 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
   1739 { 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
   1740 { 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
   1741 { 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
   1742 { 959,	"omicron","greek small letter omicron, U+03BF NEW" },
   1743 { 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
   1744 { 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
   1745 { 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
   1746 { 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
   1747 { 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
   1748 { 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
   1749 { 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
   1750 { 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
   1751 { 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
   1752 { 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
   1753 { 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
   1754 { 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
   1755 { 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },
   1756 
   1757 { 8194,	"ensp",	"en space, U+2002 ISOpub" },
   1758 { 8195,	"emsp",	"em space, U+2003 ISOpub" },
   1759 { 8201,	"thinsp","thin space, U+2009 ISOpub" },
   1760 { 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
   1761 { 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
   1762 { 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
   1763 { 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
   1764 { 8211,	"ndash","en dash, U+2013 ISOpub" },
   1765 { 8212,	"mdash","em dash, U+2014 ISOpub" },
   1766 { 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
   1767 { 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
   1768 { 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
   1769 { 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
   1770 { 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
   1771 { 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
   1772 { 8224,	"dagger","dagger, U+2020 ISOpub" },
   1773 { 8225,	"Dagger","double dagger, U+2021 ISOpub" },
   1774 
   1775 { 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
   1776 { 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
   1777 
   1778 { 8240,	"permil","per mille sign, U+2030 ISOtech" },
   1779 
   1780 { 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
   1781 { 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },
   1782 
   1783 { 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
   1784 { 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
   1785 
   1786 { 8254,	"oline","overline = spacing overscore, U+203E NEW" },
   1787 { 8260,	"frasl","fraction slash, U+2044 NEW" },
   1788 
   1789 { 8364,	"euro",	"euro sign, U+20AC NEW" },
   1790 
   1791 { 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
   1792 { 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
   1793 { 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
   1794 { 8482,	"trade","trade mark sign, U+2122 ISOnum" },
   1795 { 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
   1796 { 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
   1797 { 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
   1798 { 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
   1799 { 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
   1800 { 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
   1801 { 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
   1802 { 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
   1803 { 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
   1804 { 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
   1805 { 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
   1806 { 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },
   1807 
   1808 { 8704,	"forall","for all, U+2200 ISOtech" },
   1809 { 8706,	"part",	"partial differential, U+2202 ISOtech" },
   1810 { 8707,	"exist","there exists, U+2203 ISOtech" },
   1811 { 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
   1812 { 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
   1813 { 8712,	"isin",	"element of, U+2208 ISOtech" },
   1814 { 8713,	"notin","not an element of, U+2209 ISOtech" },
   1815 { 8715,	"ni",	"contains as member, U+220B ISOtech" },
   1816 { 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
   1817 { 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
   1818 { 8722,	"minus","minus sign, U+2212 ISOtech" },
   1819 { 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
   1820 { 8730,	"radic","square root = radical sign, U+221A ISOtech" },
   1821 { 8733,	"prop",	"proportional to, U+221D ISOtech" },
   1822 { 8734,	"infin","infinity, U+221E ISOtech" },
   1823 { 8736,	"ang",	"angle, U+2220 ISOamso" },
   1824 { 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
   1825 { 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
   1826 { 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
   1827 { 8746,	"cup",	"union = cup, U+222A ISOtech" },
   1828 { 8747,	"int",	"integral, U+222B ISOtech" },
   1829 { 8756,	"there4","therefore, U+2234 ISOtech" },
   1830 { 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
   1831 { 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
   1832 { 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
   1833 { 8800,	"ne",	"not equal to, U+2260 ISOtech" },
   1834 { 8801,	"equiv","identical to, U+2261 ISOtech" },
   1835 { 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
   1836 { 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
   1837 { 8834,	"sub",	"subset of, U+2282 ISOtech" },
   1838 { 8835,	"sup",	"superset of, U+2283 ISOtech" },
   1839 { 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
   1840 { 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
   1841 { 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
   1842 { 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
   1843 { 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
   1844 { 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
   1845 { 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
   1846 { 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
   1847 { 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
   1848 { 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
   1849 { 8971,	"rfloor","right floor, U+230B ISOamsc" },
   1850 { 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
   1851 { 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
   1852 { 9674,	"loz",	"lozenge, U+25CA ISOpub" },
   1853 
   1854 { 9824,	"spades","black spade suit, U+2660 ISOpub" },
   1855 { 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
   1856 { 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
   1857 { 9830,	"diams","black diamond suit, U+2666 ISOpub" },
   1858 
   1859 };
   1860 
   1861 /************************************************************************
   1862  *									*
   1863  *		Commodity functions to handle entities			*
   1864  *									*
   1865  ************************************************************************/
   1866 
   1867 /*
   1868  * Macro used to grow the current buffer.
   1869  */
   1870 #define growBuffer(buffer) {						\
   1871     xmlChar *tmp;							\
   1872     buffer##_size *= 2;							\
   1873     tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
   1874     if (tmp == NULL) {						\
   1875 	htmlErrMemory(ctxt, "growing buffer\n");			\
   1876 	xmlFree(buffer);						\
   1877 	return(NULL);							\
   1878     }									\
   1879     buffer = tmp;							\
   1880 }
   1881 
   1882 /**
   1883  * htmlEntityLookup:
   1884  * @name: the entity name
   1885  *
   1886  * Lookup the given entity in EntitiesTable
   1887  *
   1888  * TODO: the linear scan is really ugly, an hash table is really needed.
   1889  *
   1890  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
   1891  */
   1892 const htmlEntityDesc *
   1893 htmlEntityLookup(const xmlChar *name) {
   1894     unsigned int i;
   1895 
   1896     for (i = 0;i < (sizeof(html40EntitiesTable)/
   1897                     sizeof(html40EntitiesTable[0]));i++) {
   1898         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
   1899             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
   1900 	}
   1901     }
   1902     return(NULL);
   1903 }
   1904 
   1905 /**
   1906  * htmlEntityValueLookup:
   1907  * @value: the entity's unicode value
   1908  *
   1909  * Lookup the given entity in EntitiesTable
   1910  *
   1911  * TODO: the linear scan is really ugly, an hash table is really needed.
   1912  *
   1913  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
   1914  */
   1915 const htmlEntityDesc *
   1916 htmlEntityValueLookup(unsigned int value) {
   1917     unsigned int i;
   1918 
   1919     for (i = 0;i < (sizeof(html40EntitiesTable)/
   1920                     sizeof(html40EntitiesTable[0]));i++) {
   1921         if (html40EntitiesTable[i].value >= value) {
   1922 	    if (html40EntitiesTable[i].value > value)
   1923 		break;
   1924             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
   1925 	}
   1926     }
   1927     return(NULL);
   1928 }
   1929 
   1930 /**
   1931  * UTF8ToHtml:
   1932  * @out:  a pointer to an array of bytes to store the result
   1933  * @outlen:  the length of @out
   1934  * @in:  a pointer to an array of UTF-8 chars
   1935  * @inlen:  the length of @in
   1936  *
   1937  * Take a block of UTF-8 chars in and try to convert it to an ASCII
   1938  * plus HTML entities block of chars out.
   1939  *
   1940  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
   1941  * The value of @inlen after return is the number of octets consumed
   1942  *     as the return value is positive, else unpredictable.
   1943  * The value of @outlen after return is the number of octets consumed.
   1944  */
   1945 int
   1946 UTF8ToHtml(unsigned char* out, int *outlen,
   1947               const unsigned char* in, int *inlen) {
   1948     const unsigned char* processed = in;
   1949     const unsigned char* outend;
   1950     const unsigned char* outstart = out;
   1951     const unsigned char* instart = in;
   1952     const unsigned char* inend;
   1953     unsigned int c, d;
   1954     int trailing;
   1955 
   1956     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
   1957     if (in == NULL) {
   1958         /*
   1959 	 * initialization nothing to do
   1960 	 */
   1961 	*outlen = 0;
   1962 	*inlen = 0;
   1963 	return(0);
   1964     }
   1965     inend = in + (*inlen);
   1966     outend = out + (*outlen);
   1967     while (in < inend) {
   1968 	d = *in++;
   1969 	if      (d < 0x80)  { c= d; trailing= 0; }
   1970 	else if (d < 0xC0) {
   1971 	    /* trailing byte in leading position */
   1972 	    *outlen = out - outstart;
   1973 	    *inlen = processed - instart;
   1974 	    return(-2);
   1975         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
   1976         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
   1977         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
   1978 	else {
   1979 	    /* no chance for this in Ascii */
   1980 	    *outlen = out - outstart;
   1981 	    *inlen = processed - instart;
   1982 	    return(-2);
   1983 	}
   1984 
   1985 	if (inend - in < trailing) {
   1986 	    break;
   1987 	}
   1988 
   1989 	for ( ; trailing; trailing--) {
   1990 	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
   1991 		break;
   1992 	    c <<= 6;
   1993 	    c |= d & 0x3F;
   1994 	}
   1995 
   1996 	/* assertion: c is a single UTF-4 value */
   1997 	if (c < 0x80) {
   1998 	    if (out + 1 >= outend)
   1999 		break;
   2000 	    *out++ = c;
   2001 	} else {
   2002 	    int len;
   2003 	    const htmlEntityDesc * ent;
   2004 	    const char *cp;
   2005 	    char nbuf[16];
   2006 
   2007 	    /*
   2008 	     * Try to lookup a predefined HTML entity for it
   2009 	     */
   2010 
   2011 	    ent = htmlEntityValueLookup(c);
   2012 	    if (ent == NULL) {
   2013 	      snprintf(nbuf, sizeof(nbuf), "#%u", c);
   2014 	      cp = nbuf;
   2015 	    }
   2016 	    else
   2017 	      cp = ent->name;
   2018 	    len = strlen(cp);
   2019 	    if (out + 2 + len >= outend)
   2020 		break;
   2021 	    *out++ = '&';
   2022 	    memcpy(out, cp, len);
   2023 	    out += len;
   2024 	    *out++ = ';';
   2025 	}
   2026 	processed = in;
   2027     }
   2028     *outlen = out - outstart;
   2029     *inlen = processed - instart;
   2030     return(0);
   2031 }
   2032 
   2033 /**
   2034  * htmlEncodeEntities:
   2035  * @out:  a pointer to an array of bytes to store the result
   2036  * @outlen:  the length of @out
   2037  * @in:  a pointer to an array of UTF-8 chars
   2038  * @inlen:  the length of @in
   2039  * @quoteChar: the quote character to escape (' or ") or zero.
   2040  *
   2041  * Take a block of UTF-8 chars in and try to convert it to an ASCII
   2042  * plus HTML entities block of chars out.
   2043  *
   2044  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
   2045  * The value of @inlen after return is the number of octets consumed
   2046  *     as the return value is positive, else unpredictable.
   2047  * The value of @outlen after return is the number of octets consumed.
   2048  */
   2049 int
   2050 htmlEncodeEntities(unsigned char* out, int *outlen,
   2051 		   const unsigned char* in, int *inlen, int quoteChar) {
   2052     const unsigned char* processed = in;
   2053     const unsigned char* outend;
   2054     const unsigned char* outstart = out;
   2055     const unsigned char* instart = in;
   2056     const unsigned char* inend;
   2057     unsigned int c, d;
   2058     int trailing;
   2059 
   2060     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
   2061         return(-1);
   2062     outend = out + (*outlen);
   2063     inend = in + (*inlen);
   2064     while (in < inend) {
   2065 	d = *in++;
   2066 	if      (d < 0x80)  { c= d; trailing= 0; }
   2067 	else if (d < 0xC0) {
   2068 	    /* trailing byte in leading position */
   2069 	    *outlen = out - outstart;
   2070 	    *inlen = processed - instart;
   2071 	    return(-2);
   2072         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
   2073         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
   2074         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
   2075 	else {
   2076 	    /* no chance for this in Ascii */
   2077 	    *outlen = out - outstart;
   2078 	    *inlen = processed - instart;
   2079 	    return(-2);
   2080 	}
   2081 
   2082 	if (inend - in < trailing)
   2083 	    break;
   2084 
   2085 	while (trailing--) {
   2086 	    if (((d= *in++) & 0xC0) != 0x80) {
   2087 		*outlen = out - outstart;
   2088 		*inlen = processed - instart;
   2089 		return(-2);
   2090 	    }
   2091 	    c <<= 6;
   2092 	    c |= d & 0x3F;
   2093 	}
   2094 
   2095 	/* assertion: c is a single UTF-4 value */
   2096 	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
   2097 	    (c != '&') && (c != '<') && (c != '>')) {
   2098 	    if (out >= outend)
   2099 		break;
   2100 	    *out++ = c;
   2101 	} else {
   2102 	    const htmlEntityDesc * ent;
   2103 	    const char *cp;
   2104 	    char nbuf[16];
   2105 	    int len;
   2106 
   2107 	    /*
   2108 	     * Try to lookup a predefined HTML entity for it
   2109 	     */
   2110 	    ent = htmlEntityValueLookup(c);
   2111 	    if (ent == NULL) {
   2112 		snprintf(nbuf, sizeof(nbuf), "#%u", c);
   2113 		cp = nbuf;
   2114 	    }
   2115 	    else
   2116 		cp = ent->name;
   2117 	    len = strlen(cp);
   2118 	    if (out + 2 + len > outend)
   2119 		break;
   2120 	    *out++ = '&';
   2121 	    memcpy(out, cp, len);
   2122 	    out += len;
   2123 	    *out++ = ';';
   2124 	}
   2125 	processed = in;
   2126     }
   2127     *outlen = out - outstart;
   2128     *inlen = processed - instart;
   2129     return(0);
   2130 }
   2131 
   2132 /************************************************************************
   2133  *									*
   2134  *		Commodity functions to handle streams			*
   2135  *									*
   2136  ************************************************************************/
   2137 
   2138 /**
   2139  * htmlNewInputStream:
   2140  * @ctxt:  an HTML parser context
   2141  *
   2142  * Create a new input stream structure
   2143  * Returns the new input stream or NULL
   2144  */
   2145 static htmlParserInputPtr
   2146 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
   2147     htmlParserInputPtr input;
   2148 
   2149     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
   2150     if (input == NULL) {
   2151         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
   2152 	return(NULL);
   2153     }
   2154     memset(input, 0, sizeof(htmlParserInput));
   2155     input->filename = NULL;
   2156     input->directory = NULL;
   2157     input->base = NULL;
   2158     input->cur = NULL;
   2159     input->buf = NULL;
   2160     input->line = 1;
   2161     input->col = 1;
   2162     input->buf = NULL;
   2163     input->free = NULL;
   2164     input->version = NULL;
   2165     input->consumed = 0;
   2166     input->length = 0;
   2167     return(input);
   2168 }
   2169 
   2170 
   2171 /************************************************************************
   2172  *									*
   2173  *		Commodity functions, cleanup needed ?			*
   2174  *									*
   2175  ************************************************************************/
   2176 /*
   2177  * all tags allowing pc data from the html 4.01 loose dtd
   2178  * NOTE: it might be more apropriate to integrate this information
   2179  * into the html40ElementTable array but I don't want to risk any
   2180  * binary incomptibility
   2181  */
   2182 static const char *allowPCData[] = {
   2183     "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
   2184     "blockquote", "body", "button", "caption", "center", "cite", "code",
   2185     "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
   2186     "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
   2187     "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
   2188     "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
   2189 };
   2190 
   2191 /**
   2192  * areBlanks:
   2193  * @ctxt:  an HTML parser context
   2194  * @str:  a xmlChar *
   2195  * @len:  the size of @str
   2196  *
   2197  * Is this a sequence of blank chars that one can ignore ?
   2198  *
   2199  * Returns 1 if ignorable 0 otherwise.
   2200  */
   2201 
   2202 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
   2203     unsigned int i;
   2204     int j;
   2205     xmlNodePtr lastChild;
   2206     xmlDtdPtr dtd;
   2207 
   2208     for (j = 0;j < len;j++)
   2209         if (!(IS_BLANK_CH(str[j]))) return(0);
   2210 
   2211     if (CUR == 0) return(1);
   2212     if (CUR != '<') return(0);
   2213     if (ctxt->name == NULL)
   2214 	return(1);
   2215     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
   2216 	return(1);
   2217     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
   2218 	return(1);
   2219 
   2220     /* Only strip CDATA children of the body tag for strict HTML DTDs */
   2221     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
   2222         dtd = xmlGetIntSubset(ctxt->myDoc);
   2223         if (dtd != NULL && dtd->ExternalID != NULL) {
   2224             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
   2225                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
   2226                 return(1);
   2227         }
   2228     }
   2229 
   2230     if (ctxt->node == NULL) return(0);
   2231     lastChild = xmlGetLastChild(ctxt->node);
   2232     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
   2233 	lastChild = lastChild->prev;
   2234     if (lastChild == NULL) {
   2235         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
   2236             (ctxt->node->content != NULL)) return(0);
   2237 	/* keep ws in constructs like ...<b> </b>...
   2238 	   for all tags "b" allowing PCDATA */
   2239 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
   2240 	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
   2241 		return(0);
   2242 	    }
   2243 	}
   2244     } else if (xmlNodeIsText(lastChild)) {
   2245         return(0);
   2246     } else {
   2247 	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
   2248 	   for all tags "p" allowing PCDATA */
   2249 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
   2250 	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
   2251 		return(0);
   2252 	    }
   2253 	}
   2254     }
   2255     return(1);
   2256 }
   2257 
   2258 /**
   2259  * htmlNewDocNoDtD:
   2260  * @URI:  URI for the dtd, or NULL
   2261  * @ExternalID:  the external ID of the DTD, or NULL
   2262  *
   2263  * Creates a new HTML document without a DTD node if @URI and @ExternalID
   2264  * are NULL
   2265  *
   2266  * Returns a new document, do not initialize the DTD if not provided
   2267  */
   2268 htmlDocPtr
   2269 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
   2270     xmlDocPtr cur;
   2271 
   2272     /*
   2273      * Allocate a new document and fill the fields.
   2274      */
   2275     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
   2276     if (cur == NULL) {
   2277 	htmlErrMemory(NULL, "HTML document creation failed\n");
   2278 	return(NULL);
   2279     }
   2280     memset(cur, 0, sizeof(xmlDoc));
   2281 
   2282     cur->type = XML_HTML_DOCUMENT_NODE;
   2283     cur->version = NULL;
   2284     cur->intSubset = NULL;
   2285     cur->doc = cur;
   2286     cur->name = NULL;
   2287     cur->children = NULL;
   2288     cur->extSubset = NULL;
   2289     cur->oldNs = NULL;
   2290     cur->encoding = NULL;
   2291     cur->standalone = 1;
   2292     cur->compression = 0;
   2293     cur->ids = NULL;
   2294     cur->refs = NULL;
   2295     cur->_private = NULL;
   2296     cur->charset = XML_CHAR_ENCODING_UTF8;
   2297     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
   2298     if ((ExternalID != NULL) ||
   2299 	(URI != NULL))
   2300 	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
   2301     return(cur);
   2302 }
   2303 
   2304 /**
   2305  * htmlNewDoc:
   2306  * @URI:  URI for the dtd, or NULL
   2307  * @ExternalID:  the external ID of the DTD, or NULL
   2308  *
   2309  * Creates a new HTML document
   2310  *
   2311  * Returns a new document
   2312  */
   2313 htmlDocPtr
   2314 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
   2315     if ((URI == NULL) && (ExternalID == NULL))
   2316 	return(htmlNewDocNoDtD(
   2317 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
   2318 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
   2319 
   2320     return(htmlNewDocNoDtD(URI, ExternalID));
   2321 }
   2322 
   2323 
   2324 /************************************************************************
   2325  *									*
   2326  *			The parser itself				*
   2327  *	Relates to http://www.w3.org/TR/html40				*
   2328  *									*
   2329  ************************************************************************/
   2330 
   2331 /************************************************************************
   2332  *									*
   2333  *			The parser itself				*
   2334  *									*
   2335  ************************************************************************/
   2336 
   2337 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
   2338 
   2339 /**
   2340  * htmlParseHTMLName:
   2341  * @ctxt:  an HTML parser context
   2342  *
   2343  * parse an HTML tag or attribute name, note that we convert it to lowercase
   2344  * since HTML names are not case-sensitive.
   2345  *
   2346  * Returns the Tag Name parsed or NULL
   2347  */
   2348 
   2349 static const xmlChar *
   2350 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
   2351     int i = 0;
   2352     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
   2353 
   2354     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
   2355         (CUR != ':') && (CUR != '.')) return(NULL);
   2356 
   2357     while ((i < HTML_PARSER_BUFFER_SIZE) &&
   2358            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
   2359 	   (CUR == ':') || (CUR == '-') || (CUR == '_') ||
   2360            (CUR == '.'))) {
   2361 	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
   2362         else loc[i] = CUR;
   2363 	i++;
   2364 
   2365 	NEXT;
   2366     }
   2367 
   2368     return(xmlDictLookup(ctxt->dict, loc, i));
   2369 }
   2370 
   2371 
   2372 /**
   2373  * htmlParseHTMLName_nonInvasive:
   2374  * @ctxt:  an HTML parser context
   2375  *
   2376  * parse an HTML tag or attribute name, note that we convert it to lowercase
   2377  * since HTML names are not case-sensitive, this doesn't consume the data
   2378  * from the stream, it's a look-ahead
   2379  *
   2380  * Returns the Tag Name parsed or NULL
   2381  */
   2382 
   2383 static const xmlChar *
   2384 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
   2385     int i = 0;
   2386     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
   2387 
   2388     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
   2389         (NXT(1) != ':')) return(NULL);
   2390 
   2391     while ((i < HTML_PARSER_BUFFER_SIZE) &&
   2392            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
   2393 	   (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
   2394 	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
   2395         else loc[i] = NXT(1+i);
   2396 	i++;
   2397     }
   2398 
   2399     return(xmlDictLookup(ctxt->dict, loc, i));
   2400 }
   2401 
   2402 
   2403 /**
   2404  * htmlParseName:
   2405  * @ctxt:  an HTML parser context
   2406  *
   2407  * parse an HTML name, this routine is case sensitive.
   2408  *
   2409  * Returns the Name parsed or NULL
   2410  */
   2411 
   2412 static const xmlChar *
   2413 htmlParseName(htmlParserCtxtPtr ctxt) {
   2414     const xmlChar *in;
   2415     const xmlChar *ret;
   2416     int count = 0;
   2417 
   2418     GROW;
   2419 
   2420     /*
   2421      * Accelerator for simple ASCII names
   2422      */
   2423     in = ctxt->input->cur;
   2424     if (((*in >= 0x61) && (*in <= 0x7A)) ||
   2425 	((*in >= 0x41) && (*in <= 0x5A)) ||
   2426 	(*in == '_') || (*in == ':')) {
   2427 	in++;
   2428 	while (((*in >= 0x61) && (*in <= 0x7A)) ||
   2429 	       ((*in >= 0x41) && (*in <= 0x5A)) ||
   2430 	       ((*in >= 0x30) && (*in <= 0x39)) ||
   2431 	       (*in == '_') || (*in == '-') ||
   2432 	       (*in == ':') || (*in == '.'))
   2433 	    in++;
   2434 	if ((*in > 0) && (*in < 0x80)) {
   2435 	    count = in - ctxt->input->cur;
   2436 	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
   2437 	    ctxt->input->cur = in;
   2438 	    ctxt->nbChars += count;
   2439 	    ctxt->input->col += count;
   2440 	    return(ret);
   2441 	}
   2442     }
   2443     return(htmlParseNameComplex(ctxt));
   2444 }
   2445 
   2446 static const xmlChar *
   2447 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
   2448     int len = 0, l;
   2449     int c;
   2450     int count = 0;
   2451 
   2452     /*
   2453      * Handler for more complex cases
   2454      */
   2455     GROW;
   2456     c = CUR_CHAR(l);
   2457     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
   2458 	(!IS_LETTER(c) && (c != '_') &&
   2459          (c != ':'))) {
   2460 	return(NULL);
   2461     }
   2462 
   2463     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
   2464 	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
   2465             (c == '.') || (c == '-') ||
   2466 	    (c == '_') || (c == ':') ||
   2467 	    (IS_COMBINING(c)) ||
   2468 	    (IS_EXTENDER(c)))) {
   2469 	if (count++ > 100) {
   2470 	    count = 0;
   2471 	    GROW;
   2472 	}
   2473 	len += l;
   2474 	NEXTL(l);
   2475 	c = CUR_CHAR(l);
   2476     }
   2477     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
   2478 }
   2479 
   2480 
   2481 /**
   2482  * htmlParseHTMLAttribute:
   2483  * @ctxt:  an HTML parser context
   2484  * @stop:  a char stop value
   2485  *
   2486  * parse an HTML attribute value till the stop (quote), if
   2487  * stop is 0 then it stops at the first space
   2488  *
   2489  * Returns the attribute parsed or NULL
   2490  */
   2491 
   2492 static xmlChar *
   2493 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
   2494     xmlChar *buffer = NULL;
   2495     int buffer_size = 0;
   2496     xmlChar *out = NULL;
   2497     const xmlChar *name = NULL;
   2498     const xmlChar *cur = NULL;
   2499     const htmlEntityDesc * ent;
   2500 
   2501     /*
   2502      * allocate a translation buffer.
   2503      */
   2504     buffer_size = HTML_PARSER_BUFFER_SIZE;
   2505     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
   2506     if (buffer == NULL) {
   2507 	htmlErrMemory(ctxt, "buffer allocation failed\n");
   2508 	return(NULL);
   2509     }
   2510     out = buffer;
   2511 
   2512     /*
   2513      * Ok loop until we reach one of the ending chars
   2514      */
   2515     while ((CUR != 0) && (CUR != stop)) {
   2516 	if ((stop == 0) && (CUR == '>')) break;
   2517 	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
   2518         if (CUR == '&') {
   2519 	    if (NXT(1) == '#') {
   2520 		unsigned int c;
   2521 		int bits;
   2522 
   2523 		c = htmlParseCharRef(ctxt);
   2524 		if      (c <    0x80)
   2525 		        { *out++  = c;                bits= -6; }
   2526 		else if (c <   0x800)
   2527 		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   2528 		else if (c < 0x10000)
   2529 		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   2530 		else
   2531 		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   2532 
   2533 		for ( ; bits >= 0; bits-= 6) {
   2534 		    *out++  = ((c >> bits) & 0x3F) | 0x80;
   2535 		}
   2536 
   2537 		if (out - buffer > buffer_size - 100) {
   2538 			int indx = out - buffer;
   2539 
   2540 			growBuffer(buffer);
   2541 			out = &buffer[indx];
   2542 		}
   2543 	    } else {
   2544 		ent = htmlParseEntityRef(ctxt, &name);
   2545 		if (name == NULL) {
   2546 		    *out++ = '&';
   2547 		    if (out - buffer > buffer_size - 100) {
   2548 			int indx = out - buffer;
   2549 
   2550 			growBuffer(buffer);
   2551 			out = &buffer[indx];
   2552 		    }
   2553 		} else if (ent == NULL) {
   2554 		    *out++ = '&';
   2555 		    cur = name;
   2556 		    while (*cur != 0) {
   2557 			if (out - buffer > buffer_size - 100) {
   2558 			    int indx = out - buffer;
   2559 
   2560 			    growBuffer(buffer);
   2561 			    out = &buffer[indx];
   2562 			}
   2563 			*out++ = *cur++;
   2564 		    }
   2565 		} else {
   2566 		    unsigned int c;
   2567 		    int bits;
   2568 
   2569 		    if (out - buffer > buffer_size - 100) {
   2570 			int indx = out - buffer;
   2571 
   2572 			growBuffer(buffer);
   2573 			out = &buffer[indx];
   2574 		    }
   2575 		    c = ent->value;
   2576 		    if      (c <    0x80)
   2577 			{ *out++  = c;                bits= -6; }
   2578 		    else if (c <   0x800)
   2579 			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   2580 		    else if (c < 0x10000)
   2581 			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   2582 		    else
   2583 			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   2584 
   2585 		    for ( ; bits >= 0; bits-= 6) {
   2586 			*out++  = ((c >> bits) & 0x3F) | 0x80;
   2587 		    }
   2588 		}
   2589 	    }
   2590 	} else {
   2591 	    unsigned int c;
   2592 	    int bits, l;
   2593 
   2594 	    if (out - buffer > buffer_size - 100) {
   2595 		int indx = out - buffer;
   2596 
   2597 		growBuffer(buffer);
   2598 		out = &buffer[indx];
   2599 	    }
   2600 	    c = CUR_CHAR(l);
   2601 	    if      (c <    0x80)
   2602 		    { *out++  = c;                bits= -6; }
   2603 	    else if (c <   0x800)
   2604 		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   2605 	    else if (c < 0x10000)
   2606 		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   2607 	    else
   2608 		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   2609 
   2610 	    for ( ; bits >= 0; bits-= 6) {
   2611 		*out++  = ((c >> bits) & 0x3F) | 0x80;
   2612 	    }
   2613 	    NEXT;
   2614 	}
   2615     }
   2616     *out = 0;
   2617     return(buffer);
   2618 }
   2619 
   2620 /**
   2621  * htmlParseEntityRef:
   2622  * @ctxt:  an HTML parser context
   2623  * @str:  location to store the entity name
   2624  *
   2625  * parse an HTML ENTITY references
   2626  *
   2627  * [68] EntityRef ::= '&' Name ';'
   2628  *
   2629  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
   2630  *         if non-NULL *str will have to be freed by the caller.
   2631  */
   2632 const htmlEntityDesc *
   2633 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
   2634     const xmlChar *name;
   2635     const htmlEntityDesc * ent = NULL;
   2636 
   2637     if (str != NULL) *str = NULL;
   2638     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
   2639 
   2640     if (CUR == '&') {
   2641         NEXT;
   2642         name = htmlParseName(ctxt);
   2643 	if (name == NULL) {
   2644 	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   2645 	                 "htmlParseEntityRef: no name\n", NULL, NULL);
   2646 	} else {
   2647 	    GROW;
   2648 	    if (CUR == ';') {
   2649 	        if (str != NULL)
   2650 		    *str = name;
   2651 
   2652 		/*
   2653 		 * Lookup the entity in the table.
   2654 		 */
   2655 		ent = htmlEntityLookup(name);
   2656 		if (ent != NULL) /* OK that's ugly !!! */
   2657 		    NEXT;
   2658 	    } else {
   2659 		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
   2660 		             "htmlParseEntityRef: expecting ';'\n",
   2661 			     NULL, NULL);
   2662 	        if (str != NULL)
   2663 		    *str = name;
   2664 	    }
   2665 	}
   2666     }
   2667     return(ent);
   2668 }
   2669 
   2670 /**
   2671  * htmlParseAttValue:
   2672  * @ctxt:  an HTML parser context
   2673  *
   2674  * parse a value for an attribute
   2675  * Note: the parser won't do substitution of entities here, this
   2676  * will be handled later in xmlStringGetNodeList, unless it was
   2677  * asked for ctxt->replaceEntities != 0
   2678  *
   2679  * Returns the AttValue parsed or NULL.
   2680  */
   2681 
   2682 static xmlChar *
   2683 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
   2684     xmlChar *ret = NULL;
   2685 
   2686     if (CUR == '"') {
   2687         NEXT;
   2688 	ret = htmlParseHTMLAttribute(ctxt, '"');
   2689         if (CUR != '"') {
   2690 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
   2691 	                 "AttValue: \" expected\n", NULL, NULL);
   2692 	} else
   2693 	    NEXT;
   2694     } else if (CUR == '\'') {
   2695         NEXT;
   2696 	ret = htmlParseHTMLAttribute(ctxt, '\'');
   2697         if (CUR != '\'') {
   2698 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
   2699 	                 "AttValue: ' expected\n", NULL, NULL);
   2700 	} else
   2701 	    NEXT;
   2702     } else {
   2703         /*
   2704 	 * That's an HTMLism, the attribute value may not be quoted
   2705 	 */
   2706 	ret = htmlParseHTMLAttribute(ctxt, 0);
   2707 	if (ret == NULL) {
   2708 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
   2709 	                 "AttValue: no value found\n", NULL, NULL);
   2710 	}
   2711     }
   2712     return(ret);
   2713 }
   2714 
   2715 /**
   2716  * htmlParseSystemLiteral:
   2717  * @ctxt:  an HTML parser context
   2718  *
   2719  * parse an HTML Literal
   2720  *
   2721  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
   2722  *
   2723  * Returns the SystemLiteral parsed or NULL
   2724  */
   2725 
   2726 static xmlChar *
   2727 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
   2728     const xmlChar *q;
   2729     xmlChar *ret = NULL;
   2730 
   2731     if (CUR == '"') {
   2732         NEXT;
   2733 	q = CUR_PTR;
   2734 	while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
   2735 	    NEXT;
   2736 	if (!IS_CHAR_CH(CUR)) {
   2737 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
   2738 			 "Unfinished SystemLiteral\n", NULL, NULL);
   2739 	} else {
   2740 	    ret = xmlStrndup(q, CUR_PTR - q);
   2741 	    NEXT;
   2742         }
   2743     } else if (CUR == '\'') {
   2744         NEXT;
   2745 	q = CUR_PTR;
   2746 	while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
   2747 	    NEXT;
   2748 	if (!IS_CHAR_CH(CUR)) {
   2749 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
   2750 			 "Unfinished SystemLiteral\n", NULL, NULL);
   2751 	} else {
   2752 	    ret = xmlStrndup(q, CUR_PTR - q);
   2753 	    NEXT;
   2754         }
   2755     } else {
   2756 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
   2757 	             " or ' expected\n", NULL, NULL);
   2758     }
   2759 
   2760     return(ret);
   2761 }
   2762 
   2763 /**
   2764  * htmlParsePubidLiteral:
   2765  * @ctxt:  an HTML parser context
   2766  *
   2767  * parse an HTML public literal
   2768  *
   2769  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
   2770  *
   2771  * Returns the PubidLiteral parsed or NULL.
   2772  */
   2773 
   2774 static xmlChar *
   2775 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
   2776     const xmlChar *q;
   2777     xmlChar *ret = NULL;
   2778     /*
   2779      * Name ::= (Letter | '_') (NameChar)*
   2780      */
   2781     if (CUR == '"') {
   2782         NEXT;
   2783 	q = CUR_PTR;
   2784 	while (IS_PUBIDCHAR_CH(CUR)) NEXT;
   2785 	if (CUR != '"') {
   2786 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
   2787 	                 "Unfinished PubidLiteral\n", NULL, NULL);
   2788 	} else {
   2789 	    ret = xmlStrndup(q, CUR_PTR - q);
   2790 	    NEXT;
   2791 	}
   2792     } else if (CUR == '\'') {
   2793         NEXT;
   2794 	q = CUR_PTR;
   2795 	while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
   2796 	    NEXT;
   2797 	if (CUR != '\'') {
   2798 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
   2799 	                 "Unfinished PubidLiteral\n", NULL, NULL);
   2800 	} else {
   2801 	    ret = xmlStrndup(q, CUR_PTR - q);
   2802 	    NEXT;
   2803 	}
   2804     } else {
   2805 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
   2806 	             "PubidLiteral \" or ' expected\n", NULL, NULL);
   2807     }
   2808 
   2809     return(ret);
   2810 }
   2811 
   2812 /**
   2813  * htmlParseScript:
   2814  * @ctxt:  an HTML parser context
   2815  *
   2816  * parse the content of an HTML SCRIPT or STYLE element
   2817  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
   2818  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
   2819  * http://www.w3.org/TR/html4/types.html#type-script
   2820  * http://www.w3.org/TR/html4/types.html#h-6.15
   2821  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
   2822  *
   2823  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
   2824  * element and the value of intrinsic event attributes. User agents must
   2825  * not evaluate script data as HTML markup but instead must pass it on as
   2826  * data to a script engine.
   2827  * NOTES:
   2828  * - The content is passed like CDATA
   2829  * - the attributes for style and scripting "onXXX" are also described
   2830  *   as CDATA but SGML allows entities references in attributes so their
   2831  *   processing is identical as other attributes
   2832  */
   2833 static void
   2834 htmlParseScript(htmlParserCtxtPtr ctxt) {
   2835     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
   2836     int nbchar = 0;
   2837     int cur,l;
   2838 
   2839     SHRINK;
   2840     cur = CUR_CHAR(l);
   2841     while (IS_CHAR_CH(cur)) {
   2842 	if ((cur == '<') && (NXT(1) == '/')) {
   2843             /*
   2844              * One should break here, the specification is clear:
   2845              * Authors should therefore escape "</" within the content.
   2846              * Escape mechanisms are specific to each scripting or
   2847              * style sheet language.
   2848              *
   2849              * In recovery mode, only break if end tag match the
   2850              * current tag, effectively ignoring all tags inside the
   2851              * script/style block and treating the entire block as
   2852              * CDATA.
   2853              */
   2854             if (ctxt->recovery) {
   2855                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
   2856 				   xmlStrlen(ctxt->name)) == 0)
   2857                 {
   2858                     break; /* while */
   2859                 } else {
   2860 		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
   2861 				 "Element %s embeds close tag\n",
   2862 		                 ctxt->name, NULL);
   2863 		}
   2864             } else {
   2865                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
   2866                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
   2867                 {
   2868                     break; /* while */
   2869                 }
   2870             }
   2871 	}
   2872 	COPY_BUF(l,buf,nbchar,cur);
   2873 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
   2874 	    if (ctxt->sax->cdataBlock!= NULL) {
   2875 		/*
   2876 		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
   2877 		 */
   2878 		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
   2879 	    } else if (ctxt->sax->characters != NULL) {
   2880 		ctxt->sax->characters(ctxt->userData, buf, nbchar);
   2881 	    }
   2882 	    nbchar = 0;
   2883 	}
   2884 	GROW;
   2885 	NEXTL(l);
   2886 	cur = CUR_CHAR(l);
   2887     }
   2888 
   2889     if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
   2890 	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
   2891 	                "Invalid char in CDATA 0x%X\n", cur);
   2892 	NEXT;
   2893     }
   2894 
   2895     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
   2896 	if (ctxt->sax->cdataBlock!= NULL) {
   2897 	    /*
   2898 	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
   2899 	     */
   2900 	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
   2901 	} else if (ctxt->sax->characters != NULL) {
   2902 	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
   2903 	}
   2904     }
   2905 }
   2906 
   2907 
   2908 /**
   2909  * htmlParseCharData:
   2910  * @ctxt:  an HTML parser context
   2911  *
   2912  * parse a CharData section.
   2913  * if we are within a CDATA section ']]>' marks an end of section.
   2914  *
   2915  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
   2916  */
   2917 
   2918 static void
   2919 htmlParseCharData(htmlParserCtxtPtr ctxt) {
   2920     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
   2921     int nbchar = 0;
   2922     int cur, l;
   2923     int chunk = 0;
   2924 
   2925     SHRINK;
   2926     cur = CUR_CHAR(l);
   2927     while (((cur != '<') || (ctxt->token == '<')) &&
   2928            ((cur != '&') || (ctxt->token == '&')) &&
   2929 	   (cur != 0)) {
   2930 	if (!(IS_CHAR(cur))) {
   2931 	    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
   2932 	                "Invalid char in CDATA 0x%X\n", cur);
   2933 	} else {
   2934 	    COPY_BUF(l,buf,nbchar,cur);
   2935 	}
   2936 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
   2937 	    /*
   2938 	     * Ok the segment is to be consumed as chars.
   2939 	     */
   2940 	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
   2941 		if (areBlanks(ctxt, buf, nbchar)) {
   2942 		    if (ctxt->sax->ignorableWhitespace != NULL)
   2943 			ctxt->sax->ignorableWhitespace(ctxt->userData,
   2944 			                               buf, nbchar);
   2945 		} else {
   2946 		    htmlCheckParagraph(ctxt);
   2947 		    if (ctxt->sax->characters != NULL)
   2948 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
   2949 		}
   2950 	    }
   2951 	    nbchar = 0;
   2952 	}
   2953 	NEXTL(l);
   2954         chunk++;
   2955         if (chunk > HTML_PARSER_BUFFER_SIZE) {
   2956             chunk = 0;
   2957             SHRINK;
   2958             GROW;
   2959         }
   2960 	cur = CUR_CHAR(l);
   2961 	if (cur == 0) {
   2962 	    SHRINK;
   2963 	    GROW;
   2964 	    cur = CUR_CHAR(l);
   2965 	}
   2966     }
   2967     if (nbchar != 0) {
   2968         buf[nbchar] = 0;
   2969 
   2970 	/*
   2971 	 * Ok the segment is to be consumed as chars.
   2972 	 */
   2973 	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
   2974 	    if (areBlanks(ctxt, buf, nbchar)) {
   2975 		if (ctxt->sax->ignorableWhitespace != NULL)
   2976 		    ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
   2977 	    } else {
   2978 		htmlCheckParagraph(ctxt);
   2979 		if (ctxt->sax->characters != NULL)
   2980 		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
   2981 	    }
   2982 	}
   2983     } else {
   2984 	/*
   2985 	 * Loop detection
   2986 	 */
   2987 	if (cur == 0)
   2988 	    ctxt->instate = XML_PARSER_EOF;
   2989     }
   2990 }
   2991 
   2992 /**
   2993  * htmlParseExternalID:
   2994  * @ctxt:  an HTML parser context
   2995  * @publicID:  a xmlChar** receiving PubidLiteral
   2996  *
   2997  * Parse an External ID or a Public ID
   2998  *
   2999  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
   3000  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
   3001  *
   3002  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
   3003  *
   3004  * Returns the function returns SystemLiteral and in the second
   3005  *                case publicID receives PubidLiteral, is strict is off
   3006  *                it is possible to return NULL and have publicID set.
   3007  */
   3008 
   3009 static xmlChar *
   3010 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
   3011     xmlChar *URI = NULL;
   3012 
   3013     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
   3014          (UPP(2) == 'S') && (UPP(3) == 'T') &&
   3015 	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
   3016         SKIP(6);
   3017 	if (!IS_BLANK_CH(CUR)) {
   3018 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
   3019 	                 "Space required after 'SYSTEM'\n", NULL, NULL);
   3020 	}
   3021         SKIP_BLANKS;
   3022 	URI = htmlParseSystemLiteral(ctxt);
   3023 	if (URI == NULL) {
   3024 	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
   3025 	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
   3026         }
   3027     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
   3028 	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
   3029 	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
   3030         SKIP(6);
   3031 	if (!IS_BLANK_CH(CUR)) {
   3032 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
   3033 	                 "Space required after 'PUBLIC'\n", NULL, NULL);
   3034 	}
   3035         SKIP_BLANKS;
   3036 	*publicID = htmlParsePubidLiteral(ctxt);
   3037 	if (*publicID == NULL) {
   3038 	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
   3039 	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
   3040 			 NULL, NULL);
   3041 	}
   3042         SKIP_BLANKS;
   3043         if ((CUR == '"') || (CUR == '\'')) {
   3044 	    URI = htmlParseSystemLiteral(ctxt);
   3045 	}
   3046     }
   3047     return(URI);
   3048 }
   3049 
   3050 /**
   3051  * xmlParsePI:
   3052  * @ctxt:  an XML parser context
   3053  *
   3054  * parse an XML Processing Instruction.
   3055  *
   3056  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
   3057  */
   3058 static void
   3059 htmlParsePI(htmlParserCtxtPtr ctxt) {
   3060     xmlChar *buf = NULL;
   3061     int len = 0;
   3062     int size = HTML_PARSER_BUFFER_SIZE;
   3063     int cur, l;
   3064     const xmlChar *target;
   3065     xmlParserInputState state;
   3066     int count = 0;
   3067 
   3068     if ((RAW == '<') && (NXT(1) == '?')) {
   3069 	state = ctxt->instate;
   3070         ctxt->instate = XML_PARSER_PI;
   3071 	/*
   3072 	 * this is a Processing Instruction.
   3073 	 */
   3074 	SKIP(2);
   3075 	SHRINK;
   3076 
   3077 	/*
   3078 	 * Parse the target name and check for special support like
   3079 	 * namespace.
   3080 	 */
   3081         target = htmlParseName(ctxt);
   3082 	if (target != NULL) {
   3083 	    if (RAW == '>') {
   3084 		SKIP(1);
   3085 
   3086 		/*
   3087 		 * SAX: PI detected.
   3088 		 */
   3089 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
   3090 		    (ctxt->sax->processingInstruction != NULL))
   3091 		    ctxt->sax->processingInstruction(ctxt->userData,
   3092 		                                     target, NULL);
   3093 		ctxt->instate = state;
   3094 		return;
   3095 	    }
   3096 	    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
   3097 	    if (buf == NULL) {
   3098 		htmlErrMemory(ctxt, NULL);
   3099 		ctxt->instate = state;
   3100 		return;
   3101 	    }
   3102 	    cur = CUR;
   3103 	    if (!IS_BLANK(cur)) {
   3104 		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
   3105 			  "ParsePI: PI %s space expected\n", target, NULL);
   3106 	    }
   3107             SKIP_BLANKS;
   3108 	    cur = CUR_CHAR(l);
   3109 	    while (IS_CHAR(cur) && (cur != '>')) {
   3110 		if (len + 5 >= size) {
   3111 		    xmlChar *tmp;
   3112 
   3113 		    size *= 2;
   3114 		    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
   3115 		    if (tmp == NULL) {
   3116 			htmlErrMemory(ctxt, NULL);
   3117 			xmlFree(buf);
   3118 			ctxt->instate = state;
   3119 			return;
   3120 		    }
   3121 		    buf = tmp;
   3122 		}
   3123 		count++;
   3124 		if (count > 50) {
   3125 		    GROW;
   3126 		    count = 0;
   3127 		}
   3128 		COPY_BUF(l,buf,len,cur);
   3129 		NEXTL(l);
   3130 		cur = CUR_CHAR(l);
   3131 		if (cur == 0) {
   3132 		    SHRINK;
   3133 		    GROW;
   3134 		    cur = CUR_CHAR(l);
   3135 		}
   3136 	    }
   3137 	    buf[len] = 0;
   3138 	    if (cur != '>') {
   3139 		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
   3140 		      "ParsePI: PI %s never end ...\n", target, NULL);
   3141 	    } else {
   3142 		SKIP(1);
   3143 
   3144 		/*
   3145 		 * SAX: PI detected.
   3146 		 */
   3147 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
   3148 		    (ctxt->sax->processingInstruction != NULL))
   3149 		    ctxt->sax->processingInstruction(ctxt->userData,
   3150 		                                     target, buf);
   3151 	    }
   3152 	    xmlFree(buf);
   3153 	} else {
   3154 	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
   3155                          "PI is not started correctly", NULL, NULL);
   3156 	}
   3157 	ctxt->instate = state;
   3158     }
   3159 }
   3160 
   3161 /**
   3162  * htmlParseComment:
   3163  * @ctxt:  an HTML parser context
   3164  *
   3165  * Parse an XML (SGML) comment <!-- .... -->
   3166  *
   3167  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
   3168  */
   3169 static void
   3170 htmlParseComment(htmlParserCtxtPtr ctxt) {
   3171     xmlChar *buf = NULL;
   3172     int len;
   3173     int size = HTML_PARSER_BUFFER_SIZE;
   3174     int q, ql;
   3175     int r, rl;
   3176     int cur, l;
   3177     xmlParserInputState state;
   3178 
   3179     /*
   3180      * Check that there is a comment right here.
   3181      */
   3182     if ((RAW != '<') || (NXT(1) != '!') ||
   3183         (NXT(2) != '-') || (NXT(3) != '-')) return;
   3184 
   3185     state = ctxt->instate;
   3186     ctxt->instate = XML_PARSER_COMMENT;
   3187     SHRINK;
   3188     SKIP(4);
   3189     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
   3190     if (buf == NULL) {
   3191         htmlErrMemory(ctxt, "buffer allocation failed\n");
   3192 	ctxt->instate = state;
   3193 	return;
   3194     }
   3195     q = CUR_CHAR(ql);
   3196     NEXTL(ql);
   3197     r = CUR_CHAR(rl);
   3198     NEXTL(rl);
   3199     cur = CUR_CHAR(l);
   3200     len = 0;
   3201     while (IS_CHAR(cur) &&
   3202            ((cur != '>') ||
   3203 	    (r != '-') || (q != '-'))) {
   3204 	if (len + 5 >= size) {
   3205 	    xmlChar *tmp;
   3206 
   3207 	    size *= 2;
   3208 	    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
   3209 	    if (tmp == NULL) {
   3210 	        xmlFree(buf);
   3211 	        htmlErrMemory(ctxt, "growing buffer failed\n");
   3212 		ctxt->instate = state;
   3213 		return;
   3214 	    }
   3215 	    buf = tmp;
   3216 	}
   3217 	COPY_BUF(ql,buf,len,q);
   3218 	q = r;
   3219 	ql = rl;
   3220 	r = cur;
   3221 	rl = l;
   3222 	NEXTL(l);
   3223 	cur = CUR_CHAR(l);
   3224 	if (cur == 0) {
   3225 	    SHRINK;
   3226 	    GROW;
   3227 	    cur = CUR_CHAR(l);
   3228 	}
   3229     }
   3230     buf[len] = 0;
   3231     if (!IS_CHAR(cur)) {
   3232 	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
   3233 	             "Comment not terminated \n<!--%.50s\n", buf, NULL);
   3234 	xmlFree(buf);
   3235     } else {
   3236         NEXT;
   3237 	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
   3238 	    (!ctxt->disableSAX))
   3239 	    ctxt->sax->comment(ctxt->userData, buf);
   3240 	xmlFree(buf);
   3241     }
   3242     ctxt->instate = state;
   3243 }
   3244 
   3245 /**
   3246  * htmlParseCharRef:
   3247  * @ctxt:  an HTML parser context
   3248  *
   3249  * parse Reference declarations
   3250  *
   3251  * [66] CharRef ::= '&#' [0-9]+ ';' |
   3252  *                  '&#x' [0-9a-fA-F]+ ';'
   3253  *
   3254  * Returns the value parsed (as an int)
   3255  */
   3256 int
   3257 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
   3258     int val = 0;
   3259 
   3260     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   3261 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   3262 		     "htmlParseCharRef: context error\n",
   3263 		     NULL, NULL);
   3264         return(0);
   3265     }
   3266     if ((CUR == '&') && (NXT(1) == '#') &&
   3267         ((NXT(2) == 'x') || NXT(2) == 'X')) {
   3268 	SKIP(3);
   3269 	while (CUR != ';') {
   3270 	    if ((CUR >= '0') && (CUR <= '9'))
   3271 	        val = val * 16 + (CUR - '0');
   3272 	    else if ((CUR >= 'a') && (CUR <= 'f'))
   3273 	        val = val * 16 + (CUR - 'a') + 10;
   3274 	    else if ((CUR >= 'A') && (CUR <= 'F'))
   3275 	        val = val * 16 + (CUR - 'A') + 10;
   3276 	    else {
   3277 	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
   3278 		             "htmlParseCharRef: missing semicolumn\n",
   3279 			     NULL, NULL);
   3280 		break;
   3281 	    }
   3282 	    NEXT;
   3283 	}
   3284 	if (CUR == ';')
   3285 	    NEXT;
   3286     } else if  ((CUR == '&') && (NXT(1) == '#')) {
   3287 	SKIP(2);
   3288 	while (CUR != ';') {
   3289 	    if ((CUR >= '0') && (CUR <= '9'))
   3290 	        val = val * 10 + (CUR - '0');
   3291 	    else {
   3292 	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
   3293 		             "htmlParseCharRef: missing semicolumn\n",
   3294 			     NULL, NULL);
   3295 		break;
   3296 	    }
   3297 	    NEXT;
   3298 	}
   3299 	if (CUR == ';')
   3300 	    NEXT;
   3301     } else {
   3302 	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
   3303 	             "htmlParseCharRef: invalid value\n", NULL, NULL);
   3304     }
   3305     /*
   3306      * Check the value IS_CHAR ...
   3307      */
   3308     if (IS_CHAR(val)) {
   3309         return(val);
   3310     } else {
   3311 	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
   3312 			"htmlParseCharRef: invalid xmlChar value %d\n",
   3313 			val);
   3314     }
   3315     return(0);
   3316 }
   3317 
   3318 
   3319 /**
   3320  * htmlParseDocTypeDecl:
   3321  * @ctxt:  an HTML parser context
   3322  *
   3323  * parse a DOCTYPE declaration
   3324  *
   3325  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
   3326  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
   3327  */
   3328 
   3329 static void
   3330 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
   3331     const xmlChar *name;
   3332     xmlChar *ExternalID = NULL;
   3333     xmlChar *URI = NULL;
   3334 
   3335     /*
   3336      * We know that '<!DOCTYPE' has been detected.
   3337      */
   3338     SKIP(9);
   3339 
   3340     SKIP_BLANKS;
   3341 
   3342     /*
   3343      * Parse the DOCTYPE name.
   3344      */
   3345     name = htmlParseName(ctxt);
   3346     if (name == NULL) {
   3347 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   3348 	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
   3349 		     NULL, NULL);
   3350     }
   3351     /*
   3352      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
   3353      */
   3354 
   3355     SKIP_BLANKS;
   3356 
   3357     /*
   3358      * Check for SystemID and ExternalID
   3359      */
   3360     URI = htmlParseExternalID(ctxt, &ExternalID);
   3361     SKIP_BLANKS;
   3362 
   3363     /*
   3364      * We should be at the end of the DOCTYPE declaration.
   3365      */
   3366     if (CUR != '>') {
   3367 	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
   3368 	             "DOCTYPE improperly terminated\n", NULL, NULL);
   3369         /* We shouldn't try to resynchronize ... */
   3370     }
   3371     NEXT;
   3372 
   3373     /*
   3374      * Create or update the document accordingly to the DOCTYPE
   3375      */
   3376     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
   3377 	(!ctxt->disableSAX))
   3378 	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
   3379 
   3380     /*
   3381      * Cleanup, since we don't use all those identifiers
   3382      */
   3383     if (URI != NULL) xmlFree(URI);
   3384     if (ExternalID != NULL) xmlFree(ExternalID);
   3385 }
   3386 
   3387 /**
   3388  * htmlParseAttribute:
   3389  * @ctxt:  an HTML parser context
   3390  * @value:  a xmlChar ** used to store the value of the attribute
   3391  *
   3392  * parse an attribute
   3393  *
   3394  * [41] Attribute ::= Name Eq AttValue
   3395  *
   3396  * [25] Eq ::= S? '=' S?
   3397  *
   3398  * With namespace:
   3399  *
   3400  * [NS 11] Attribute ::= QName Eq AttValue
   3401  *
   3402  * Also the case QName == xmlns:??? is handled independently as a namespace
   3403  * definition.
   3404  *
   3405  * Returns the attribute name, and the value in *value.
   3406  */
   3407 
   3408 static const xmlChar *
   3409 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
   3410     const xmlChar *name;
   3411     xmlChar *val = NULL;
   3412 
   3413     *value = NULL;
   3414     name = htmlParseHTMLName(ctxt);
   3415     if (name == NULL) {
   3416 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   3417 	             "error parsing attribute name\n", NULL, NULL);
   3418         return(NULL);
   3419     }
   3420 
   3421     /*
   3422      * read the value
   3423      */
   3424     SKIP_BLANKS;
   3425     if (CUR == '=') {
   3426         NEXT;
   3427 	SKIP_BLANKS;
   3428 	val = htmlParseAttValue(ctxt);
   3429     }
   3430 
   3431     *value = val;
   3432     return(name);
   3433 }
   3434 
   3435 /**
   3436  * htmlCheckEncoding:
   3437  * @ctxt:  an HTML parser context
   3438  * @attvalue: the attribute value
   3439  *
   3440  * Checks an http-equiv attribute from a Meta tag to detect
   3441  * the encoding
   3442  * If a new encoding is detected the parser is switched to decode
   3443  * it and pass UTF8
   3444  */
   3445 static void
   3446 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
   3447     const xmlChar *encoding;
   3448 
   3449     if ((ctxt == NULL) || (attvalue == NULL))
   3450 	return;
   3451 
   3452     /* do not change encoding */
   3453     if (ctxt->input->encoding != NULL)
   3454         return;
   3455 
   3456     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
   3457     if (encoding != NULL) {
   3458 	encoding += 8;
   3459     } else {
   3460 	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
   3461 	if (encoding != NULL)
   3462 	    encoding += 9;
   3463     }
   3464     if (encoding != NULL) {
   3465 	xmlCharEncoding enc;
   3466 	xmlCharEncodingHandlerPtr handler;
   3467 
   3468 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
   3469 
   3470 	if (ctxt->input->encoding != NULL)
   3471 	    xmlFree((xmlChar *) ctxt->input->encoding);
   3472 	ctxt->input->encoding = xmlStrdup(encoding);
   3473 
   3474 	enc = xmlParseCharEncoding((const char *) encoding);
   3475 	/*
   3476 	 * registered set of known encodings
   3477 	 */
   3478 	if (enc != XML_CHAR_ENCODING_ERROR) {
   3479 	    if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
   3480 	         (enc == XML_CHAR_ENCODING_UTF16BE) ||
   3481 		 (enc == XML_CHAR_ENCODING_UCS4LE) ||
   3482 		 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
   3483 		(ctxt->input->buf != NULL) &&
   3484 		(ctxt->input->buf->encoder == NULL)) {
   3485 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
   3486 		             "htmlCheckEncoding: wrong encoding meta\n",
   3487 			     NULL, NULL);
   3488 	    } else {
   3489 		xmlSwitchEncoding(ctxt, enc);
   3490 	    }
   3491 	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
   3492 	} else {
   3493 	    /*
   3494 	     * fallback for unknown encodings
   3495 	     */
   3496 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
   3497 	    if (handler != NULL) {
   3498 		xmlSwitchToEncoding(ctxt, handler);
   3499 		ctxt->charset = XML_CHAR_ENCODING_UTF8;
   3500 	    } else {
   3501 		ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
   3502 	    }
   3503 	}
   3504 
   3505 	if ((ctxt->input->buf != NULL) &&
   3506 	    (ctxt->input->buf->encoder != NULL) &&
   3507 	    (ctxt->input->buf->raw != NULL) &&
   3508 	    (ctxt->input->buf->buffer != NULL)) {
   3509 	    int nbchars;
   3510 	    int processed;
   3511 
   3512 	    /*
   3513 	     * convert as much as possible to the parser reading buffer.
   3514 	     */
   3515 	    processed = ctxt->input->cur - ctxt->input->base;
   3516 	    xmlBufferShrink(ctxt->input->buf->buffer, processed);
   3517 	    nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
   3518 		                       ctxt->input->buf->buffer,
   3519 				       ctxt->input->buf->raw);
   3520 	    if (nbchars < 0) {
   3521 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
   3522 		             "htmlCheckEncoding: encoder error\n",
   3523 			     NULL, NULL);
   3524 	    }
   3525 	    ctxt->input->base =
   3526 	    ctxt->input->cur = ctxt->input->buf->buffer->content;
   3527             ctxt->input->end =
   3528                           &ctxt->input->base[ctxt->input->buf->buffer->use];
   3529 	}
   3530     }
   3531 }
   3532 
   3533 /**
   3534  * htmlCheckMeta:
   3535  * @ctxt:  an HTML parser context
   3536  * @atts:  the attributes values
   3537  *
   3538  * Checks an attributes from a Meta tag
   3539  */
   3540 static void
   3541 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
   3542     int i;
   3543     const xmlChar *att, *value;
   3544     int http = 0;
   3545     const xmlChar *content = NULL;
   3546 
   3547     if ((ctxt == NULL) || (atts == NULL))
   3548 	return;
   3549 
   3550     i = 0;
   3551     att = atts[i++];
   3552     while (att != NULL) {
   3553 	value = atts[i++];
   3554 	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
   3555 	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
   3556 	    http = 1;
   3557 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
   3558 	    content = value;
   3559 	att = atts[i++];
   3560     }
   3561     if ((http) && (content != NULL))
   3562 	htmlCheckEncoding(ctxt, content);
   3563 
   3564 }
   3565 
   3566 /**
   3567  * htmlParseStartTag:
   3568  * @ctxt:  an HTML parser context
   3569  *
   3570  * parse a start of tag either for rule element or
   3571  * EmptyElement. In both case we don't parse the tag closing chars.
   3572  *
   3573  * [40] STag ::= '<' Name (S Attribute)* S? '>'
   3574  *
   3575  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
   3576  *
   3577  * With namespace:
   3578  *
   3579  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
   3580  *
   3581  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
   3582  *
   3583  * Returns 0 in case of success, -1 in case of error and 1 if discarded
   3584  */
   3585 
   3586 static int
   3587 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
   3588     const xmlChar *name;
   3589     const xmlChar *attname;
   3590     xmlChar *attvalue;
   3591     const xmlChar **atts;
   3592     int nbatts = 0;
   3593     int maxatts;
   3594     int meta = 0;
   3595     int i;
   3596     int discardtag = 0;
   3597 
   3598     if (ctxt->instate == XML_PARSER_EOF)
   3599         return(-1);
   3600     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   3601 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   3602 		     "htmlParseStartTag: context error\n", NULL, NULL);
   3603 	return -1;
   3604     }
   3605     if (CUR != '<') return -1;
   3606     NEXT;
   3607 
   3608     atts = ctxt->atts;
   3609     maxatts = ctxt->maxatts;
   3610 
   3611     GROW;
   3612     name = htmlParseHTMLName(ctxt);
   3613     if (name == NULL) {
   3614 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   3615 	             "htmlParseStartTag: invalid element name\n",
   3616 		     NULL, NULL);
   3617 	/* Dump the bogus tag like browsers do */
   3618 	while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
   3619                (ctxt->instate != XML_PARSER_EOF))
   3620 	    NEXT;
   3621         return -1;
   3622     }
   3623     if (xmlStrEqual(name, BAD_CAST"meta"))
   3624 	meta = 1;
   3625 
   3626     /*
   3627      * Check for auto-closure of HTML elements.
   3628      */
   3629     htmlAutoClose(ctxt, name);
   3630 
   3631     /*
   3632      * Check for implied HTML elements.
   3633      */
   3634     htmlCheckImplied(ctxt, name);
   3635 
   3636     /*
   3637      * Avoid html at any level > 0, head at any level != 1
   3638      * or any attempt to recurse body
   3639      */
   3640     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
   3641 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   3642 	             "htmlParseStartTag: misplaced <html> tag\n",
   3643 		     name, NULL);
   3644 	discardtag = 1;
   3645 	ctxt->depth++;
   3646     }
   3647     if ((ctxt->nameNr != 1) &&
   3648 	(xmlStrEqual(name, BAD_CAST"head"))) {
   3649 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   3650 	             "htmlParseStartTag: misplaced <head> tag\n",
   3651 		     name, NULL);
   3652 	discardtag = 1;
   3653 	ctxt->depth++;
   3654     }
   3655     if (xmlStrEqual(name, BAD_CAST"body")) {
   3656 	int indx;
   3657 	for (indx = 0;indx < ctxt->nameNr;indx++) {
   3658 	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
   3659 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   3660 		             "htmlParseStartTag: misplaced <body> tag\n",
   3661 			     name, NULL);
   3662 		discardtag = 1;
   3663 		ctxt->depth++;
   3664 	    }
   3665 	}
   3666     }
   3667 
   3668     /*
   3669      * Now parse the attributes, it ends up with the ending
   3670      *
   3671      * (S Attribute)* S?
   3672      */
   3673     SKIP_BLANKS;
   3674     while ((IS_CHAR_CH(CUR)) &&
   3675            (CUR != '>') &&
   3676 	   ((CUR != '/') || (NXT(1) != '>'))) {
   3677 	long cons = ctxt->nbChars;
   3678 
   3679 	GROW;
   3680 	attname = htmlParseAttribute(ctxt, &attvalue);
   3681         if (attname != NULL) {
   3682 
   3683 	    /*
   3684 	     * Well formedness requires at most one declaration of an attribute
   3685 	     */
   3686 	    for (i = 0; i < nbatts;i += 2) {
   3687 	        if (xmlStrEqual(atts[i], attname)) {
   3688 		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
   3689 		                 "Attribute %s redefined\n", attname, NULL);
   3690 		    if (attvalue != NULL)
   3691 			xmlFree(attvalue);
   3692 		    goto failed;
   3693 		}
   3694 	    }
   3695 
   3696 	    /*
   3697 	     * Add the pair to atts
   3698 	     */
   3699 	    if (atts == NULL) {
   3700 	        maxatts = 22; /* allow for 10 attrs by default */
   3701 	        atts = (const xmlChar **)
   3702 		       xmlMalloc(maxatts * sizeof(xmlChar *));
   3703 		if (atts == NULL) {
   3704 		    htmlErrMemory(ctxt, NULL);
   3705 		    if (attvalue != NULL)
   3706 			xmlFree(attvalue);
   3707 		    goto failed;
   3708 		}
   3709 		ctxt->atts = atts;
   3710 		ctxt->maxatts = maxatts;
   3711 	    } else if (nbatts + 4 > maxatts) {
   3712 	        const xmlChar **n;
   3713 
   3714 	        maxatts *= 2;
   3715 	        n = (const xmlChar **) xmlRealloc((void *) atts,
   3716 					     maxatts * sizeof(const xmlChar *));
   3717 		if (n == NULL) {
   3718 		    htmlErrMemory(ctxt, NULL);
   3719 		    if (attvalue != NULL)
   3720 			xmlFree(attvalue);
   3721 		    goto failed;
   3722 		}
   3723 		atts = n;
   3724 		ctxt->atts = atts;
   3725 		ctxt->maxatts = maxatts;
   3726 	    }
   3727 	    atts[nbatts++] = attname;
   3728 	    atts[nbatts++] = attvalue;
   3729 	    atts[nbatts] = NULL;
   3730 	    atts[nbatts + 1] = NULL;
   3731 	}
   3732 	else {
   3733 	    if (attvalue != NULL)
   3734 	        xmlFree(attvalue);
   3735 	    /* Dump the bogus attribute string up to the next blank or
   3736 	     * the end of the tag. */
   3737 	    while ((IS_CHAR_CH(CUR)) &&
   3738 	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
   3739 		   ((CUR != '/') || (NXT(1) != '>')))
   3740 		NEXT;
   3741 	}
   3742 
   3743 failed:
   3744 	SKIP_BLANKS;
   3745         if (cons == ctxt->nbChars) {
   3746 	    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   3747 	                 "htmlParseStartTag: problem parsing attributes\n",
   3748 			 NULL, NULL);
   3749 	    break;
   3750 	}
   3751     }
   3752 
   3753     /*
   3754      * Handle specific association to the META tag
   3755      */
   3756     if (meta && (nbatts != 0))
   3757 	htmlCheckMeta(ctxt, atts);
   3758 
   3759     /*
   3760      * SAX: Start of Element !
   3761      */
   3762     if (!discardtag) {
   3763 	htmlnamePush(ctxt, name);
   3764 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
   3765 	    if (nbatts != 0)
   3766 		ctxt->sax->startElement(ctxt->userData, name, atts);
   3767 	    else
   3768 		ctxt->sax->startElement(ctxt->userData, name, NULL);
   3769 	}
   3770     }
   3771 
   3772     if (atts != NULL) {
   3773         for (i = 1;i < nbatts;i += 2) {
   3774 	    if (atts[i] != NULL)
   3775 		xmlFree((xmlChar *) atts[i]);
   3776 	}
   3777     }
   3778 
   3779     return(discardtag);
   3780 }
   3781 
   3782 /**
   3783  * htmlParseEndTag:
   3784  * @ctxt:  an HTML parser context
   3785  *
   3786  * parse an end of tag
   3787  *
   3788  * [42] ETag ::= '</' Name S? '>'
   3789  *
   3790  * With namespace
   3791  *
   3792  * [NS 9] ETag ::= '</' QName S? '>'
   3793  *
   3794  * Returns 1 if the current level should be closed.
   3795  */
   3796 
   3797 static int
   3798 htmlParseEndTag(htmlParserCtxtPtr ctxt)
   3799 {
   3800     const xmlChar *name;
   3801     const xmlChar *oldname;
   3802     int i, ret;
   3803 
   3804     if ((CUR != '<') || (NXT(1) != '/')) {
   3805         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
   3806 	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
   3807         return (0);
   3808     }
   3809     SKIP(2);
   3810 
   3811     name = htmlParseHTMLName(ctxt);
   3812     if (name == NULL)
   3813         return (0);
   3814     /*
   3815      * We should definitely be at the ending "S? '>'" part
   3816      */
   3817     SKIP_BLANKS;
   3818     if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
   3819         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
   3820 	             "End tag : expected '>'\n", NULL, NULL);
   3821 	if (ctxt->recovery) {
   3822 	    /*
   3823 	     * We're not at the ending > !!
   3824 	     * Error, unless in recover mode where we search forwards
   3825 	     * until we find a >
   3826 	     */
   3827 	    while (CUR != '\0' && CUR != '>') NEXT;
   3828 	    NEXT;
   3829 	}
   3830     } else
   3831         NEXT;
   3832 
   3833     /*
   3834      * if we ignored misplaced tags in htmlParseStartTag don't pop them
   3835      * out now.
   3836      */
   3837     if ((ctxt->depth > 0) &&
   3838         (xmlStrEqual(name, BAD_CAST "html") ||
   3839          xmlStrEqual(name, BAD_CAST "body") ||
   3840 	 xmlStrEqual(name, BAD_CAST "head"))) {
   3841 	ctxt->depth--;
   3842 	return (0);
   3843     }
   3844 
   3845     /*
   3846      * If the name read is not one of the element in the parsing stack
   3847      * then return, it's just an error.
   3848      */
   3849     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
   3850         if (xmlStrEqual(name, ctxt->nameTab[i]))
   3851             break;
   3852     }
   3853     if (i < 0) {
   3854         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
   3855 	             "Unexpected end tag : %s\n", name, NULL);
   3856         return (0);
   3857     }
   3858 
   3859 
   3860     /*
   3861      * Check for auto-closure of HTML elements.
   3862      */
   3863 
   3864     htmlAutoCloseOnClose(ctxt, name);
   3865 
   3866     /*
   3867      * Well formedness constraints, opening and closing must match.
   3868      * With the exception that the autoclose may have popped stuff out
   3869      * of the stack.
   3870      */
   3871     if (!xmlStrEqual(name, ctxt->name)) {
   3872         if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
   3873             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
   3874 	                 "Opening and ending tag mismatch: %s and %s\n",
   3875 			 name, ctxt->name);
   3876         }
   3877     }
   3878 
   3879     /*
   3880      * SAX: End of Tag
   3881      */
   3882     oldname = ctxt->name;
   3883     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
   3884         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   3885             ctxt->sax->endElement(ctxt->userData, name);
   3886         htmlnamePop(ctxt);
   3887         ret = 1;
   3888     } else {
   3889         ret = 0;
   3890     }
   3891 
   3892     return (ret);
   3893 }
   3894 
   3895 
   3896 /**
   3897  * htmlParseReference:
   3898  * @ctxt:  an HTML parser context
   3899  *
   3900  * parse and handle entity references in content,
   3901  * this will end-up in a call to character() since this is either a
   3902  * CharRef, or a predefined entity.
   3903  */
   3904 static void
   3905 htmlParseReference(htmlParserCtxtPtr ctxt) {
   3906     const htmlEntityDesc * ent;
   3907     xmlChar out[6];
   3908     const xmlChar *name;
   3909     if (CUR != '&') return;
   3910 
   3911     if (NXT(1) == '#') {
   3912 	unsigned int c;
   3913 	int bits, i = 0;
   3914 
   3915 	c = htmlParseCharRef(ctxt);
   3916 	if (c == 0)
   3917 	    return;
   3918 
   3919         if      (c <    0x80) { out[i++]= c;                bits= -6; }
   3920         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   3921         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   3922         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   3923 
   3924         for ( ; bits >= 0; bits-= 6) {
   3925             out[i++]= ((c >> bits) & 0x3F) | 0x80;
   3926         }
   3927 	out[i] = 0;
   3928 
   3929 	htmlCheckParagraph(ctxt);
   3930 	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
   3931 	    ctxt->sax->characters(ctxt->userData, out, i);
   3932     } else {
   3933 	ent = htmlParseEntityRef(ctxt, &name);
   3934 	if (name == NULL) {
   3935 	    htmlCheckParagraph(ctxt);
   3936 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
   3937 	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
   3938 	    return;
   3939 	}
   3940 	if ((ent == NULL) || !(ent->value > 0)) {
   3941 	    htmlCheckParagraph(ctxt);
   3942 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
   3943 		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
   3944 		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
   3945 		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
   3946 	    }
   3947 	} else {
   3948 	    unsigned int c;
   3949 	    int bits, i = 0;
   3950 
   3951 	    c = ent->value;
   3952 	    if      (c <    0x80)
   3953 	            { out[i++]= c;                bits= -6; }
   3954 	    else if (c <   0x800)
   3955 	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   3956 	    else if (c < 0x10000)
   3957 	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   3958 	    else
   3959 	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   3960 
   3961 	    for ( ; bits >= 0; bits-= 6) {
   3962 		out[i++]= ((c >> bits) & 0x3F) | 0x80;
   3963 	    }
   3964 	    out[i] = 0;
   3965 
   3966 	    htmlCheckParagraph(ctxt);
   3967 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
   3968 		ctxt->sax->characters(ctxt->userData, out, i);
   3969 	}
   3970     }
   3971 }
   3972 
   3973 /**
   3974  * htmlParseContent:
   3975  * @ctxt:  an HTML parser context
   3976  *
   3977  * Parse a content: comment, sub-element, reference or text.
   3978  * Kept for compatibility with old code
   3979  */
   3980 
   3981 static void
   3982 htmlParseContent(htmlParserCtxtPtr ctxt) {
   3983     xmlChar *currentNode;
   3984     int depth;
   3985     const xmlChar *name;
   3986 
   3987     currentNode = xmlStrdup(ctxt->name);
   3988     depth = ctxt->nameNr;
   3989     while (1) {
   3990 	long cons = ctxt->nbChars;
   3991 
   3992         GROW;
   3993 
   3994         if (ctxt->instate == XML_PARSER_EOF)
   3995             break;
   3996 
   3997 	/*
   3998 	 * Our tag or one of it's parent or children is ending.
   3999 	 */
   4000         if ((CUR == '<') && (NXT(1) == '/')) {
   4001 	    if (htmlParseEndTag(ctxt) &&
   4002 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
   4003 		if (currentNode != NULL)
   4004 		    xmlFree(currentNode);
   4005 		return;
   4006 	    }
   4007 	    continue; /* while */
   4008         }
   4009 
   4010 	else if ((CUR == '<') &&
   4011 	         ((IS_ASCII_LETTER(NXT(1))) ||
   4012 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
   4013 	    name = htmlParseHTMLName_nonInvasive(ctxt);
   4014 	    if (name == NULL) {
   4015 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   4016 			 "htmlParseStartTag: invalid element name\n",
   4017 			 NULL, NULL);
   4018 	        /* Dump the bogus tag like browsers do */
   4019         while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
   4020 	            NEXT;
   4021 
   4022 	        if (currentNode != NULL)
   4023 	            xmlFree(currentNode);
   4024 	        return;
   4025 	    }
   4026 
   4027 	    if (ctxt->name != NULL) {
   4028 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
   4029 	            htmlAutoClose(ctxt, name);
   4030 	            continue;
   4031 	        }
   4032 	    }
   4033 	}
   4034 
   4035 	/*
   4036 	 * Has this node been popped out during parsing of
   4037 	 * the next element
   4038 	 */
   4039         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
   4040 	    (!xmlStrEqual(currentNode, ctxt->name)))
   4041 	     {
   4042 	    if (currentNode != NULL) xmlFree(currentNode);
   4043 	    return;
   4044 	}
   4045 
   4046 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
   4047 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
   4048 	    /*
   4049 	     * Handle SCRIPT/STYLE separately
   4050 	     */
   4051 	    htmlParseScript(ctxt);
   4052 	} else {
   4053 	    /*
   4054 	     * Sometimes DOCTYPE arrives in the middle of the document
   4055 	     */
   4056 	    if ((CUR == '<') && (NXT(1) == '!') &&
   4057 		(UPP(2) == 'D') && (UPP(3) == 'O') &&
   4058 		(UPP(4) == 'C') && (UPP(5) == 'T') &&
   4059 		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
   4060 		(UPP(8) == 'E')) {
   4061 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   4062 		             "Misplaced DOCTYPE declaration\n",
   4063 			     BAD_CAST "DOCTYPE" , NULL);
   4064 		htmlParseDocTypeDecl(ctxt);
   4065 	    }
   4066 
   4067 	    /*
   4068 	     * First case :  a comment
   4069 	     */
   4070 	    if ((CUR == '<') && (NXT(1) == '!') &&
   4071 		(NXT(2) == '-') && (NXT(3) == '-')) {
   4072 		htmlParseComment(ctxt);
   4073 	    }
   4074 
   4075 	    /*
   4076 	     * Second case : a Processing Instruction.
   4077 	     */
   4078 	    else if ((CUR == '<') && (NXT(1) == '?')) {
   4079 		htmlParsePI(ctxt);
   4080 	    }
   4081 
   4082 	    /*
   4083 	     * Third case :  a sub-element.
   4084 	     */
   4085 	    else if (CUR == '<') {
   4086 		htmlParseElement(ctxt);
   4087 	    }
   4088 
   4089 	    /*
   4090 	     * Fourth case : a reference. If if has not been resolved,
   4091 	     *    parsing returns it's Name, create the node
   4092 	     */
   4093 	    else if (CUR == '&') {
   4094 		htmlParseReference(ctxt);
   4095 	    }
   4096 
   4097 	    /*
   4098 	     * Fifth case : end of the resource
   4099 	     */
   4100 	    else if (CUR == 0) {
   4101 		htmlAutoCloseOnEnd(ctxt);
   4102 		break;
   4103 	    }
   4104 
   4105 	    /*
   4106 	     * Last case, text. Note that References are handled directly.
   4107 	     */
   4108 	    else {
   4109 		htmlParseCharData(ctxt);
   4110 	    }
   4111 
   4112 	    if (cons == ctxt->nbChars) {
   4113 		if (ctxt->node != NULL) {
   4114 		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   4115 		                 "detected an error in element content\n",
   4116 				 NULL, NULL);
   4117 		}
   4118 		break;
   4119 	    }
   4120 	}
   4121         GROW;
   4122     }
   4123     if (currentNode != NULL) xmlFree(currentNode);
   4124 }
   4125 
   4126 /**
   4127  * htmlParseElement:
   4128  * @ctxt:  an HTML parser context
   4129  *
   4130  * parse an HTML element, this is highly recursive
   4131  * this is kept for compatibility with previous code versions
   4132  *
   4133  * [39] element ::= EmptyElemTag | STag content ETag
   4134  *
   4135  * [41] Attribute ::= Name Eq AttValue
   4136  */
   4137 
   4138 void
   4139 htmlParseElement(htmlParserCtxtPtr ctxt) {
   4140     const xmlChar *name;
   4141     xmlChar *currentNode = NULL;
   4142     const htmlElemDesc * info;
   4143     htmlParserNodeInfo node_info;
   4144     int failed;
   4145     int depth;
   4146     const xmlChar *oldptr;
   4147 
   4148     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   4149 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   4150 		     "htmlParseElement: context error\n", NULL, NULL);
   4151 	return;
   4152     }
   4153 
   4154     if (ctxt->instate == XML_PARSER_EOF)
   4155         return;
   4156 
   4157     /* Capture start position */
   4158     if (ctxt->record_info) {
   4159         node_info.begin_pos = ctxt->input->consumed +
   4160                           (CUR_PTR - ctxt->input->base);
   4161 	node_info.begin_line = ctxt->input->line;
   4162     }
   4163 
   4164     failed = htmlParseStartTag(ctxt);
   4165     name = ctxt->name;
   4166     if ((failed == -1) || (name == NULL)) {
   4167 	if (CUR == '>')
   4168 	    NEXT;
   4169         return;
   4170     }
   4171 
   4172     /*
   4173      * Lookup the info for that element.
   4174      */
   4175     info = htmlTagLookup(name);
   4176     if (info == NULL) {
   4177 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
   4178 	             "Tag %s invalid\n", name, NULL);
   4179     }
   4180 
   4181     /*
   4182      * Check for an Empty Element labeled the XML/SGML way
   4183      */
   4184     if ((CUR == '/') && (NXT(1) == '>')) {
   4185         SKIP(2);
   4186 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   4187 	    ctxt->sax->endElement(ctxt->userData, name);
   4188 	htmlnamePop(ctxt);
   4189 	return;
   4190     }
   4191 
   4192     if (CUR == '>') {
   4193         NEXT;
   4194     } else {
   4195 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
   4196 	             "Couldn't find end of Start Tag %s\n", name, NULL);
   4197 
   4198 	/*
   4199 	 * end of parsing of this node.
   4200 	 */
   4201 	if (xmlStrEqual(name, ctxt->name)) {
   4202 	    nodePop(ctxt);
   4203 	    htmlnamePop(ctxt);
   4204 	}
   4205 
   4206 	/*
   4207 	 * Capture end position and add node
   4208 	 */
   4209 	if (ctxt->record_info) {
   4210 	   node_info.end_pos = ctxt->input->consumed +
   4211 			      (CUR_PTR - ctxt->input->base);
   4212 	   node_info.end_line = ctxt->input->line;
   4213 	   node_info.node = ctxt->node;
   4214 	   xmlParserAddNodeInfo(ctxt, &node_info);
   4215 	}
   4216 	return;
   4217     }
   4218 
   4219     /*
   4220      * Check for an Empty Element from DTD definition
   4221      */
   4222     if ((info != NULL) && (info->empty)) {
   4223 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   4224 	    ctxt->sax->endElement(ctxt->userData, name);
   4225 	htmlnamePop(ctxt);
   4226 	return;
   4227     }
   4228 
   4229     /*
   4230      * Parse the content of the element:
   4231      */
   4232     currentNode = xmlStrdup(ctxt->name);
   4233     depth = ctxt->nameNr;
   4234     while (IS_CHAR_CH(CUR)) {
   4235 	oldptr = ctxt->input->cur;
   4236 	htmlParseContent(ctxt);
   4237 	if (oldptr==ctxt->input->cur) break;
   4238 	if (ctxt->nameNr < depth) break;
   4239     }
   4240 
   4241     /*
   4242      * Capture end position and add node
   4243      */
   4244     if ( currentNode != NULL && ctxt->record_info ) {
   4245        node_info.end_pos = ctxt->input->consumed +
   4246                           (CUR_PTR - ctxt->input->base);
   4247        node_info.end_line = ctxt->input->line;
   4248        node_info.node = ctxt->node;
   4249        xmlParserAddNodeInfo(ctxt, &node_info);
   4250     }
   4251     if (!IS_CHAR_CH(CUR)) {
   4252 	htmlAutoCloseOnEnd(ctxt);
   4253     }
   4254 
   4255     if (currentNode != NULL)
   4256 	xmlFree(currentNode);
   4257 }
   4258 
   4259 static void
   4260 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
   4261     /*
   4262      * Capture end position and add node
   4263      */
   4264     if ( ctxt->node != NULL && ctxt->record_info ) {
   4265        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
   4266                                 (CUR_PTR - ctxt->input->base);
   4267        ctxt->nodeInfo->end_line = ctxt->input->line;
   4268        ctxt->nodeInfo->node = ctxt->node;
   4269        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
   4270        htmlNodeInfoPop(ctxt);
   4271     }
   4272     if (!IS_CHAR_CH(CUR)) {
   4273        htmlAutoCloseOnEnd(ctxt);
   4274     }
   4275 }
   4276 
   4277 /**
   4278  * htmlParseElementInternal:
   4279  * @ctxt:  an HTML parser context
   4280  *
   4281  * parse an HTML element, new version, non recursive
   4282  *
   4283  * [39] element ::= EmptyElemTag | STag content ETag
   4284  *
   4285  * [41] Attribute ::= Name Eq AttValue
   4286  */
   4287 
   4288 static void
   4289 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
   4290     const xmlChar *name;
   4291     const htmlElemDesc * info;
   4292     htmlParserNodeInfo node_info;
   4293     int failed;
   4294 
   4295     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   4296 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   4297 		     "htmlParseElementInternal: context error\n", NULL, NULL);
   4298 	return;
   4299     }
   4300 
   4301     if (ctxt->instate == XML_PARSER_EOF)
   4302         return;
   4303 
   4304     /* Capture start position */
   4305     if (ctxt->record_info) {
   4306         node_info.begin_pos = ctxt->input->consumed +
   4307                           (CUR_PTR - ctxt->input->base);
   4308 	node_info.begin_line = ctxt->input->line;
   4309     }
   4310 
   4311     failed = htmlParseStartTag(ctxt);
   4312     name = ctxt->name;
   4313     if ((failed == -1) || (name == NULL)) {
   4314 	if (CUR == '>')
   4315 	    NEXT;
   4316         return;
   4317     }
   4318 
   4319     /*
   4320      * Lookup the info for that element.
   4321      */
   4322     info = htmlTagLookup(name);
   4323     if (info == NULL) {
   4324 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
   4325 	             "Tag %s invalid\n", name, NULL);
   4326     }
   4327 
   4328     /*
   4329      * Check for an Empty Element labeled the XML/SGML way
   4330      */
   4331     if ((CUR == '/') && (NXT(1) == '>')) {
   4332         SKIP(2);
   4333 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   4334 	    ctxt->sax->endElement(ctxt->userData, name);
   4335 	htmlnamePop(ctxt);
   4336 	return;
   4337     }
   4338 
   4339     if (CUR == '>') {
   4340         NEXT;
   4341     } else {
   4342 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
   4343 	             "Couldn't find end of Start Tag %s\n", name, NULL);
   4344 
   4345 	/*
   4346 	 * end of parsing of this node.
   4347 	 */
   4348 	if (xmlStrEqual(name, ctxt->name)) {
   4349 	    nodePop(ctxt);
   4350 	    htmlnamePop(ctxt);
   4351 	}
   4352 
   4353         if (ctxt->record_info)
   4354             htmlNodeInfoPush(ctxt, &node_info);
   4355         htmlParserFinishElementParsing(ctxt);
   4356 	return;
   4357     }
   4358 
   4359     /*
   4360      * Check for an Empty Element from DTD definition
   4361      */
   4362     if ((info != NULL) && (info->empty)) {
   4363 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   4364 	    ctxt->sax->endElement(ctxt->userData, name);
   4365 	htmlnamePop(ctxt);
   4366 	return;
   4367     }
   4368 
   4369     if (ctxt->record_info)
   4370         htmlNodeInfoPush(ctxt, &node_info);
   4371 }
   4372 
   4373 /**
   4374  * htmlParseContentInternal:
   4375  * @ctxt:  an HTML parser context
   4376  *
   4377  * Parse a content: comment, sub-element, reference or text.
   4378  * New version for non recursive htmlParseElementInternal
   4379  */
   4380 
   4381 static void
   4382 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
   4383     xmlChar *currentNode;
   4384     int depth;
   4385     const xmlChar *name;
   4386 
   4387     currentNode = xmlStrdup(ctxt->name);
   4388     depth = ctxt->nameNr;
   4389     while (1) {
   4390 	long cons = ctxt->nbChars;
   4391 
   4392         GROW;
   4393 
   4394         if (ctxt->instate == XML_PARSER_EOF)
   4395             break;
   4396 
   4397 	/*
   4398 	 * Our tag or one of it's parent or children is ending.
   4399 	 */
   4400         if ((CUR == '<') && (NXT(1) == '/')) {
   4401 	    if (htmlParseEndTag(ctxt) &&
   4402 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
   4403 		if (currentNode != NULL)
   4404 		    xmlFree(currentNode);
   4405 
   4406 	        currentNode = xmlStrdup(ctxt->name);
   4407 	        depth = ctxt->nameNr;
   4408 	    }
   4409 	    continue; /* while */
   4410         }
   4411 
   4412 	else if ((CUR == '<') &&
   4413 	         ((IS_ASCII_LETTER(NXT(1))) ||
   4414 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
   4415 	    name = htmlParseHTMLName_nonInvasive(ctxt);
   4416 	    if (name == NULL) {
   4417 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   4418 			 "htmlParseStartTag: invalid element name\n",
   4419 			 NULL, NULL);
   4420 	        /* Dump the bogus tag like browsers do */
   4421 	        while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
   4422 	            NEXT;
   4423 
   4424 	        htmlParserFinishElementParsing(ctxt);
   4425 	        if (currentNode != NULL)
   4426 	            xmlFree(currentNode);
   4427 
   4428 	        currentNode = xmlStrdup(ctxt->name);
   4429 	        depth = ctxt->nameNr;
   4430 	        continue;
   4431 	    }
   4432 
   4433 	    if (ctxt->name != NULL) {
   4434 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
   4435 	            htmlAutoClose(ctxt, name);
   4436 	            continue;
   4437 	        }
   4438 	    }
   4439 	}
   4440 
   4441 	/*
   4442 	 * Has this node been popped out during parsing of
   4443 	 * the next element
   4444 	 */
   4445         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
   4446 	    (!xmlStrEqual(currentNode, ctxt->name)))
   4447 	     {
   4448 	    htmlParserFinishElementParsing(ctxt);
   4449 	    if (currentNode != NULL) xmlFree(currentNode);
   4450 
   4451 	    currentNode = xmlStrdup(ctxt->name);
   4452 	    depth = ctxt->nameNr;
   4453 	    continue;
   4454 	}
   4455 
   4456 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
   4457 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
   4458 	    /*
   4459 	     * Handle SCRIPT/STYLE separately
   4460 	     */
   4461 	    htmlParseScript(ctxt);
   4462 	} else {
   4463 	    /*
   4464 	     * Sometimes DOCTYPE arrives in the middle of the document
   4465 	     */
   4466 	    if ((CUR == '<') && (NXT(1) == '!') &&
   4467 		(UPP(2) == 'D') && (UPP(3) == 'O') &&
   4468 		(UPP(4) == 'C') && (UPP(5) == 'T') &&
   4469 		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
   4470 		(UPP(8) == 'E')) {
   4471 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   4472 		             "Misplaced DOCTYPE declaration\n",
   4473 			     BAD_CAST "DOCTYPE" , NULL);
   4474 		htmlParseDocTypeDecl(ctxt);
   4475 	    }
   4476 
   4477 	    /*
   4478 	     * First case :  a comment
   4479 	     */
   4480 	    if ((CUR == '<') && (NXT(1) == '!') &&
   4481 		(NXT(2) == '-') && (NXT(3) == '-')) {
   4482 		htmlParseComment(ctxt);
   4483 	    }
   4484 
   4485 	    /*
   4486 	     * Second case : a Processing Instruction.
   4487 	     */
   4488 	    else if ((CUR == '<') && (NXT(1) == '?')) {
   4489 		htmlParsePI(ctxt);
   4490 	    }
   4491 
   4492 	    /*
   4493 	     * Third case :  a sub-element.
   4494 	     */
   4495 	    else if (CUR == '<') {
   4496 		htmlParseElementInternal(ctxt);
   4497 		if (currentNode != NULL) xmlFree(currentNode);
   4498 
   4499 		currentNode = xmlStrdup(ctxt->name);
   4500 		depth = ctxt->nameNr;
   4501 	    }
   4502 
   4503 	    /*
   4504 	     * Fourth case : a reference. If if has not been resolved,
   4505 	     *    parsing returns it's Name, create the node
   4506 	     */
   4507 	    else if (CUR == '&') {
   4508 		htmlParseReference(ctxt);
   4509 	    }
   4510 
   4511 	    /*
   4512 	     * Fifth case : end of the resource
   4513 	     */
   4514 	    else if (CUR == 0) {
   4515 		htmlAutoCloseOnEnd(ctxt);
   4516 		break;
   4517 	    }
   4518 
   4519 	    /*
   4520 	     * Last case, text. Note that References are handled directly.
   4521 	     */
   4522 	    else {
   4523 		htmlParseCharData(ctxt);
   4524 	    }
   4525 
   4526 	    if (cons == ctxt->nbChars) {
   4527 		if (ctxt->node != NULL) {
   4528 		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   4529 		                 "detected an error in element content\n",
   4530 				 NULL, NULL);
   4531 		}
   4532 		break;
   4533 	    }
   4534 	}
   4535         GROW;
   4536     }
   4537     if (currentNode != NULL) xmlFree(currentNode);
   4538 }
   4539 
   4540 /**
   4541  * htmlParseContent:
   4542  * @ctxt:  an HTML parser context
   4543  *
   4544  * Parse a content: comment, sub-element, reference or text.
   4545  * This is the entry point when called from parser.c
   4546  */
   4547 
   4548 void
   4549 __htmlParseContent(void *ctxt) {
   4550     if (ctxt != NULL)
   4551 	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
   4552 }
   4553 
   4554 /**
   4555  * htmlParseDocument:
   4556  * @ctxt:  an HTML parser context
   4557  *
   4558  * parse an HTML document (and build a tree if using the standard SAX
   4559  * interface).
   4560  *
   4561  * Returns 0, -1 in case of error. the parser context is augmented
   4562  *                as a result of the parsing.
   4563  */
   4564 
   4565 int
   4566 htmlParseDocument(htmlParserCtxtPtr ctxt) {
   4567     xmlChar start[4];
   4568     xmlCharEncoding enc;
   4569     xmlDtdPtr dtd;
   4570 
   4571     xmlInitParser();
   4572 
   4573     htmlDefaultSAXHandlerInit();
   4574 
   4575     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   4576 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   4577 		     "htmlParseDocument: context error\n", NULL, NULL);
   4578 	return(XML_ERR_INTERNAL_ERROR);
   4579     }
   4580     ctxt->html = 1;
   4581     ctxt->linenumbers = 1;
   4582     GROW;
   4583     /*
   4584      * SAX: beginning of the document processing.
   4585      */
   4586     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
   4587         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
   4588 
   4589     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
   4590         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
   4591 	/*
   4592 	 * Get the 4 first bytes and decode the charset
   4593 	 * if enc != XML_CHAR_ENCODING_NONE
   4594 	 * plug some encoding conversion routines.
   4595 	 */
   4596 	start[0] = RAW;
   4597 	start[1] = NXT(1);
   4598 	start[2] = NXT(2);
   4599 	start[3] = NXT(3);
   4600 	enc = xmlDetectCharEncoding(&start[0], 4);
   4601 	if (enc != XML_CHAR_ENCODING_NONE) {
   4602 	    xmlSwitchEncoding(ctxt, enc);
   4603 	}
   4604     }
   4605 
   4606     /*
   4607      * Wipe out everything which is before the first '<'
   4608      */
   4609     SKIP_BLANKS;
   4610     if (CUR == 0) {
   4611 	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
   4612 	             "Document is empty\n", NULL, NULL);
   4613     }
   4614 
   4615     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
   4616 	ctxt->sax->startDocument(ctxt->userData);
   4617 
   4618 
   4619     /*
   4620      * Parse possible comments and PIs before any content
   4621      */
   4622     while (((CUR == '<') && (NXT(1) == '!') &&
   4623             (NXT(2) == '-') && (NXT(3) == '-')) ||
   4624 	   ((CUR == '<') && (NXT(1) == '?'))) {
   4625         htmlParseComment(ctxt);
   4626         htmlParsePI(ctxt);
   4627 	SKIP_BLANKS;
   4628     }
   4629 
   4630 
   4631     /*
   4632      * Then possibly doc type declaration(s) and more Misc
   4633      * (doctypedecl Misc*)?
   4634      */
   4635     if ((CUR == '<') && (NXT(1) == '!') &&
   4636 	(UPP(2) == 'D') && (UPP(3) == 'O') &&
   4637 	(UPP(4) == 'C') && (UPP(5) == 'T') &&
   4638 	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
   4639 	(UPP(8) == 'E')) {
   4640 	htmlParseDocTypeDecl(ctxt);
   4641     }
   4642     SKIP_BLANKS;
   4643 
   4644     /*
   4645      * Parse possible comments and PIs before any content
   4646      */
   4647     while (((CUR == '<') && (NXT(1) == '!') &&
   4648             (NXT(2) == '-') && (NXT(3) == '-')) ||
   4649 	   ((CUR == '<') && (NXT(1) == '?'))) {
   4650         htmlParseComment(ctxt);
   4651         htmlParsePI(ctxt);
   4652 	SKIP_BLANKS;
   4653     }
   4654 
   4655     /*
   4656      * Time to start parsing the tree itself
   4657      */
   4658     htmlParseContentInternal(ctxt);
   4659 
   4660     /*
   4661      * autoclose
   4662      */
   4663     if (CUR == 0)
   4664 	htmlAutoCloseOnEnd(ctxt);
   4665 
   4666 
   4667     /*
   4668      * SAX: end of the document processing.
   4669      */
   4670     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
   4671         ctxt->sax->endDocument(ctxt->userData);
   4672 
   4673     if (ctxt->myDoc != NULL) {
   4674 	dtd = xmlGetIntSubset(ctxt->myDoc);
   4675 	if (dtd == NULL)
   4676 	    ctxt->myDoc->intSubset =
   4677 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
   4678 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
   4679 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
   4680     }
   4681     if (! ctxt->wellFormed) return(-1);
   4682     return(0);
   4683 }
   4684 
   4685 
   4686 /************************************************************************
   4687  *									*
   4688  *			Parser contexts handling			*
   4689  *									*
   4690  ************************************************************************/
   4691 
   4692 /**
   4693  * htmlInitParserCtxt:
   4694  * @ctxt:  an HTML parser context
   4695  *
   4696  * Initialize a parser context
   4697  *
   4698  * Returns 0 in case of success and -1 in case of error
   4699  */
   4700 
   4701 static int
   4702 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
   4703 {
   4704     htmlSAXHandler *sax;
   4705 
   4706     if (ctxt == NULL) return(-1);
   4707     memset(ctxt, 0, sizeof(htmlParserCtxt));
   4708 
   4709     ctxt->dict = xmlDictCreate();
   4710     if (ctxt->dict == NULL) {
   4711         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
   4712 	return(-1);
   4713     }
   4714     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
   4715     if (sax == NULL) {
   4716         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
   4717 	return(-1);
   4718     }
   4719     else
   4720         memset(sax, 0, sizeof(htmlSAXHandler));
   4721 
   4722     /* Allocate the Input stack */
   4723     ctxt->inputTab = (htmlParserInputPtr *)
   4724                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
   4725     if (ctxt->inputTab == NULL) {
   4726         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
   4727 	ctxt->inputNr = 0;
   4728 	ctxt->inputMax = 0;
   4729 	ctxt->input = NULL;
   4730 	return(-1);
   4731     }
   4732     ctxt->inputNr = 0;
   4733     ctxt->inputMax = 5;
   4734     ctxt->input = NULL;
   4735     ctxt->version = NULL;
   4736     ctxt->encoding = NULL;
   4737     ctxt->standalone = -1;
   4738     ctxt->instate = XML_PARSER_START;
   4739 
   4740     /* Allocate the Node stack */
   4741     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
   4742     if (ctxt->nodeTab == NULL) {
   4743         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
   4744 	ctxt->nodeNr = 0;
   4745 	ctxt->nodeMax = 0;
   4746 	ctxt->node = NULL;
   4747 	ctxt->inputNr = 0;
   4748 	ctxt->inputMax = 0;
   4749 	ctxt->input = NULL;
   4750 	return(-1);
   4751     }
   4752     ctxt->nodeNr = 0;
   4753     ctxt->nodeMax = 10;
   4754     ctxt->node = NULL;
   4755 
   4756     /* Allocate the Name stack */
   4757     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
   4758     if (ctxt->nameTab == NULL) {
   4759         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
   4760 	ctxt->nameNr = 0;
   4761 	ctxt->nameMax = 0;
   4762 	ctxt->name = NULL;
   4763 	ctxt->nodeNr = 0;
   4764 	ctxt->nodeMax = 0;
   4765 	ctxt->node = NULL;
   4766 	ctxt->inputNr = 0;
   4767 	ctxt->inputMax = 0;
   4768 	ctxt->input = NULL;
   4769 	return(-1);
   4770     }
   4771     ctxt->nameNr = 0;
   4772     ctxt->nameMax = 10;
   4773     ctxt->name = NULL;
   4774 
   4775     ctxt->nodeInfoTab = NULL;
   4776     ctxt->nodeInfoNr  = 0;
   4777     ctxt->nodeInfoMax = 0;
   4778 
   4779     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
   4780     else {
   4781         ctxt->sax = sax;
   4782 	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
   4783     }
   4784     ctxt->userData = ctxt;
   4785     ctxt->myDoc = NULL;
   4786     ctxt->wellFormed = 1;
   4787     ctxt->replaceEntities = 0;
   4788     ctxt->linenumbers = xmlLineNumbersDefaultValue;
   4789     ctxt->html = 1;
   4790     ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
   4791     ctxt->vctxt.userData = ctxt;
   4792     ctxt->vctxt.error = xmlParserValidityError;
   4793     ctxt->vctxt.warning = xmlParserValidityWarning;
   4794     ctxt->record_info = 0;
   4795     ctxt->validate = 0;
   4796     ctxt->nbChars = 0;
   4797     ctxt->checkIndex = 0;
   4798     ctxt->catalogs = NULL;
   4799     xmlInitNodeInfoSeq(&ctxt->node_seq);
   4800     return(0);
   4801 }
   4802 
   4803 /**
   4804  * htmlFreeParserCtxt:
   4805  * @ctxt:  an HTML parser context
   4806  *
   4807  * Free all the memory used by a parser context. However the parsed
   4808  * document in ctxt->myDoc is not freed.
   4809  */
   4810 
   4811 void
   4812 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
   4813 {
   4814     xmlFreeParserCtxt(ctxt);
   4815 }
   4816 
   4817 /**
   4818  * htmlNewParserCtxt:
   4819  *
   4820  * Allocate and initialize a new parser context.
   4821  *
   4822  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
   4823  */
   4824 
   4825 htmlParserCtxtPtr
   4826 htmlNewParserCtxt(void)
   4827 {
   4828     xmlParserCtxtPtr ctxt;
   4829 
   4830     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
   4831     if (ctxt == NULL) {
   4832         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
   4833 	return(NULL);
   4834     }
   4835     memset(ctxt, 0, sizeof(xmlParserCtxt));
   4836     if (htmlInitParserCtxt(ctxt) < 0) {
   4837         htmlFreeParserCtxt(ctxt);
   4838 	return(NULL);
   4839     }
   4840     return(ctxt);
   4841 }
   4842 
   4843 /**
   4844  * htmlCreateMemoryParserCtxt:
   4845  * @buffer:  a pointer to a char array
   4846  * @size:  the size of the array
   4847  *
   4848  * Create a parser context for an HTML in-memory document.
   4849  *
   4850  * Returns the new parser context or NULL
   4851  */
   4852 htmlParserCtxtPtr
   4853 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
   4854     xmlParserCtxtPtr ctxt;
   4855     xmlParserInputPtr input;
   4856     xmlParserInputBufferPtr buf;
   4857 
   4858     if (buffer == NULL)
   4859 	return(NULL);
   4860     if (size <= 0)
   4861 	return(NULL);
   4862 
   4863     ctxt = htmlNewParserCtxt();
   4864     if (ctxt == NULL)
   4865 	return(NULL);
   4866 
   4867     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
   4868     if (buf == NULL) return(NULL);
   4869 
   4870     input = xmlNewInputStream(ctxt);
   4871     if (input == NULL) {
   4872 	xmlFreeParserCtxt(ctxt);
   4873 	return(NULL);
   4874     }
   4875 
   4876     input->filename = NULL;
   4877     input->buf = buf;
   4878     input->base = input->buf->buffer->content;
   4879     input->cur = input->buf->buffer->content;
   4880     input->end = &input->buf->buffer->content[input->buf->buffer->use];
   4881 
   4882     inputPush(ctxt, input);
   4883     return(ctxt);
   4884 }
   4885 
   4886 /**
   4887  * htmlCreateDocParserCtxt:
   4888  * @cur:  a pointer to an array of xmlChar
   4889  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   4890  *
   4891  * Create a parser context for an HTML document.
   4892  *
   4893  * TODO: check the need to add encoding handling there
   4894  *
   4895  * Returns the new parser context or NULL
   4896  */
   4897 static htmlParserCtxtPtr
   4898 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
   4899     int len;
   4900     htmlParserCtxtPtr ctxt;
   4901 
   4902     if (cur == NULL)
   4903 	return(NULL);
   4904     len = xmlStrlen(cur);
   4905     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
   4906     if (ctxt == NULL)
   4907 	return(NULL);
   4908 
   4909     if (encoding != NULL) {
   4910 	xmlCharEncoding enc;
   4911 	xmlCharEncodingHandlerPtr handler;
   4912 
   4913 	if (ctxt->input->encoding != NULL)
   4914 	    xmlFree((xmlChar *) ctxt->input->encoding);
   4915 	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
   4916 
   4917 	enc = xmlParseCharEncoding(encoding);
   4918 	/*
   4919 	 * registered set of known encodings
   4920 	 */
   4921 	if (enc != XML_CHAR_ENCODING_ERROR) {
   4922 	    xmlSwitchEncoding(ctxt, enc);
   4923 	    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
   4924 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
   4925 		             "Unsupported encoding %s\n",
   4926 			     (const xmlChar *) encoding, NULL);
   4927 	    }
   4928 	} else {
   4929 	    /*
   4930 	     * fallback for unknown encodings
   4931 	     */
   4932 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
   4933 	    if (handler != NULL) {
   4934 		xmlSwitchToEncoding(ctxt, handler);
   4935 	    } else {
   4936 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
   4937 		             "Unsupported encoding %s\n",
   4938 			     (const xmlChar *) encoding, NULL);
   4939 	    }
   4940 	}
   4941     }
   4942     return(ctxt);
   4943 }
   4944 
   4945 #ifdef LIBXML_PUSH_ENABLED
   4946 /************************************************************************
   4947  *									*
   4948  *	Progressive parsing interfaces				*
   4949  *									*
   4950  ************************************************************************/
   4951 
   4952 /**
   4953  * htmlParseLookupSequence:
   4954  * @ctxt:  an HTML parser context
   4955  * @first:  the first char to lookup
   4956  * @next:  the next char to lookup or zero
   4957  * @third:  the next char to lookup or zero
   4958  * @comment: flag to force checking inside comments
   4959  *
   4960  * Try to find if a sequence (first, next, third) or  just (first next) or
   4961  * (first) is available in the input stream.
   4962  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
   4963  * to avoid rescanning sequences of bytes, it DOES change the state of the
   4964  * parser, do not use liberally.
   4965  * This is basically similar to xmlParseLookupSequence()
   4966  *
   4967  * Returns the index to the current parsing point if the full sequence
   4968  *      is available, -1 otherwise.
   4969  */
   4970 static int
   4971 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
   4972                         xmlChar next, xmlChar third, int iscomment,
   4973                         int ignoreattrval)
   4974 {
   4975     int base, len;
   4976     htmlParserInputPtr in;
   4977     const xmlChar *buf;
   4978     int incomment = 0;
   4979     int invalue = 0;
   4980     char valdellim = 0x0;
   4981 
   4982     in = ctxt->input;
   4983     if (in == NULL)
   4984         return (-1);
   4985 
   4986     base = in->cur - in->base;
   4987     if (base < 0)
   4988         return (-1);
   4989 
   4990     if (ctxt->checkIndex > base)
   4991         base = ctxt->checkIndex;
   4992 
   4993     if (in->buf == NULL) {
   4994         buf = in->base;
   4995         len = in->length;
   4996     } else {
   4997         buf = in->buf->buffer->content;
   4998         len = in->buf->buffer->use;
   4999     }
   5000 
   5001     /* take into account the sequence length */
   5002     if (third)
   5003         len -= 2;
   5004     else if (next)
   5005         len--;
   5006     for (; base < len; base++) {
   5007         if ((!incomment) && (base + 4 < len) && (!iscomment)) {
   5008             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
   5009                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
   5010                 incomment = 1;
   5011                 /* do not increment past <! - some people use <!--> */
   5012                 base += 2;
   5013             }
   5014         }
   5015         if (ignoreattrval) {
   5016             if (buf[base] == '"' || buf[base] == '\'') {
   5017                 if (invalue) {
   5018                     if (buf[base] == valdellim) {
   5019                         invalue = 0;
   5020                         continue;
   5021                     }
   5022                 } else {
   5023                     valdellim = buf[base];
   5024                     invalue = 1;
   5025                     continue;
   5026                 }
   5027             } else if (invalue) {
   5028                 continue;
   5029             }
   5030         }
   5031         if (incomment) {
   5032             if (base + 3 > len)
   5033                 return (-1);
   5034             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
   5035                 (buf[base + 2] == '>')) {
   5036                 incomment = 0;
   5037                 base += 2;
   5038             }
   5039             continue;
   5040         }
   5041         if (buf[base] == first) {
   5042             if (third != 0) {
   5043                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
   5044                     continue;
   5045             } else if (next != 0) {
   5046                 if (buf[base + 1] != next)
   5047                     continue;
   5048             }
   5049             ctxt->checkIndex = 0;
   5050 #ifdef DEBUG_PUSH
   5051             if (next == 0)
   5052                 xmlGenericError(xmlGenericErrorContext,
   5053                                 "HPP: lookup '%c' found at %d\n",
   5054                                 first, base);
   5055             else if (third == 0)
   5056                 xmlGenericError(xmlGenericErrorContext,
   5057                                 "HPP: lookup '%c%c' found at %d\n",
   5058                                 first, next, base);
   5059             else
   5060                 xmlGenericError(xmlGenericErrorContext,
   5061                                 "HPP: lookup '%c%c%c' found at %d\n",
   5062                                 first, next, third, base);
   5063 #endif
   5064             return (base - (in->cur - in->base));
   5065         }
   5066     }
   5067     if ((!incomment) && (!invalue))
   5068         ctxt->checkIndex = base;
   5069 #ifdef DEBUG_PUSH
   5070     if (next == 0)
   5071         xmlGenericError(xmlGenericErrorContext,
   5072                         "HPP: lookup '%c' failed\n", first);
   5073     else if (third == 0)
   5074         xmlGenericError(xmlGenericErrorContext,
   5075                         "HPP: lookup '%c%c' failed\n", first, next);
   5076     else
   5077         xmlGenericError(xmlGenericErrorContext,
   5078                         "HPP: lookup '%c%c%c' failed\n", first, next,
   5079                         third);
   5080 #endif
   5081     return (-1);
   5082 }
   5083 
   5084 /**
   5085  * htmlParseLookupChars:
   5086  * @ctxt: an HTML parser context
   5087  * @stop: the set of bytes which end the lookup
   5088  * @stopLen: number of bytes in @stop
   5089  *
   5090  * Scan the buffered input, from the current position or from
   5091  * ctxt->checkIndex when that one is further ahead, for the first byte
   5092  * found in @stop. Bytes of a "<!--" ... "-->" comment are skipped.
   5093  * When the scan reaches the end of the data ctxt->checkIndex records
   5094  * that position so it is not scanned again; a match resets it to 0.
   5095  * It DOES change the state of the parser, do not use liberally.
   5096  *
   5097  * Returns the offset of the stop byte from the current parsing point,
   5098  *      or -1 when no stop byte is available.
   5099  */
   5100 static int
   5101 htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
   5102                      int stopLen)
   5103 {
   5104     htmlParserInputPtr input = ctxt->input;
   5105     const xmlChar *data;
   5106     int pos, limit, k;
   5107     int inComment = 0;
   5108 
   5109     if (input == NULL)
   5110         return (-1);
   5111 
   5112     pos = input->cur - input->base;
   5113     if (pos < 0)
   5114         return (-1);
   5115     /* start from where an earlier unsuccessful lookup stopped */
   5116     if (ctxt->checkIndex > pos)
   5117         pos = ctxt->checkIndex;
   5118 
   5119     if (input->buf != NULL) {
   5120         data = input->buf->buffer->content;
   5121         limit = input->buf->buffer->use;
   5122     } else {
   5123         data = input->base;
   5124         limit = input->length;
   5125     }
   5126 
   5127     while (pos < limit) {
   5128         if ((!inComment) && (pos + 4 < limit) &&
   5129             (data[pos] == '<') && (data[pos + 1] == '!') &&
   5130             (data[pos + 2] == '-') && (data[pos + 3] == '-')) {
   5131             inComment = 1;
   5132             /* step over "<!" only, so that "<!-->" still closes */
   5133             pos += 2;
   5134         }
   5135         if (inComment) {
   5136             if (pos + 3 > limit)
   5137                 return (-1);
   5138             if ((data[pos] == '-') && (data[pos + 1] == '-') &&
   5139                 (data[pos + 2] == '>')) {
   5140                 inComment = 0;
   5141                 pos += 2;
   5142             }
   5143         } else {
   5144             for (k = 0; k < stopLen; k++) {
   5145                 if (stop[k] == data[pos]) {
   5146                     ctxt->checkIndex = 0;
   5147                     return (pos - (input->cur - input->base));
   5148                 }
   5149             }
   5150         }
   5151         pos++;
   5152     }
   5153 
   5154     ctxt->checkIndex = pos;
   5155     return (-1);
   5156 }
   5157 
   5158 /**
   5159  * htmlParseTryOrFinish:
   5160  * @ctxt:  an HTML parser context
   5161  * @terminate:  last chunk indicator
   5162  *
   5163  * Try to progress on parsing
         *
         * Runs the push parser state machine selected by ctxt->instate for as
         * long as the buffered input allows. Unless @terminate is set, a
         * construct is handed to its parsing routine only after a lookup has
         * found its terminator in the buffered data.
   5164  *
   5165  * Returns zero if no parsing was possible
         *
         * NOTE(review): ret is set to 0 at its declaration and never assigned
         * again, so the value returned is always 0.
   5166  */
   5167 static int
   5168 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
   5169     int ret = 0;
   5170     htmlParserInputPtr in;
   5171     int avail = 0;
   5172     xmlChar cur, next;
   5173 
            /* debug builds only: trace the state found on entry */
   5174 #ifdef DEBUG_PUSH
   5175     switch (ctxt->instate) {
   5176 	case XML_PARSER_EOF:
   5177 	    xmlGenericError(xmlGenericErrorContext,
   5178 		    "HPP: try EOF\n"); break;
   5179 	case XML_PARSER_START:
   5180 	    xmlGenericError(xmlGenericErrorContext,
   5181 		    "HPP: try START\n"); break;
   5182 	case XML_PARSER_MISC:
   5183 	    xmlGenericError(xmlGenericErrorContext,
   5184 		    "HPP: try MISC\n");break;
   5185 	case XML_PARSER_COMMENT:
   5186 	    xmlGenericError(xmlGenericErrorContext,
   5187 		    "HPP: try COMMENT\n");break;
   5188 	case XML_PARSER_PROLOG:
   5189 	    xmlGenericError(xmlGenericErrorContext,
   5190 		    "HPP: try PROLOG\n");break;
   5191 	case XML_PARSER_START_TAG:
   5192 	    xmlGenericError(xmlGenericErrorContext,
   5193 		    "HPP: try START_TAG\n");break;
   5194 	case XML_PARSER_CONTENT:
   5195 	    xmlGenericError(xmlGenericErrorContext,
   5196 		    "HPP: try CONTENT\n");break;
   5197 	case XML_PARSER_CDATA_SECTION:
   5198 	    xmlGenericError(xmlGenericErrorContext,
   5199 		    "HPP: try CDATA_SECTION\n");break;
   5200 	case XML_PARSER_END_TAG:
   5201 	    xmlGenericError(xmlGenericErrorContext,
   5202 		    "HPP: try END_TAG\n");break;
   5203 	case XML_PARSER_ENTITY_DECL:
   5204 	    xmlGenericError(xmlGenericErrorContext,
   5205 		    "HPP: try ENTITY_DECL\n");break;
   5206 	case XML_PARSER_ENTITY_VALUE:
   5207 	    xmlGenericError(xmlGenericErrorContext,
   5208 		    "HPP: try ENTITY_VALUE\n");break;
   5209 	case XML_PARSER_ATTRIBUTE_VALUE:
   5210 	    xmlGenericError(xmlGenericErrorContext,
   5211 		    "HPP: try ATTRIBUTE_VALUE\n");break;
   5212 	case XML_PARSER_DTD:
   5213 	    xmlGenericError(xmlGenericErrorContext,
   5214 		    "HPP: try DTD\n");break;
   5215 	case XML_PARSER_EPILOG:
   5216 	    xmlGenericError(xmlGenericErrorContext,
   5217 		    "HPP: try EPILOG\n");break;
   5218 	case XML_PARSER_PI:
   5219 	    xmlGenericError(xmlGenericErrorContext,
   5220 		    "HPP: try PI\n");break;
   5221 	case XML_PARSER_SYSTEM_LITERAL:
   5222 	    xmlGenericError(xmlGenericErrorContext,
   5223 		    "HPP: try SYSTEM_LITERAL\n");break;
   5224     }
   5225 #endif
   5226 
   5227     while (1) {
   5228 
   5229 	in = ctxt->input;
   5230 	if (in == NULL) break;
        	/* avail: bytes in the input that are not consumed yet */
   5231 	if (in->buf == NULL)
   5232 	    avail = in->length - (in->cur - in->base);
   5233 	else
   5234 	    avail = in->buf->buffer->use - (in->cur - in->base);
        	/*
        	 * last chunk and nothing left: auto-close, then signal the end
        	 * of the document if ctxt->nameNr is 0 and EOF was not reached
        	 */
   5235 	if ((avail == 0) && (terminate)) {
   5236 	    htmlAutoCloseOnEnd(ctxt);
   5237 	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
   5238 		/*
   5239 		 * SAX: end of the document processing.
   5240 		 */
   5241 		ctxt->instate = XML_PARSER_EOF;
   5242 		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
   5243 		    ctxt->sax->endDocument(ctxt->userData);
   5244 	    }
   5245 	}
   5246         if (avail < 1)
   5247 	    goto done;
   5248 	cur = in->cur[0];
        	/* a zero byte in the input is passed over with SKIP(1) */
   5249 	if (cur == 0) {
   5250 	    SKIP(1);
   5251 	    continue;
   5252 	}
   5253 
   5254         switch (ctxt->instate) {
   5255             case XML_PARSER_EOF:
   5256 	        /*
   5257 		 * Document parsing is done !
   5258 		 */
   5259 	        goto done;
   5260             case XML_PARSER_START:
   5261 	        /*
   5262 		 * Very first chars read from the document flow.
   5263 		 */
   5264 		cur = in->cur[0];
   5265 		if (IS_BLANK_CH(cur)) {
   5266 		    SKIP_BLANKS;
   5267 		    if (in->buf == NULL)
   5268 			avail = in->length - (in->cur - in->base);
   5269 		    else
   5270 			avail = in->buf->buffer->use - (in->cur - in->base);
   5271 		}
   5272 		if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
   5273 		    ctxt->sax->setDocumentLocator(ctxt->userData,
   5274 						  &xmlDefaultSAXLocator);
   5275 		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
   5276 	            (!ctxt->disableSAX))
   5277 		    ctxt->sax->startDocument(ctxt->userData);
   5278 
   5279 		cur = in->cur[0];
   5280 		next = in->cur[1];
        		/* a leading "<!DOCTYPE" (any case) goes to PROLOG, else MISC */
   5281 		if ((cur == '<') && (next == '!') &&
   5282 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
   5283 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
   5284 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
   5285 		    (UPP(8) == 'E')) {
   5286 		    if ((!terminate) &&
   5287 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5288 			goto done;
   5289 #ifdef DEBUG_PUSH
   5290 		    xmlGenericError(xmlGenericErrorContext,
   5291 			    "HPP: Parsing internal subset\n");
   5292 #endif
   5293 		    htmlParseDocTypeDecl(ctxt);
   5294 		    ctxt->instate = XML_PARSER_PROLOG;
   5295 #ifdef DEBUG_PUSH
   5296 		    xmlGenericError(xmlGenericErrorContext,
   5297 			    "HPP: entering PROLOG\n");
   5298 #endif
   5299                 } else {
   5300 		    ctxt->instate = XML_PARSER_MISC;
   5301 #ifdef DEBUG_PUSH
   5302 		    xmlGenericError(xmlGenericErrorContext,
   5303 			    "HPP: entering MISC\n");
   5304 #endif
   5305 		}
   5306 		break;
   5307             case XML_PARSER_MISC:
   5308 		SKIP_BLANKS;
   5309 		if (in->buf == NULL)
   5310 		    avail = in->length - (in->cur - in->base);
   5311 		else
   5312 		    avail = in->buf->buffer->use - (in->cur - in->base);
   5313 		if (avail < 2)
   5314 		    goto done;
   5315 		cur = in->cur[0];
   5316 		next = in->cur[1];
        		/*
        		 * NOTE(review): only avail >= 2 is known at this point, yet
        		 * in->cur[2] and in->cur[3] are read below - confirm the
        		 * buffer is guaranteed to be readable that far.
        		 */
   5317 	        if ((cur == '<') && (next == '!') &&
   5318 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
   5319 		    if ((!terminate) &&
   5320 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
   5321 			goto done;
   5322 #ifdef DEBUG_PUSH
   5323 		    xmlGenericError(xmlGenericErrorContext,
   5324 			    "HPP: Parsing Comment\n");
   5325 #endif
   5326 		    htmlParseComment(ctxt);
   5327 		    ctxt->instate = XML_PARSER_MISC;
   5328 	        } else if ((cur == '<') && (next == '?')) {
   5329 		    if ((!terminate) &&
   5330 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5331 			goto done;
   5332 #ifdef DEBUG_PUSH
   5333 		    xmlGenericError(xmlGenericErrorContext,
   5334 			    "HPP: Parsing PI\n");
   5335 #endif
   5336 		    htmlParsePI(ctxt);
   5337 		    ctxt->instate = XML_PARSER_MISC;
   5338 		} else if ((cur == '<') && (next == '!') &&
   5339 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
   5340 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
   5341 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
   5342 		    (UPP(8) == 'E')) {
   5343 		    if ((!terminate) &&
   5344 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5345 			goto done;
   5346 #ifdef DEBUG_PUSH
   5347 		    xmlGenericError(xmlGenericErrorContext,
   5348 			    "HPP: Parsing internal subset\n");
   5349 #endif
   5350 		    htmlParseDocTypeDecl(ctxt);
   5351 		    ctxt->instate = XML_PARSER_PROLOG;
   5352 #ifdef DEBUG_PUSH
   5353 		    xmlGenericError(xmlGenericErrorContext,
   5354 			    "HPP: entering PROLOG\n");
   5355 #endif
   5356 		} else if ((cur == '<') && (next == '!') &&
   5357 		           (avail < 9)) {
        		    /* "<!" with less than the 9 bytes tested above: wait */
   5358 		    goto done;
   5359 		} else {
   5360 		    ctxt->instate = XML_PARSER_START_TAG;
   5361 #ifdef DEBUG_PUSH
   5362 		    xmlGenericError(xmlGenericErrorContext,
   5363 			    "HPP: entering START_TAG\n");
   5364 #endif
   5365 		}
   5366 		break;
   5367             case XML_PARSER_PROLOG:
   5368 		SKIP_BLANKS;
   5369 		if (in->buf == NULL)
   5370 		    avail = in->length - (in->cur - in->base);
   5371 		else
   5372 		    avail = in->buf->buffer->use - (in->cur - in->base);
   5373 		if (avail < 2)
   5374 		    goto done;
   5375 		cur = in->cur[0];
   5376 		next = in->cur[1];
   5377 		if ((cur == '<') && (next == '!') &&
   5378 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
   5379 		    if ((!terminate) &&
   5380 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
   5381 			goto done;
   5382 #ifdef DEBUG_PUSH
   5383 		    xmlGenericError(xmlGenericErrorContext,
   5384 			    "HPP: Parsing Comment\n");
   5385 #endif
   5386 		    htmlParseComment(ctxt);
   5387 		    ctxt->instate = XML_PARSER_PROLOG;
   5388 	        } else if ((cur == '<') && (next == '?')) {
   5389 		    if ((!terminate) &&
   5390 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5391 			goto done;
   5392 #ifdef DEBUG_PUSH
   5393 		    xmlGenericError(xmlGenericErrorContext,
   5394 			    "HPP: Parsing PI\n");
   5395 #endif
   5396 		    htmlParsePI(ctxt);
   5397 		    ctxt->instate = XML_PARSER_PROLOG;
   5398 		} else if ((cur == '<') && (next == '!') &&
   5399 		           (avail < 4)) {
        		    /* "<!" with less than the 4 bytes of "<!--": wait */
   5400 		    goto done;
   5401 		} else {
   5402 		    ctxt->instate = XML_PARSER_START_TAG;
   5403 #ifdef DEBUG_PUSH
   5404 		    xmlGenericError(xmlGenericErrorContext,
   5405 			    "HPP: entering START_TAG\n");
   5406 #endif
   5407 		}
   5408 		break;
   5409             case XML_PARSER_EPILOG:
   5410 		if (in->buf == NULL)
   5411 		    avail = in->length - (in->cur - in->base);
   5412 		else
   5413 		    avail = in->buf->buffer->use - (in->cur - in->base);
   5414 		if (avail < 1)
   5415 		    goto done;
   5416 		cur = in->cur[0];
   5417 		if (IS_BLANK_CH(cur)) {
   5418 		    htmlParseCharData(ctxt);
   5419 		    goto done;
   5420 		}
   5421 		if (avail < 2)
   5422 		    goto done;
   5423 		next = in->cur[1];
   5424 	        if ((cur == '<') && (next == '!') &&
   5425 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
   5426 		    if ((!terminate) &&
   5427 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
   5428 			goto done;
   5429 #ifdef DEBUG_PUSH
   5430 		    xmlGenericError(xmlGenericErrorContext,
   5431 			    "HPP: Parsing Comment\n");
   5432 #endif
   5433 		    htmlParseComment(ctxt);
   5434 		    ctxt->instate = XML_PARSER_EPILOG;
   5435 	        } else if ((cur == '<') && (next == '?')) {
   5436 		    if ((!terminate) &&
   5437 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5438 			goto done;
   5439 #ifdef DEBUG_PUSH
   5440 		    xmlGenericError(xmlGenericErrorContext,
   5441 			    "HPP: Parsing PI\n");
   5442 #endif
   5443 		    htmlParsePI(ctxt);
   5444 		    ctxt->instate = XML_PARSER_EPILOG;
   5445 		} else if ((cur == '<') && (next == '!') &&
   5446 		           (avail < 4)) {
        		    /* "<!" with less than the 4 bytes of "<!--": wait */
   5447 		    goto done;
   5448 		} else {
        		    /* anything else here: flag the error, end the document */
   5449 		    ctxt->errNo = XML_ERR_DOCUMENT_END;
   5450 		    ctxt->wellFormed = 0;
   5451 		    ctxt->instate = XML_PARSER_EOF;
   5452 #ifdef DEBUG_PUSH
   5453 		    xmlGenericError(xmlGenericErrorContext,
   5454 			    "HPP: entering EOF\n");
   5455 #endif
   5456 		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
   5457 			ctxt->sax->endDocument(ctxt->userData);
   5458 		    goto done;
   5459 		}
   5460 		break;
   5461             case XML_PARSER_START_TAG: {
   5462 	        const xmlChar *name;
   5463 		int failed;
   5464 		const htmlElemDesc * info;
   5465 
   5466 		if (avail < 2)
   5467 		    goto done;
   5468 		cur = in->cur[0];
   5469 	        if (cur != '<') {
   5470 		    ctxt->instate = XML_PARSER_CONTENT;
   5471 #ifdef DEBUG_PUSH
   5472 		    xmlGenericError(xmlGenericErrorContext,
   5473 			    "HPP: entering CONTENT\n");
   5474 #endif
   5475 		    break;
   5476 		}
   5477 		if (in->cur[1] == '/') {
   5478 		    ctxt->instate = XML_PARSER_END_TAG;
   5479 		    ctxt->checkIndex = 0;
   5480 #ifdef DEBUG_PUSH
   5481 		    xmlGenericError(xmlGenericErrorContext,
   5482 			    "HPP: entering END_TAG\n");
   5483 #endif
   5484 		    break;
   5485 		}
        		/* wait until the '>' closing the start tag is buffered */
   5486 		if ((!terminate) &&
   5487 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5488 		    goto done;
   5489 
   5490 		failed = htmlParseStartTag(ctxt);
   5491 		name = ctxt->name;
   5492 		if ((failed == -1) ||
   5493 		    (name == NULL)) {
   5494 		    if (CUR == '>')
   5495 			NEXT;
   5496 		    break;
   5497 		}
   5498 
   5499 		/*
   5500 		 * Lookup the info for that element.
   5501 		 */
   5502 		info = htmlTagLookup(name);
   5503 		if (info == NULL) {
   5504 		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
   5505 		                 "Tag %s invalid\n", name, NULL);
   5506 		}
   5507 
   5508 		/*
   5509 		 * Check for an Empty Element labeled the XML/SGML way
   5510 		 */
   5511 		if ((CUR == '/') && (NXT(1) == '>')) {
   5512 		    SKIP(2);
   5513 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   5514 			ctxt->sax->endElement(ctxt->userData, name);
   5515 		    htmlnamePop(ctxt);
   5516 		    ctxt->instate = XML_PARSER_CONTENT;
   5517 #ifdef DEBUG_PUSH
   5518 		    xmlGenericError(xmlGenericErrorContext,
   5519 			    "HPP: entering CONTENT\n");
   5520 #endif
   5521 		    break;
   5522 		}
   5523 
   5524 		if (CUR == '>') {
   5525 		    NEXT;
   5526 		} else {
   5527 		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
   5528 		                 "Couldn't find end of Start Tag %s\n",
   5529 				 name, NULL);
   5530 
   5531 		    /*
   5532 		     * end of parsing of this node.
   5533 		     */
   5534 		    if (xmlStrEqual(name, ctxt->name)) {
   5535 			nodePop(ctxt);
   5536 			htmlnamePop(ctxt);
   5537 		    }
   5538 
   5539 		    ctxt->instate = XML_PARSER_CONTENT;
   5540 #ifdef DEBUG_PUSH
   5541 		    xmlGenericError(xmlGenericErrorContext,
   5542 			    "HPP: entering CONTENT\n");
   5543 #endif
   5544 		    break;
   5545 		}
   5546 
   5547 		/*
   5548 		 * Check for an Empty Element from DTD definition
   5549 		 */
   5550 		if ((info != NULL) && (info->empty)) {
   5551 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   5552 			ctxt->sax->endElement(ctxt->userData, name);
   5553 		    htmlnamePop(ctxt);
   5554 		}
   5555 		ctxt->instate = XML_PARSER_CONTENT;
   5556 #ifdef DEBUG_PUSH
   5557 		xmlGenericError(xmlGenericErrorContext,
   5558 			"HPP: entering CONTENT\n");
   5559 #endif
   5560                 break;
   5561 	    }
   5562             case XML_PARSER_CONTENT: {
   5563 		long cons;
   5564                 /*
   5565 		 * Handle preparsed entities and charRef
   5566 		 */
   5567 		if (ctxt->token != 0) {
   5568 		    xmlChar chr[2] = { 0 , 0 } ;
   5569 
   5570 		    chr[0] = (xmlChar) ctxt->token;
   5571 		    htmlCheckParagraph(ctxt);
   5572 		    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
   5573 			ctxt->sax->characters(ctxt->userData, chr, 1);
   5574 		    ctxt->token = 0;
   5575 		    ctxt->checkIndex = 0;
   5576 		}
        		/*
        		 * exactly one byte left on the last chunk: unless it is '<'
        		 * or '&' it is reported through SAX and consumed right here
        		 */
   5577 		if ((avail == 1) && (terminate)) {
   5578 		    cur = in->cur[0];
   5579 		    if ((cur != '<') && (cur != '&')) {
   5580 			if (ctxt->sax != NULL) {
   5581 			    if (IS_BLANK_CH(cur)) {
   5582 				if (ctxt->sax->ignorableWhitespace != NULL)
   5583 				    ctxt->sax->ignorableWhitespace(
   5584 					    ctxt->userData, &cur, 1);
   5585 			    } else {
   5586 				htmlCheckParagraph(ctxt);
   5587 				if (ctxt->sax->characters != NULL)
   5588 				    ctxt->sax->characters(
   5589 					    ctxt->userData, &cur, 1);
   5590 			    }
   5591 			}
   5592 			ctxt->token = 0;
   5593 			ctxt->checkIndex = 0;
   5594 			in->cur++;
   5595 			break;
   5596 		    }
   5597 		}
   5598 		if (avail < 2)
   5599 		    goto done;
   5600 		cur = in->cur[0];
   5601 		next = in->cur[1];
        		/* sample nbChars, it is compared again after the parsing step */
   5602 		cons = ctxt->nbChars;
   5603 		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
   5604 		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
   5605 		    /*
   5606 		     * Handle SCRIPT/STYLE separately
   5607 		     */
   5608 		    if (!terminate) {
   5609 		        int idx;
   5610 			xmlChar val;
   5611 
   5612 			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);
   5613 			if (idx < 0)
   5614 			    goto done;
        			/* byte right after the "</" located by the lookup */
   5615 		        val = in->cur[idx + 2];
   5616 			if (val == 0) /* bad cut of input */
   5617 			    goto done;
   5618 		    }
   5619 		    htmlParseScript(ctxt);
        		    /*
        		     * NOTE(review): cur and next still hold the bytes read
        		     * before htmlParseScript() was called.
        		     */
   5620 		    if ((cur == '<') && (next == '/')) {
   5621 			ctxt->instate = XML_PARSER_END_TAG;
   5622 			ctxt->checkIndex = 0;
   5623 #ifdef DEBUG_PUSH
   5624 			xmlGenericError(xmlGenericErrorContext,
   5625 				"HPP: entering END_TAG\n");
   5626 #endif
   5627 			break;
   5628 		    }
   5629 		} else {
   5630 		    /*
   5631 		     * Sometimes DOCTYPE arrives in the middle of the document
   5632 		     */
   5633 		    if ((cur == '<') && (next == '!') &&
   5634 			(UPP(2) == 'D') && (UPP(3) == 'O') &&
   5635 			(UPP(4) == 'C') && (UPP(5) == 'T') &&
   5636 			(UPP(6) == 'Y') && (UPP(7) == 'P') &&
   5637 			(UPP(8) == 'E')) {
   5638 			if ((!terminate) &&
   5639 			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5640 			    goto done;
   5641 			htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   5642 			             "Misplaced DOCTYPE declaration\n",
   5643 				     BAD_CAST "DOCTYPE" , NULL);
   5644 			htmlParseDocTypeDecl(ctxt);
   5645 		    } else if ((cur == '<') && (next == '!') &&
   5646 			(in->cur[2] == '-') && (in->cur[3] == '-')) {
   5647 			if ((!terminate) &&
   5648 			    (htmlParseLookupSequence(
   5649 				ctxt, '-', '-', '>', 1, 1) < 0))
   5650 			    goto done;
   5651 #ifdef DEBUG_PUSH
   5652 			xmlGenericError(xmlGenericErrorContext,
   5653 				"HPP: Parsing Comment\n");
   5654 #endif
   5655 			htmlParseComment(ctxt);
   5656 			ctxt->instate = XML_PARSER_CONTENT;
   5657 		    } else if ((cur == '<') && (next == '?')) {
   5658 			if ((!terminate) &&
   5659 			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5660 			    goto done;
   5661 #ifdef DEBUG_PUSH
   5662 			xmlGenericError(xmlGenericErrorContext,
   5663 				"HPP: Parsing PI\n");
   5664 #endif
   5665 			htmlParsePI(ctxt);
   5666 			ctxt->instate = XML_PARSER_CONTENT;
   5667 		    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
        			/* "<!" with less than the 4 bytes of "<!--": wait */
   5668 			goto done;
   5669 		    } else if ((cur == '<') && (next == '/')) {
   5670 			ctxt->instate = XML_PARSER_END_TAG;
   5671 			ctxt->checkIndex = 0;
   5672 #ifdef DEBUG_PUSH
   5673 			xmlGenericError(xmlGenericErrorContext,
   5674 				"HPP: entering END_TAG\n");
   5675 #endif
   5676 			break;
   5677 		    } else if (cur == '<') {
   5678 			ctxt->instate = XML_PARSER_START_TAG;
   5679 			ctxt->checkIndex = 0;
   5680 #ifdef DEBUG_PUSH
   5681 			xmlGenericError(xmlGenericErrorContext,
   5682 				"HPP: entering START_TAG\n");
   5683 #endif
   5684 			break;
   5685 		    } else if (cur == '&') {
        			/* a reference is parsed once ';', ' ', '>' or '/' is seen */
   5686 			if ((!terminate) &&
   5687 			    (htmlParseLookupChars(ctxt,
   5688                                                   BAD_CAST "; >/", 4) < 0))
   5689 			    goto done;
   5690 #ifdef DEBUG_PUSH
   5691 			xmlGenericError(xmlGenericErrorContext,
   5692 				"HPP: Parsing Reference\n");
   5693 #endif
   5694 			/* TODO: check generation of subtrees if noent !!! */
   5695 			htmlParseReference(ctxt);
   5696 		    } else {
   5697 		        /*
   5698 			 * check that the text sequence is complete
   5699 			 * before handing out the data to the parser
   5700 			 * to avoid problems with erroneous end of
   5701 			 * data detection.
   5702 			 */
   5703 			if ((!terminate) &&
   5704                             (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
   5705 			    goto done;
   5706 			ctxt->checkIndex = 0;
   5707 #ifdef DEBUG_PUSH
   5708 			xmlGenericError(xmlGenericErrorContext,
   5709 				"HPP: Parsing char data\n");
   5710 #endif
   5711 			htmlParseCharData(ctxt);
   5712 		    }
   5713 		}
        		/*
        		 * nbChars did not move since it was sampled into cons: report
        		 * an error when ctxt->node is set, then step with NEXT
        		 */
   5714 		if (cons == ctxt->nbChars) {
   5715 		    if (ctxt->node != NULL) {
   5716 			htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5717 			             "detected an error in element content\n",
   5718 				     NULL, NULL);
   5719 		    }
   5720 		    NEXT;
   5721 		    break;
   5722 		}
   5723 
   5724 		break;
   5725 	    }
   5726             case XML_PARSER_END_TAG:
   5727 		if (avail < 2)
   5728 		    goto done;
   5729 		if ((!terminate) &&
   5730 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5731 		    goto done;
   5732 		htmlParseEndTag(ctxt);
   5733 		if (ctxt->nameNr == 0) {
   5734 		    ctxt->instate = XML_PARSER_EPILOG;
   5735 		} else {
   5736 		    ctxt->instate = XML_PARSER_CONTENT;
   5737 		}
   5738 		ctxt->checkIndex = 0;
        		/* NOTE(review): the trace says CONTENT even when EPILOG was set */
   5739 #ifdef DEBUG_PUSH
   5740 		xmlGenericError(xmlGenericErrorContext,
   5741 			"HPP: entering CONTENT\n");
   5742 #endif
   5743 	        break;
        	    /*
        	     * Every state below only reports an internal error, then
        	     * resets ctxt->instate and ctxt->checkIndex.
        	     */
   5744             case XML_PARSER_CDATA_SECTION:
   5745 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5746 			"HPP: internal error, state == CDATA\n",
   5747 			     NULL, NULL);
   5748 		ctxt->instate = XML_PARSER_CONTENT;
   5749 		ctxt->checkIndex = 0;
   5750 #ifdef DEBUG_PUSH
   5751 		xmlGenericError(xmlGenericErrorContext,
   5752 			"HPP: entering CONTENT\n");
   5753 #endif
   5754 		break;
   5755             case XML_PARSER_DTD:
   5756 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5757 			"HPP: internal error, state == DTD\n",
   5758 			     NULL, NULL);
   5759 		ctxt->instate = XML_PARSER_CONTENT;
   5760 		ctxt->checkIndex = 0;
   5761 #ifdef DEBUG_PUSH
   5762 		xmlGenericError(xmlGenericErrorContext,
   5763 			"HPP: entering CONTENT\n");
   5764 #endif
   5765 		break;
   5766             case XML_PARSER_COMMENT:
   5767 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5768 			"HPP: internal error, state == COMMENT\n",
   5769 			     NULL, NULL);
   5770 		ctxt->instate = XML_PARSER_CONTENT;
   5771 		ctxt->checkIndex = 0;
   5772 #ifdef DEBUG_PUSH
   5773 		xmlGenericError(xmlGenericErrorContext,
   5774 			"HPP: entering CONTENT\n");
   5775 #endif
   5776 		break;
   5777             case XML_PARSER_PI:
   5778 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5779 			"HPP: internal error, state == PI\n",
   5780 			     NULL, NULL);
   5781 		ctxt->instate = XML_PARSER_CONTENT;
   5782 		ctxt->checkIndex = 0;
   5783 #ifdef DEBUG_PUSH
   5784 		xmlGenericError(xmlGenericErrorContext,
   5785 			"HPP: entering CONTENT\n");
   5786 #endif
   5787 		break;
   5788             case XML_PARSER_ENTITY_DECL:
   5789 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5790 			"HPP: internal error, state == ENTITY_DECL\n",
   5791 			     NULL, NULL);
   5792 		ctxt->instate = XML_PARSER_CONTENT;
   5793 		ctxt->checkIndex = 0;
   5794 #ifdef DEBUG_PUSH
   5795 		xmlGenericError(xmlGenericErrorContext,
   5796 			"HPP: entering CONTENT\n");
   5797 #endif
   5798 		break;
   5799             case XML_PARSER_ENTITY_VALUE:
   5800 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5801 			"HPP: internal error, state == ENTITY_VALUE\n",
   5802 			     NULL, NULL);
   5803 		ctxt->instate = XML_PARSER_CONTENT;
   5804 		ctxt->checkIndex = 0;
        		/* NOTE(review): the trace says DTD, the state set is CONTENT */
   5805 #ifdef DEBUG_PUSH
   5806 		xmlGenericError(xmlGenericErrorContext,
   5807 			"HPP: entering DTD\n");
   5808 #endif
   5809 		break;
   5810             case XML_PARSER_ATTRIBUTE_VALUE:
   5811 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5812 			"HPP: internal error, state == ATTRIBUTE_VALUE\n",
   5813 			     NULL, NULL);
   5814 		ctxt->instate = XML_PARSER_START_TAG;
   5815 		ctxt->checkIndex = 0;
   5816 #ifdef DEBUG_PUSH
   5817 		xmlGenericError(xmlGenericErrorContext,
   5818 			"HPP: entering START_TAG\n");
   5819 #endif
   5820 		break;
   5821 	    case XML_PARSER_SYSTEM_LITERAL:
   5822 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5823 		    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
   5824 			     NULL, NULL);
   5825 		ctxt->instate = XML_PARSER_CONTENT;
   5826 		ctxt->checkIndex = 0;
   5827 #ifdef DEBUG_PUSH
   5828 		xmlGenericError(xmlGenericErrorContext,
   5829 			"HPP: entering CONTENT\n");
   5830 #endif
   5831 		break;
   5832 	    case XML_PARSER_IGNORE:
   5833 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5834 			"HPP: internal error, state == XML_PARSER_IGNORE\n",
   5835 			     NULL, NULL);
   5836 		ctxt->instate = XML_PARSER_CONTENT;
   5837 		ctxt->checkIndex = 0;
   5838 #ifdef DEBUG_PUSH
   5839 		xmlGenericError(xmlGenericErrorContext,
   5840 			"HPP: entering CONTENT\n");
   5841 #endif
   5842 		break;
   5843 	    case XML_PARSER_PUBLIC_LITERAL:
        		/*
        		 * NOTE(review): the message names XML_PARSER_LITERAL while
        		 * the case handled is XML_PARSER_PUBLIC_LITERAL.
        		 */
   5844 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5845 			"HPP: internal error, state == XML_PARSER_LITERAL\n",
   5846 			     NULL, NULL);
   5847 		ctxt->instate = XML_PARSER_CONTENT;
   5848 		ctxt->checkIndex = 0;
   5849 #ifdef DEBUG_PUSH
   5850 		xmlGenericError(xmlGenericErrorContext,
   5851 			"HPP: entering CONTENT\n");
   5852 #endif
   5853 		break;
   5854 
   5855 	}
   5856     }
   5857 done:
            /* same end-of-input handling as at the top of the loop */
   5858     if ((avail == 0) && (terminate)) {
   5859 	htmlAutoCloseOnEnd(ctxt);
   5860 	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
   5861 	    /*
   5862 	     * SAX: end of the document processing.
   5863 	     */
   5864 	    ctxt->instate = XML_PARSER_EOF;
   5865 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
   5866 		ctxt->sax->endDocument(ctxt->userData);
   5867 	}
   5868     }
            /*
             * On the last chunk, or in the EOF or EPILOG state, a document
             * without internal subset gets the HTML 4.0 Transitional one.
             */
   5869     if ((ctxt->myDoc != NULL) &&
   5870 	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
   5871 	 (ctxt->instate == XML_PARSER_EPILOG))) {
   5872 	xmlDtdPtr dtd;
   5873 	dtd = xmlGetIntSubset(ctxt->myDoc);
   5874 	if (dtd == NULL)
   5875 	    ctxt->myDoc->intSubset =
   5876 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
   5877 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
   5878 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
   5879     }
   5880 #ifdef DEBUG_PUSH
   5881     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
   5882 #endif
   5883     return(ret);
   5884 }
   5885 
   5886 /**
   5887  * htmlParseChunk:
   5888  * @ctxt:  an HTML parser context
   5889  * @chunk:  the bytes to add to the parser input
   5890  * @size:  the number of bytes in @chunk
   5891  * @terminate:  last chunk indicator
   5892  *
   5893  * Push a chunk of memory into the input buffer, then parse as far as
   5894  * the buffered data allows. When @terminate is set the parser is left
   5895  * in the EOF state and the endDocument SAX callback is issued if the
   5896  * parser had not reached EOF by itself.
   5897  *
   5898  * Returns ctxt->errNo (zero if no error was recorded), or directly
   5899  *         XML_ERR_INTERNAL_ERROR for an unusable context,
   5900  *         XML_PARSER_EOF if the push fails and
   5901  *         XML_ERR_INVALID_ENCODING if the encoder fails.
   5902  */
   5903 int
   5904 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
   5905               int terminate) {
   5906     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   5907 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5908 		     "htmlParseChunk: context error\n", NULL, NULL);
   5909 	return(XML_ERR_INTERNAL_ERROR);
   5910     }
   5911 
   5912     if (ctxt->instate != XML_PARSER_EOF) {
   5913 	xmlParserInputBufferPtr ibuf = ctxt->input->buf;
   5914 
   5915 	if ((ibuf != NULL) && (size > 0) && (chunk != NULL)) {
   5916 	    /*
   5917 	     * Keep base and cur as offsets so that both pointers can be
   5918 	     * rebuilt from buffer->content once the data is pushed.
   5919 	     */
   5920 	    int baseOff = ctxt->input->base - ibuf->buffer->content;
   5921 	    int curOff = ctxt->input->cur - ctxt->input->base;
   5922 
   5923 	    if (xmlParserInputBufferPush(ibuf, size, chunk) < 0) {
   5924 		ctxt->errNo = XML_PARSER_EOF;
   5925 		ctxt->disableSAX = 1;
   5926 		return (XML_PARSER_EOF);
   5927 	    }
   5928 	    ctxt->input->base = ibuf->buffer->content + baseOff;
   5929 	    ctxt->input->cur = ctxt->input->base + curOff;
   5930 	    ctxt->input->end = &ibuf->buffer->content[ibuf->buffer->use];
   5931 #ifdef DEBUG_PUSH
   5932 	    xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
   5933 #endif
   5934 	} else if ((ibuf != NULL) && (ibuf->encoder != NULL) &&
   5935 		   (ibuf->buffer != NULL) && (ibuf->raw != NULL)) {
   5936 	    /* no new bytes to push: only invoke the input encoder */
   5937 	    if (xmlCharEncInFunc(ibuf->encoder, ibuf->buffer, ibuf->raw) < 0) {
   5938 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
   5939 			     "encoder error\n", NULL, NULL);
   5940 		return(XML_ERR_INVALID_ENCODING);
   5941 	    }
   5942 	}
   5943     }
   5944 
   5945     htmlParseTryOrFinish(ctxt, terminate);
   5946 
   5947     if ((terminate) && (ctxt->instate != XML_PARSER_EOF)) {
   5948 	/* only EPILOG and MISC are acceptable states to stop in */
   5949 	if ((ctxt->instate != XML_PARSER_EPILOG) &&
   5950 	    (ctxt->instate != XML_PARSER_MISC)) {
   5951 	    ctxt->errNo = XML_ERR_DOCUMENT_END;
   5952 	    ctxt->wellFormed = 0;
   5953 	}
   5954 	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
   5955 	    ctxt->sax->endDocument(ctxt->userData);
   5956 	ctxt->instate = XML_PARSER_EOF;
   5957     }
   5958 
   5959     return((xmlParserErrors) ctxt->errNo);
   5960 }
   5961 
   5962 /************************************************************************
   5963  *									*
   5964  *			User entry points				*
   5965  *									*
   5966  ************************************************************************/
   5967 
   5968 /**
   5969  * htmlCreatePushParserCtxt:
   5970  * @sax:  a SAX handler
   5971  * @user_data:  The user data returned on SAX callbacks
   5972  * @chunk:  a pointer to an array of chars
   5973  * @size:  number of chars in the array
   5974  * @filename:  an optional file name or URI
   5975  * @enc:  an optional encoding
   5976  *
   5977  * Create a parser context for using the HTML parser in push mode
   5978  * The value of @filename is used for fetching external entities
   5979  * and error/warning reports.
   5980  *
   5981  * Returns the new parser context or NULL
   5982  */
   5983 htmlParserCtxtPtr
   5984 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
   5985                          const char *chunk, int size, const char *filename,
   5986 			 xmlCharEncoding enc) {
   5987     htmlParserCtxtPtr ctxt;
   5988     htmlParserInputPtr inputStream;
   5989     xmlParserInputBufferPtr buf;
   5990 
   5991     xmlInitParser();
   5992 
   5993     buf = xmlAllocParserInputBuffer(enc);
   5994     if (buf == NULL) return(NULL);
   5995 
   5996     ctxt = htmlNewParserCtxt();
   5997     if (ctxt == NULL) {
   5998 	xmlFreeParserInputBuffer(buf);
   5999 	return(NULL);
   6000     }
   6001     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
   6002 	ctxt->charset=XML_CHAR_ENCODING_UTF8;
   6003     if (sax != NULL) {
   6004 	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
   6005 	    xmlFree(ctxt->sax);
   6006 	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
   6007 	if (ctxt->sax == NULL) {
   6008 	    xmlFree(buf);
   6009 	    xmlFree(ctxt);
   6010 	    return(NULL);
   6011 	}
   6012 	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
   6013 	if (user_data != NULL)
   6014 	    ctxt->userData = user_data;
   6015     }
   6016     if (filename == NULL) {
   6017 	ctxt->directory = NULL;
   6018     } else {
   6019         ctxt->directory = xmlParserGetDirectory(filename);
   6020     }
   6021 
   6022     inputStream = htmlNewInputStream(ctxt);
   6023     if (inputStream == NULL) {
   6024 	xmlFreeParserCtxt(ctxt);
   6025 	xmlFree(buf);
   6026 	return(NULL);
   6027     }
   6028 
   6029     if (filename == NULL)
   6030 	inputStream->filename = NULL;
   6031     else
   6032 	inputStream->filename = (char *)
   6033 	    xmlCanonicPath((const xmlChar *) filename);
   6034     inputStream->buf = buf;
   6035     inputStream->base = inputStream->buf->buffer->content;
   6036     inputStream->cur = inputStream->buf->buffer->content;
   6037     inputStream->end =
   6038 	&inputStream->buf->buffer->content[inputStream->buf->buffer->use];
   6039 
   6040     inputPush(ctxt, inputStream);
   6041 
   6042     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
   6043         (ctxt->input->buf != NULL))  {
   6044 	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
   6045 	int cur = ctxt->input->cur - ctxt->input->base;
   6046 
   6047 	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
   6048 
   6049 	ctxt->input->base = ctxt->input->buf->buffer->content + base;
   6050 	ctxt->input->cur = ctxt->input->base + cur;
   6051 	ctxt->input->end =
   6052 	    &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
   6053 #ifdef DEBUG_PUSH
   6054 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
   6055 #endif
   6056     }
   6057     ctxt->progressive = 1;
   6058 
   6059     return(ctxt);
   6060 }
   6061 #endif /* LIBXML_PUSH_ENABLED */
   6062 
   6063 /**
   6064  * htmlSAXParseDoc:
   6065  * @cur:  a pointer to an array of xmlChar
   6066  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   6067  * @sax:  the SAX handler block
   6068  * @userData: if using SAX, this pointer will be provided on callbacks.
   6069  *
   6070  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
   6071  * to handle parse events. If sax is NULL, fallback to the default DOM
   6072  * behavior and return a tree.
   6073  *
   6074  * Returns the resulting document tree unless SAX is NULL or the document is
   6075  *     not well formed.
   6076  */
   6077 
   6078 htmlDocPtr
   6079 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
   6080     htmlDocPtr ret;
   6081     htmlParserCtxtPtr ctxt;
   6082 
   6083     xmlInitParser();
   6084 
   6085     if (cur == NULL) return(NULL);
   6086 
   6087 
   6088     ctxt = htmlCreateDocParserCtxt(cur, encoding);
   6089     if (ctxt == NULL) return(NULL);
   6090     if (sax != NULL) {
   6091         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
   6092         ctxt->sax = sax;
   6093         ctxt->userData = userData;
   6094     }
   6095 
   6096     htmlParseDocument(ctxt);
   6097     ret = ctxt->myDoc;
   6098     if (sax != NULL) {
   6099 	ctxt->sax = NULL;
   6100 	ctxt->userData = NULL;
   6101     }
   6102     htmlFreeParserCtxt(ctxt);
   6103 
   6104     return(ret);
   6105 }
   6106 
   6107 /**
   6108  * htmlParseDoc:
   6109  * @cur:  a pointer to an array of xmlChar
   6110  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   6111  *
   6112  * parse an HTML in-memory document and build a tree.
   6113  *
   6114  * Returns the resulting document tree
   6115  */
   6116 
   6117 htmlDocPtr
   6118 htmlParseDoc(xmlChar *cur, const char *encoding) {
   6119     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
   6120 }
   6121 
   6122 
   6123 /**
   6124  * htmlCreateFileParserCtxt:
   6125  * @filename:  the filename
   6126  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   6127  *
   6128  * Create a parser context for a file content.
   6129  * Automatic support for ZLIB/Compress compressed document is provided
   6130  * by default if found at compile-time.
   6131  *
   6132  * Returns the new parser context or NULL
   6133  */
   6134 htmlParserCtxtPtr
   6135 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
   6136 {
   6137     htmlParserCtxtPtr ctxt;
   6138     htmlParserInputPtr inputStream;
   6139     char *canonicFilename;
   6140     /* htmlCharEncoding enc; */
   6141     xmlChar *content, *content_line = (xmlChar *) "charset=";
   6142 
   6143     if (filename == NULL)
   6144         return(NULL);
   6145 
   6146     ctxt = htmlNewParserCtxt();
   6147     if (ctxt == NULL) {
   6148 	return(NULL);
   6149     }
   6150     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
   6151     if (canonicFilename == NULL) {
   6152 #ifdef LIBXML_SAX1_ENABLED
   6153 	if (xmlDefaultSAXHandler.error != NULL) {
   6154 	    xmlDefaultSAXHandler.error(NULL, "out of memory\n");
   6155 	}
   6156 #endif
   6157 	xmlFreeParserCtxt(ctxt);
   6158 	return(NULL);
   6159     }
   6160 
   6161     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
   6162     xmlFree(canonicFilename);
   6163     if (inputStream == NULL) {
   6164 	xmlFreeParserCtxt(ctxt);
   6165 	return(NULL);
   6166     }
   6167 
   6168     inputPush(ctxt, inputStream);
   6169 
   6170     /* set encoding */
   6171     if (encoding) {
   6172         content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
   6173 	if (content) {
   6174 	    strcpy ((char *)content, (char *)content_line);
   6175             strcat ((char *)content, (char *)encoding);
   6176             htmlCheckEncoding (ctxt, content);
   6177 	    xmlFree (content);
   6178 	}
   6179     }
   6180 
   6181     return(ctxt);
   6182 }
   6183 
   6184 /**
   6185  * htmlSAXParseFile:
   6186  * @filename:  the filename
   6187  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   6188  * @sax:  the SAX handler block
   6189  * @userData: if using SAX, this pointer will be provided on callbacks.
   6190  *
   6191  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
   6192  * compressed document is provided by default if found at compile-time.
   6193  * It use the given SAX function block to handle the parsing callback.
   6194  * If sax is NULL, fallback to the default DOM tree building routines.
   6195  *
   6196  * Returns the resulting document tree unless SAX is NULL or the document is
   6197  *     not well formed.
   6198  */
   6199 
   6200 htmlDocPtr
   6201 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
   6202                  void *userData) {
   6203     htmlDocPtr ret;
   6204     htmlParserCtxtPtr ctxt;
   6205     htmlSAXHandlerPtr oldsax = NULL;
   6206 
   6207     xmlInitParser();
   6208 
   6209     ctxt = htmlCreateFileParserCtxt(filename, encoding);
   6210     if (ctxt == NULL) return(NULL);
   6211     if (sax != NULL) {
   6212 	oldsax = ctxt->sax;
   6213         ctxt->sax = sax;
   6214         ctxt->userData = userData;
   6215     }
   6216 
   6217     htmlParseDocument(ctxt);
   6218 
   6219     ret = ctxt->myDoc;
   6220     if (sax != NULL) {
   6221         ctxt->sax = oldsax;
   6222         ctxt->userData = NULL;
   6223     }
   6224     htmlFreeParserCtxt(ctxt);
   6225 
   6226     return(ret);
   6227 }
   6228 
   6229 /**
   6230  * htmlParseFile:
   6231  * @filename:  the filename
   6232  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   6233  *
   6234  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
   6235  * compressed document is provided by default if found at compile-time.
   6236  *
   6237  * Returns the resulting document tree
   6238  */
   6239 
   6240 htmlDocPtr
   6241 htmlParseFile(const char *filename, const char *encoding) {
   6242     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
   6243 }
   6244 
   6245 /**
   6246  * htmlHandleOmittedElem:
   6247  * @val:  int 0 or 1
   6248  *
   6249  * Set and return the previous value for handling HTML omitted tags.
   6250  *
   6251  * Returns the last value for 0 for no handling, 1 for auto insertion.
   6252  */
   6253 
   6254 int
   6255 htmlHandleOmittedElem(int val) {
   6256     int old = htmlOmittedDefaultValue;
   6257 
   6258     htmlOmittedDefaultValue = val;
   6259     return(old);
   6260 }
   6261 
   6262 /**
   6263  * htmlElementAllowedHere:
   6264  * @parent: HTML parent element
   6265  * @elt: HTML element
   6266  *
   6267  * Checks whether an HTML element may be a direct child of a parent element.
   6268  * Note - doesn't check for deprecated elements
   6269  *
   6270  * Returns 1 if allowed; 0 otherwise.
   6271  */
   6272 int
   6273 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
   6274   const char** p ;
   6275 
   6276   if ( ! elt || ! parent || ! parent->subelts )
   6277 	return 0 ;
   6278 
   6279   for ( p = parent->subelts; *p; ++p )
   6280     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
   6281       return 1 ;
   6282 
   6283   return 0 ;
   6284 }
   6285 /**
   6286  * htmlElementStatusHere:
   6287  * @parent: HTML parent element
   6288  * @elt: HTML element
   6289  *
   6290  * Checks whether an HTML element may be a direct child of a parent element.
   6291  * and if so whether it is valid or deprecated.
   6292  *
   6293  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
   6294  */
   6295 htmlStatus
   6296 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
   6297   if ( ! parent || ! elt )
   6298     return HTML_INVALID ;
   6299   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
   6300     return HTML_INVALID ;
   6301 
   6302   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
   6303 }
   6304 /**
   6305  * htmlAttrAllowed:
   6306  * @elt: HTML element
   6307  * @attr: HTML attribute
   6308  * @legacy: whether to allow deprecated attributes
   6309  *
   6310  * Checks whether an attribute is valid for an element
   6311  * Has full knowledge of Required and Deprecated attributes
   6312  *
   6313  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
   6314  */
   6315 htmlStatus
   6316 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
   6317   const char** p ;
   6318 
   6319   if ( !elt || ! attr )
   6320 	return HTML_INVALID ;
   6321 
   6322   if ( elt->attrs_req )
   6323     for ( p = elt->attrs_req; *p; ++p)
   6324       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
   6325         return HTML_REQUIRED ;
   6326 
   6327   if ( elt->attrs_opt )
   6328     for ( p = elt->attrs_opt; *p; ++p)
   6329       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
   6330         return HTML_VALID ;
   6331 
   6332   if ( legacy && elt->attrs_depr )
   6333     for ( p = elt->attrs_depr; *p; ++p)
   6334       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
   6335         return HTML_DEPRECATED ;
   6336 
   6337   return HTML_INVALID ;
   6338 }
   6339 /**
   6340  * htmlNodeStatus:
   6341  * @node: an htmlNodePtr in a tree
   6342  * @legacy: whether to allow deprecated elements (YES is faster here
   6343  *	for Element nodes)
   6344  *
   6345  * Checks whether the tree node is valid.  Experimental (the author
   6346  *     only uses the HTML enhancements in a SAX parser)
   6347  *
   6348  * Return: for Element nodes, a return from htmlElementAllowedHere (if
   6349  *	legacy allowed) or htmlElementStatusHere (otherwise).
   6350  *	for Attribute nodes, a return from htmlAttrAllowed
   6351  *	for other nodes, HTML_NA (no checks performed)
   6352  */
   6353 htmlStatus
   6354 htmlNodeStatus(const htmlNodePtr node, int legacy) {
   6355   if ( ! node )
   6356     return HTML_INVALID ;
   6357 
   6358   switch ( node->type ) {
   6359     case XML_ELEMENT_NODE:
   6360       return legacy
   6361 	? ( htmlElementAllowedHere (
   6362 		htmlTagLookup(node->parent->name) , node->name
   6363 		) ? HTML_VALID : HTML_INVALID )
   6364 	: htmlElementStatusHere(
   6365 		htmlTagLookup(node->parent->name) ,
   6366 		htmlTagLookup(node->name) )
   6367 	;
   6368     case XML_ATTRIBUTE_NODE:
   6369       return htmlAttrAllowed(
   6370 	htmlTagLookup(node->parent->name) , node->name, legacy) ;
   6371     default: return HTML_NA ;
   6372   }
   6373 }
   6374 /************************************************************************
   6375  *									*
   6376  *	New set (2.6.0) of simpler and more flexible APIs		*
   6377  *									*
   6378  ************************************************************************/
   6379 /**
   6380  * DICT_FREE:
   6381  * @str:  a string
   6382  *
   6383  * Free a string if it is not owned by the "dict" dictionnary in the
   6384  * current scope
   6385  */
   6386 #define DICT_FREE(str)						\
   6387 	if ((str) && ((!dict) ||				\
   6388 	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
   6389 	    xmlFree((char *)(str));
   6390 
   6391 /**
   6392  * htmlCtxtReset:
   6393  * @ctxt: an HTML parser context
   6394  *
   6395  * Reset a parser context
   6396  */
   6397 void
   6398 htmlCtxtReset(htmlParserCtxtPtr ctxt)
   6399 {
   6400     xmlParserInputPtr input;
   6401     xmlDictPtr dict;
   6402 
   6403     if (ctxt == NULL)
   6404         return;
   6405 
   6406     xmlInitParser();
   6407     dict = ctxt->dict;
   6408 
   6409     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
   6410         xmlFreeInputStream(input);
   6411     }
   6412     ctxt->inputNr = 0;
   6413     ctxt->input = NULL;
   6414 
   6415     ctxt->spaceNr = 0;
   6416     if (ctxt->spaceTab != NULL) {
   6417 	ctxt->spaceTab[0] = -1;
   6418 	ctxt->space = &ctxt->spaceTab[0];
   6419     } else {
   6420 	ctxt->space = NULL;
   6421     }
   6422 
   6423 
   6424     ctxt->nodeNr = 0;
   6425     ctxt->node = NULL;
   6426 
   6427     ctxt->nameNr = 0;
   6428     ctxt->name = NULL;
   6429 
   6430     DICT_FREE(ctxt->version);
   6431     ctxt->version = NULL;
   6432     DICT_FREE(ctxt->encoding);
   6433     ctxt->encoding = NULL;
   6434     DICT_FREE(ctxt->directory);
   6435     ctxt->directory = NULL;
   6436     DICT_FREE(ctxt->extSubURI);
   6437     ctxt->extSubURI = NULL;
   6438     DICT_FREE(ctxt->extSubSystem);
   6439     ctxt->extSubSystem = NULL;
   6440     if (ctxt->myDoc != NULL)
   6441         xmlFreeDoc(ctxt->myDoc);
   6442     ctxt->myDoc = NULL;
   6443 
   6444     ctxt->standalone = -1;
   6445     ctxt->hasExternalSubset = 0;
   6446     ctxt->hasPErefs = 0;
   6447     ctxt->html = 1;
   6448     ctxt->external = 0;
   6449     ctxt->instate = XML_PARSER_START;
   6450     ctxt->token = 0;
   6451 
   6452     ctxt->wellFormed = 1;
   6453     ctxt->nsWellFormed = 1;
   6454     ctxt->valid = 1;
   6455     ctxt->vctxt.userData = ctxt;
   6456     ctxt->vctxt.error = xmlParserValidityError;
   6457     ctxt->vctxt.warning = xmlParserValidityWarning;
   6458     ctxt->record_info = 0;
   6459     ctxt->nbChars = 0;
   6460     ctxt->checkIndex = 0;
   6461     ctxt->inSubset = 0;
   6462     ctxt->errNo = XML_ERR_OK;
   6463     ctxt->depth = 0;
   6464     ctxt->charset = XML_CHAR_ENCODING_NONE;
   6465     ctxt->catalogs = NULL;
   6466     xmlInitNodeInfoSeq(&ctxt->node_seq);
   6467 
   6468     if (ctxt->attsDefault != NULL) {
   6469         xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
   6470         ctxt->attsDefault = NULL;
   6471     }
   6472     if (ctxt->attsSpecial != NULL) {
   6473         xmlHashFree(ctxt->attsSpecial, NULL);
   6474         ctxt->attsSpecial = NULL;
   6475     }
   6476 }
   6477 
   6478 /**
   6479  * htmlCtxtUseOptions:
   6480  * @ctxt: an HTML parser context
   6481  * @options:  a combination of htmlParserOption(s)
   6482  *
   6483  * Applies the options to the parser context
   6484  *
   6485  * Returns 0 in case of success, the set of unknown or unimplemented options
   6486  *         in case of error.
   6487  */
   6488 int
   6489 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
   6490 {
   6491     if (ctxt == NULL)
   6492         return(-1);
   6493 
   6494     if (options & HTML_PARSE_NOWARNING) {
   6495         ctxt->sax->warning = NULL;
   6496         ctxt->vctxt.warning = NULL;
   6497         options -= XML_PARSE_NOWARNING;
   6498 	ctxt->options |= XML_PARSE_NOWARNING;
   6499     }
   6500     if (options & HTML_PARSE_NOERROR) {
   6501         ctxt->sax->error = NULL;
   6502         ctxt->vctxt.error = NULL;
   6503         ctxt->sax->fatalError = NULL;
   6504         options -= XML_PARSE_NOERROR;
   6505 	ctxt->options |= XML_PARSE_NOERROR;
   6506     }
   6507     if (options & HTML_PARSE_PEDANTIC) {
   6508         ctxt->pedantic = 1;
   6509         options -= XML_PARSE_PEDANTIC;
   6510 	ctxt->options |= XML_PARSE_PEDANTIC;
   6511     } else
   6512         ctxt->pedantic = 0;
   6513     if (options & XML_PARSE_NOBLANKS) {
   6514         ctxt->keepBlanks = 0;
   6515         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
   6516         options -= XML_PARSE_NOBLANKS;
   6517 	ctxt->options |= XML_PARSE_NOBLANKS;
   6518     } else
   6519         ctxt->keepBlanks = 1;
   6520     if (options & HTML_PARSE_RECOVER) {
   6521         ctxt->recovery = 1;
   6522 	options -= HTML_PARSE_RECOVER;
   6523     } else
   6524         ctxt->recovery = 0;
   6525     if (options & HTML_PARSE_COMPACT) {
   6526 	ctxt->options |= HTML_PARSE_COMPACT;
   6527         options -= HTML_PARSE_COMPACT;
   6528     }
   6529     if (options & XML_PARSE_HUGE) {
   6530 	ctxt->options |= XML_PARSE_HUGE;
   6531         options -= XML_PARSE_HUGE;
   6532     }
   6533     ctxt->dictNames = 0;
   6534     return (options);
   6535 }
   6536 
   6537 /**
   6538  * htmlDoRead:
   6539  * @ctxt:  an HTML parser context
   6540  * @URL:  the base URL to use for the document
   6541  * @encoding:  the document encoding, or NULL
   6542  * @options:  a combination of htmlParserOption(s)
   6543  * @reuse:  keep the context for reuse
   6544  *
   6545  * Common front-end for the htmlRead functions
   6546  *
   6547  * Returns the resulting document tree or NULL
   6548  */
   6549 static htmlDocPtr
   6550 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
   6551           int options, int reuse)
   6552 {
   6553     htmlDocPtr ret;
   6554 
   6555     htmlCtxtUseOptions(ctxt, options);
   6556     ctxt->html = 1;
   6557     if (encoding != NULL) {
   6558         xmlCharEncodingHandlerPtr hdlr;
   6559 
   6560 	hdlr = xmlFindCharEncodingHandler(encoding);
   6561 	if (hdlr != NULL) {
   6562 	    xmlSwitchToEncoding(ctxt, hdlr);
   6563 	    if (ctxt->input->encoding != NULL)
   6564 	      xmlFree((xmlChar *) ctxt->input->encoding);
   6565             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
   6566         }
   6567     }
   6568     if ((URL != NULL) && (ctxt->input != NULL) &&
   6569         (ctxt->input->filename == NULL))
   6570         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
   6571     htmlParseDocument(ctxt);
   6572     ret = ctxt->myDoc;
   6573     ctxt->myDoc = NULL;
   6574     if (!reuse) {
   6575         if ((ctxt->dictNames) &&
   6576 	    (ret != NULL) &&
   6577 	    (ret->dict == ctxt->dict))
   6578 	    ctxt->dict = NULL;
   6579 	xmlFreeParserCtxt(ctxt);
   6580     }
   6581     return (ret);
   6582 }
   6583 
   6584 /**
   6585  * htmlReadDoc:
   6586  * @cur:  a pointer to a zero terminated string
   6587  * @URL:  the base URL to use for the document
   6588  * @encoding:  the document encoding, or NULL
   6589  * @options:  a combination of htmlParserOption(s)
   6590  *
   6591  * parse an XML in-memory document and build a tree.
   6592  *
   6593  * Returns the resulting document tree
   6594  */
   6595 htmlDocPtr
   6596 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
   6597 {
   6598     htmlParserCtxtPtr ctxt;
   6599 
   6600     if (cur == NULL)
   6601         return (NULL);
   6602 
   6603     xmlInitParser();
   6604     ctxt = htmlCreateDocParserCtxt(cur, NULL);
   6605     if (ctxt == NULL)
   6606         return (NULL);
   6607     return (htmlDoRead(ctxt, URL, encoding, options, 0));
   6608 }
   6609 
   6610 /**
   6611  * htmlReadFile:
   6612  * @filename:  a file or URL
   6613  * @encoding:  the document encoding, or NULL
   6614  * @options:  a combination of htmlParserOption(s)
   6615  *
   6616  * parse an XML file from the filesystem or the network.
   6617  *
   6618  * Returns the resulting document tree
   6619  */
   6620 htmlDocPtr
   6621 htmlReadFile(const char *filename, const char *encoding, int options)
   6622 {
   6623     htmlParserCtxtPtr ctxt;
   6624 
   6625     xmlInitParser();
   6626     ctxt = htmlCreateFileParserCtxt(filename, encoding);
   6627     if (ctxt == NULL)
   6628         return (NULL);
   6629     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
   6630 }
   6631 
   6632 /**
   6633  * htmlReadMemory:
   6634  * @buffer:  a pointer to a char array
   6635  * @size:  the size of the array
   6636  * @URL:  the base URL to use for the document
   6637  * @encoding:  the document encoding, or NULL
   6638  * @options:  a combination of htmlParserOption(s)
   6639  *
   6640  * parse an XML in-memory document and build a tree.
   6641  *
   6642  * Returns the resulting document tree
   6643  */
   6644 htmlDocPtr
   6645 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
   6646 {
   6647     htmlParserCtxtPtr ctxt;
   6648 
   6649     xmlInitParser();
   6650     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
   6651     if (ctxt == NULL)
   6652         return (NULL);
   6653     htmlDefaultSAXHandlerInit();
   6654     if (ctxt->sax != NULL)
   6655         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
   6656     return (htmlDoRead(ctxt, URL, encoding, options, 0));
   6657 }
   6658 
   6659 /**
   6660  * htmlReadFd:
   6661  * @fd:  an open file descriptor
   6662  * @URL:  the base URL to use for the document
   6663  * @encoding:  the document encoding, or NULL
   6664  * @options:  a combination of htmlParserOption(s)
   6665  *
   6666  * parse an XML from a file descriptor and build a tree.
   6667  *
   6668  * Returns the resulting document tree
   6669  */
   6670 htmlDocPtr
   6671 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
   6672 {
   6673     htmlParserCtxtPtr ctxt;
   6674     xmlParserInputBufferPtr input;
   6675     xmlParserInputPtr stream;
   6676 
   6677     if (fd < 0)
   6678         return (NULL);
   6679 
   6680     xmlInitParser();
   6681     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
   6682     if (input == NULL)
   6683         return (NULL);
   6684     ctxt = xmlNewParserCtxt();
   6685     if (ctxt == NULL) {
   6686         xmlFreeParserInputBuffer(input);
   6687         return (NULL);
   6688     }
   6689     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
   6690     if (stream == NULL) {
   6691         xmlFreeParserInputBuffer(input);
   6692 	xmlFreeParserCtxt(ctxt);
   6693         return (NULL);
   6694     }
   6695     inputPush(ctxt, stream);
   6696     return (htmlDoRead(ctxt, URL, encoding, options, 0));
   6697 }
   6698 
   6699 /**
   6700  * htmlReadIO:
   6701  * @ioread:  an I/O read function
   6702  * @ioclose:  an I/O close function
   6703  * @ioctx:  an I/O handler
   6704  * @URL:  the base URL to use for the document
   6705  * @encoding:  the document encoding, or NULL
   6706  * @options:  a combination of htmlParserOption(s)
   6707  *
   6708  * parse an HTML document from I/O functions and source and build a tree.
   6709  *
   6710  * Returns the resulting document tree
   6711  */
   6712 htmlDocPtr
   6713 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
   6714           void *ioctx, const char *URL, const char *encoding, int options)
   6715 {
   6716     htmlParserCtxtPtr ctxt;
   6717     xmlParserInputBufferPtr input;
   6718     xmlParserInputPtr stream;
   6719 
   6720     if (ioread == NULL)
   6721         return (NULL);
   6722     xmlInitParser();
   6723 
   6724     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
   6725                                          XML_CHAR_ENCODING_NONE);
   6726     if (input == NULL)
   6727         return (NULL);
   6728     ctxt = htmlNewParserCtxt();
   6729     if (ctxt == NULL) {
   6730         xmlFreeParserInputBuffer(input);
   6731         return (NULL);
   6732     }
   6733     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
   6734     if (stream == NULL) {
   6735         xmlFreeParserInputBuffer(input);
   6736 	xmlFreeParserCtxt(ctxt);
   6737         return (NULL);
   6738     }
   6739     inputPush(ctxt, stream);
   6740     return (htmlDoRead(ctxt, URL, encoding, options, 0));
   6741 }
   6742 
   6743 /**
   6744  * htmlCtxtReadDoc:
   6745  * @ctxt:  an HTML parser context
   6746  * @cur:  a pointer to a zero terminated string
   6747  * @URL:  the base URL to use for the document
   6748  * @encoding:  the document encoding, or NULL
   6749  * @options:  a combination of htmlParserOption(s)
   6750  *
   6751  * parse an XML in-memory document and build a tree.
   6752  * This reuses the existing @ctxt parser context
   6753  *
   6754  * Returns the resulting document tree
   6755  */
   6756 htmlDocPtr
   6757 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
   6758                const char *URL, const char *encoding, int options)
   6759 {
   6760     xmlParserInputPtr stream;
   6761 
   6762     if (cur == NULL)
   6763         return (NULL);
   6764     if (ctxt == NULL)
   6765         return (NULL);
   6766 
   6767     htmlCtxtReset(ctxt);
   6768 
   6769     stream = xmlNewStringInputStream(ctxt, cur);
   6770     if (stream == NULL) {
   6771         return (NULL);
   6772     }
   6773     inputPush(ctxt, stream);
   6774     return (htmlDoRead(ctxt, URL, encoding, options, 1));
   6775 }
   6776 
   6777 /**
   6778  * htmlCtxtReadFile:
   6779  * @ctxt:  an HTML parser context
   6780  * @filename:  a file or URL
   6781  * @encoding:  the document encoding, or NULL
   6782  * @options:  a combination of htmlParserOption(s)
   6783  *
   6784  * parse an XML file from the filesystem or the network.
   6785  * This reuses the existing @ctxt parser context
   6786  *
   6787  * Returns the resulting document tree
   6788  */
   6789 htmlDocPtr
   6790 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
   6791                 const char *encoding, int options)
   6792 {
   6793     xmlParserInputPtr stream;
   6794 
   6795     if (filename == NULL)
   6796         return (NULL);
   6797     if (ctxt == NULL)
   6798         return (NULL);
   6799 
   6800     htmlCtxtReset(ctxt);
   6801 
   6802     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
   6803     if (stream == NULL) {
   6804         return (NULL);
   6805     }
   6806     inputPush(ctxt, stream);
   6807     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
   6808 }
   6809 
   6810 /**
   6811  * htmlCtxtReadMemory:
   6812  * @ctxt:  an HTML parser context
   6813  * @buffer:  a pointer to a char array
   6814  * @size:  the size of the array
   6815  * @URL:  the base URL to use for the document
   6816  * @encoding:  the document encoding, or NULL
   6817  * @options:  a combination of htmlParserOption(s)
   6818  *
   6819  * parse an XML in-memory document and build a tree.
   6820  * This reuses the existing @ctxt parser context
   6821  *
   6822  * Returns the resulting document tree
   6823  */
   6824 htmlDocPtr
   6825 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
   6826                   const char *URL, const char *encoding, int options)
   6827 {
   6828     xmlParserInputBufferPtr input;
   6829     xmlParserInputPtr stream;
   6830 
   6831     if (ctxt == NULL)
   6832         return (NULL);
   6833     if (buffer == NULL)
   6834         return (NULL);
   6835 
   6836     htmlCtxtReset(ctxt);
   6837 
   6838     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
   6839     if (input == NULL) {
   6840 	return(NULL);
   6841     }
   6842 
   6843     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
   6844     if (stream == NULL) {
   6845 	xmlFreeParserInputBuffer(input);
   6846 	return(NULL);
   6847     }
   6848 
   6849     inputPush(ctxt, stream);
   6850     return (htmlDoRead(ctxt, URL, encoding, options, 1));
   6851 }
   6852 
   6853 /**
   6854  * htmlCtxtReadFd:
   6855  * @ctxt:  an HTML parser context
   6856  * @fd:  an open file descriptor
   6857  * @URL:  the base URL to use for the document
   6858  * @encoding:  the document encoding, or NULL
   6859  * @options:  a combination of htmlParserOption(s)
   6860  *
   6861  * parse an XML from a file descriptor and build a tree.
   6862  * This reuses the existing @ctxt parser context
   6863  *
   6864  * Returns the resulting document tree
   6865  */
   6866 htmlDocPtr
   6867 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
   6868               const char *URL, const char *encoding, int options)
   6869 {
   6870     xmlParserInputBufferPtr input;
   6871     xmlParserInputPtr stream;
   6872 
   6873     if (fd < 0)
   6874         return (NULL);
   6875     if (ctxt == NULL)
   6876         return (NULL);
   6877 
   6878     htmlCtxtReset(ctxt);
   6879 
   6880 
   6881     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
   6882     if (input == NULL)
   6883         return (NULL);
   6884     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
   6885     if (stream == NULL) {
   6886         xmlFreeParserInputBuffer(input);
   6887         return (NULL);
   6888     }
   6889     inputPush(ctxt, stream);
   6890     return (htmlDoRead(ctxt, URL, encoding, options, 1));
   6891 }
   6892 
   6893 /**
   6894  * htmlCtxtReadIO:
   6895  * @ctxt:  an HTML parser context
   6896  * @ioread:  an I/O read function
   6897  * @ioclose:  an I/O close function
   6898  * @ioctx:  an I/O handler
   6899  * @URL:  the base URL to use for the document
   6900  * @encoding:  the document encoding, or NULL
   6901  * @options:  a combination of htmlParserOption(s)
   6902  *
   6903  * parse an HTML document from I/O functions and source and build a tree.
   6904  * This reuses the existing @ctxt parser context
   6905  *
   6906  * Returns the resulting document tree
   6907  */
   6908 htmlDocPtr
   6909 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
   6910               xmlInputCloseCallback ioclose, void *ioctx,
   6911 	      const char *URL,
   6912               const char *encoding, int options)
   6913 {
   6914     xmlParserInputBufferPtr input;
   6915     xmlParserInputPtr stream;
   6916 
   6917     if (ioread == NULL)
   6918         return (NULL);
   6919     if (ctxt == NULL)
   6920         return (NULL);
   6921 
   6922     htmlCtxtReset(ctxt);
   6923 
   6924     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
   6925                                          XML_CHAR_ENCODING_NONE);
   6926     if (input == NULL)
   6927         return (NULL);
   6928     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
   6929     if (stream == NULL) {
   6930         xmlFreeParserInputBuffer(input);
   6931         return (NULL);
   6932     }
   6933     inputPush(ctxt, stream);
   6934     return (htmlDoRead(ctxt, URL, encoding, options, 1));
   6935 }
   6936 
   6937 #define bottom_HTMLparser
   6938 #include "elfgcchack.h"
   6939 #endif /* LIBXML_HTML_ENABLED */
   6940