Home | History | Annotate | Download | only in libxml2
      1 /*
      2  * HTMLparser.c : an HTML 4.0 non-verifying parser
      3  *
      4  * See Copyright for the status of this software.
      5  *
      6  * daniel (at) veillard.com
      7  */
      8 
      9 #define IN_LIBXML
     10 #include "libxml.h"
     11 #ifdef LIBXML_HTML_ENABLED
     12 
     13 #include <string.h>
     14 #ifdef HAVE_CTYPE_H
     15 #include <ctype.h>
     16 #endif
     17 #ifdef HAVE_STDLIB_H
     18 #include <stdlib.h>
     19 #endif
     20 #ifdef HAVE_SYS_STAT_H
     21 #include <sys/stat.h>
     22 #endif
     23 #ifdef HAVE_FCNTL_H
     24 #include <fcntl.h>
     25 #endif
     26 #ifdef HAVE_UNISTD_H
     27 #include <unistd.h>
     28 #endif
     29 #ifdef HAVE_ZLIB_H
     30 #include <zlib.h>
     31 #endif
     32 
     33 #include <libxml/xmlmemory.h>
     34 #include <libxml/tree.h>
     35 #include <libxml/parser.h>
     36 #include <libxml/parserInternals.h>
     37 #include <libxml/xmlerror.h>
     38 #include <libxml/HTMLparser.h>
     39 #include <libxml/HTMLtree.h>
     40 #include <libxml/entities.h>
     41 #include <libxml/encoding.h>
     42 #include <libxml/valid.h>
     43 #include <libxml/xmlIO.h>
     44 #include <libxml/globals.h>
     45 #include <libxml/uri.h>
     46 
     47 #define HTML_MAX_NAMELEN 1000
     48 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
     49 #define HTML_PARSER_BUFFER_SIZE 100
     50 
     51 /* #define DEBUG */
     52 /* #define DEBUG_PUSH */
     53 
     54 static int htmlOmittedDefaultValue = 1;
     55 
     56 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
     57 			     xmlChar end, xmlChar  end2, xmlChar end3);
     58 static void htmlParseComment(htmlParserCtxtPtr ctxt);
     59 
     60 /************************************************************************
     61  *									*
     62  *		Some factorized error routines				*
     63  *									*
     64  ************************************************************************/
     65 
     66 /**
     67  * htmlErrMemory:
     68  * @ctxt:  an HTML parser context
     69  * @extra:  extra informations
     70  *
     71  * Handle a redefinition of attribute error
     72  */
     73 static void
     74 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
     75 {
     76     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
     77         (ctxt->instate == XML_PARSER_EOF))
     78 	return;
     79     if (ctxt != NULL) {
     80         ctxt->errNo = XML_ERR_NO_MEMORY;
     81         ctxt->instate = XML_PARSER_EOF;
     82         ctxt->disableSAX = 1;
     83     }
     84     if (extra)
     85         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
     86                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
     87                         NULL, NULL, 0, 0,
     88                         "Memory allocation failed : %s\n", extra);
     89     else
     90         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
     91                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
     92                         NULL, NULL, 0, 0, "Memory allocation failed\n");
     93 }
     94 
     95 /**
     96  * htmlParseErr:
     97  * @ctxt:  an HTML parser context
     98  * @error:  the error number
     99  * @msg:  the error message
    100  * @str1:  string infor
    101  * @str2:  string infor
    102  *
    103  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
    104  */
    105 static void
    106 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
    107              const char *msg, const xmlChar *str1, const xmlChar *str2)
    108 {
    109     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
    110         (ctxt->instate == XML_PARSER_EOF))
    111 	return;
    112     if (ctxt != NULL)
    113 	ctxt->errNo = error;
    114     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
    115                     XML_ERR_ERROR, NULL, 0,
    116 		    (const char *) str1, (const char *) str2,
    117 		    NULL, 0, 0,
    118 		    msg, str1, str2);
    119     if (ctxt != NULL)
    120 	ctxt->wellFormed = 0;
    121 }
    122 
    123 /**
    124  * htmlParseErrInt:
    125  * @ctxt:  an HTML parser context
    126  * @error:  the error number
    127  * @msg:  the error message
    128  * @val:  integer info
    129  *
    130  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
    131  */
    132 static void
    133 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
    134              const char *msg, int val)
    135 {
    136     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
    137         (ctxt->instate == XML_PARSER_EOF))
    138 	return;
    139     if (ctxt != NULL)
    140 	ctxt->errNo = error;
    141     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
    142                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
    143 		    NULL, val, 0, msg, val);
    144     if (ctxt != NULL)
    145 	ctxt->wellFormed = 0;
    146 }
    147 
    148 /************************************************************************
    149  *									*
    150  *	Parser stacks related functions and macros		*
    151  *									*
    152  ************************************************************************/
    153 
    154 /**
    155  * htmlnamePush:
    156  * @ctxt:  an HTML parser context
    157  * @value:  the element name
    158  *
    159  * Pushes a new element name on top of the name stack
    160  *
    161  * Returns 0 in case of error, the index in the stack otherwise
    162  */
    163 static int
    164 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
    165 {
    166     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
    167         ctxt->html = 3;
    168     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
    169         ctxt->html = 10;
    170     if (ctxt->nameNr >= ctxt->nameMax) {
    171         ctxt->nameMax *= 2;
    172         ctxt->nameTab = (const xmlChar * *)
    173                          xmlRealloc((xmlChar * *)ctxt->nameTab,
    174                                     ctxt->nameMax *
    175                                     sizeof(ctxt->nameTab[0]));
    176         if (ctxt->nameTab == NULL) {
    177             htmlErrMemory(ctxt, NULL);
    178             return (0);
    179         }
    180     }
    181     ctxt->nameTab[ctxt->nameNr] = value;
    182     ctxt->name = value;
    183     return (ctxt->nameNr++);
    184 }
    185 /**
    186  * htmlnamePop:
    187  * @ctxt: an HTML parser context
    188  *
    189  * Pops the top element name from the name stack
    190  *
    191  * Returns the name just removed
    192  */
    193 static const xmlChar *
    194 htmlnamePop(htmlParserCtxtPtr ctxt)
    195 {
    196     const xmlChar *ret;
    197 
    198     if (ctxt->nameNr <= 0)
    199         return (NULL);
    200     ctxt->nameNr--;
    201     if (ctxt->nameNr < 0)
    202         return (NULL);
    203     if (ctxt->nameNr > 0)
    204         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
    205     else
    206         ctxt->name = NULL;
    207     ret = ctxt->nameTab[ctxt->nameNr];
    208     ctxt->nameTab[ctxt->nameNr] = NULL;
    209     return (ret);
    210 }
    211 
    212 /**
    213  * htmlNodeInfoPush:
    214  * @ctxt:  an HTML parser context
    215  * @value:  the node info
    216  *
    217  * Pushes a new element name on top of the node info stack
    218  *
    219  * Returns 0 in case of error, the index in the stack otherwise
    220  */
    221 static int
    222 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
    223 {
    224     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
    225         if (ctxt->nodeInfoMax == 0)
    226                 ctxt->nodeInfoMax = 5;
    227         ctxt->nodeInfoMax *= 2;
    228         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
    229                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
    230                                     ctxt->nodeInfoMax *
    231                                     sizeof(ctxt->nodeInfoTab[0]));
    232         if (ctxt->nodeInfoTab == NULL) {
    233             htmlErrMemory(ctxt, NULL);
    234             return (0);
    235         }
    236     }
    237     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
    238     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
    239     return (ctxt->nodeInfoNr++);
    240 }
    241 
    242 /**
    243  * htmlNodeInfoPop:
    244  * @ctxt:  an HTML parser context
    245  *
    246  * Pops the top element name from the node info stack
    247  *
    248  * Returns 0 in case of error, the pointer to NodeInfo otherwise
    249  */
    250 static htmlParserNodeInfo *
    251 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
    252 {
    253     if (ctxt->nodeInfoNr <= 0)
    254         return (NULL);
    255     ctxt->nodeInfoNr--;
    256     if (ctxt->nodeInfoNr < 0)
    257         return (NULL);
    258     if (ctxt->nodeInfoNr > 0)
    259         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
    260     else
    261         ctxt->nodeInfo = NULL;
    262     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
    263 }
    264 
    265 /*
    266  * Macros for accessing the content. Those should be used only by the parser,
    267  * and not exported.
    268  *
    269  * Dirty macros, i.e. one need to make assumption on the context to use them
    270  *
    271  *   CUR_PTR return the current pointer to the xmlChar to be parsed.
    272  *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
    273  *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
    274  *           in UNICODE mode. This should be used internally by the parser
    275  *           only to compare to ASCII values otherwise it would break when
    276  *           running with UTF-8 encoding.
    277  *   NXT(n)  returns the n'th next xmlChar. Same as CUR is should be used only
    278  *           to compare on ASCII based substring.
    279  *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
    280  *           it should be used only to compare on ASCII based substring.
    281  *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
    282  *           strings without newlines within the parser.
    283  *
    284  * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
    285  *
    286  *   CURRENT Returns the current char value, with the full decoding of
    287  *           UTF-8 if we are using this mode. It returns an int.
    288  *   NEXT    Skip to the next character, this does the proper decoding
    289  *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
    290  *   NEXTL(l) Skip the current unicode character of l xmlChars long.
    291  *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
    292  */
    293 
    294 #define UPPER (toupper(*ctxt->input->cur))
    295 
    296 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
    297 
    298 #define NXT(val) ctxt->input->cur[(val)]
    299 
    300 #define UPP(val) (toupper(ctxt->input->cur[(val)]))
    301 
    302 #define CUR_PTR ctxt->input->cur
    303 
    304 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
    305 		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
    306 	xmlParserInputShrink(ctxt->input)
    307 
    308 #define GROW if ((ctxt->progressive == 0) &&				\
    309 		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
    310 	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
    311 
    312 #define CURRENT ((int) (*ctxt->input->cur))
    313 
    314 #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
    315 
    316 /* Inported from XML */
    317 
    318 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
    319 #define CUR ((int) (*ctxt->input->cur))
    320 #define NEXT xmlNextChar(ctxt)
    321 
    322 #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
    323 
    324 
    325 #define NEXTL(l) do {							\
    326     if (*(ctxt->input->cur) == '\n') {					\
    327 	ctxt->input->line++; ctxt->input->col = 1;			\
    328     } else ctxt->input->col++;						\
    329     ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;		\
    330   } while (0)
    331 
    332 /************
    333     \
    334     if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    335     if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
    336  ************/
    337 
    338 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
    339 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
    340 
    341 #define COPY_BUF(l,b,i,v)						\
    342     if (l == 1) b[i++] = (xmlChar) v;					\
    343     else i += xmlCopyChar(l,&b[i],v)
    344 
    345 /**
    346  * htmlFindEncoding:
    347  * @the HTML parser context
    348  *
    349  * Ty to find and encoding in the current data available in the input
    350  * buffer this is needed to try to switch to the proper encoding when
    351  * one face a character error.
    352  * That's an heuristic, since it's operating outside of parsing it could
    353  * try to use a meta which had been commented out, that's the reason it
    354  * should only be used in case of error, not as a default.
    355  *
    356  * Returns an encoding string or NULL if not found, the string need to
    357  *   be freed
    358  */
    359 static xmlChar *
    360 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
    361     const xmlChar *start, *cur, *end;
    362 
    363     if ((ctxt == NULL) || (ctxt->input == NULL) ||
    364         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
    365         (ctxt->input->buf->encoder != NULL))
    366         return(NULL);
    367     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
    368         return(NULL);
    369 
    370     start = ctxt->input->cur;
    371     end = ctxt->input->end;
    372     /* we also expect the input buffer to be zero terminated */
    373     if (*end != 0)
    374         return(NULL);
    375 
    376     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
    377     if (cur == NULL)
    378         return(NULL);
    379     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
    380     if (cur == NULL)
    381         return(NULL);
    382     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
    383     if (cur == NULL)
    384         return(NULL);
    385     cur += 8;
    386     start = cur;
    387     while (((*cur >= 'A') && (*cur <= 'Z')) ||
    388            ((*cur >= 'a') && (*cur <= 'z')) ||
    389            ((*cur >= '0') && (*cur <= '9')) ||
    390            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
    391            cur++;
    392     if (cur == start)
    393         return(NULL);
    394     return(xmlStrndup(start, cur - start));
    395 }
    396 
    397 /**
    398  * htmlCurrentChar:
    399  * @ctxt:  the HTML parser context
    400  * @len:  pointer to the length of the char read
    401  *
    402  * The current char value, if using UTF-8 this may actually span multiple
    403  * bytes in the input buffer. Implement the end of line normalization:
    404  * 2.11 End-of-Line Handling
    405  * If the encoding is unspecified, in the case we find an ISO-Latin-1
    406  * char, then the encoding converter is plugged in automatically.
    407  *
    408  * Returns the current char value and its length
    409  */
    410 
    411 static int
    412 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
    413     if (ctxt->instate == XML_PARSER_EOF)
    414 	return(0);
    415 
    416     if (ctxt->token != 0) {
    417 	*len = 0;
    418 	return(ctxt->token);
    419     }
    420     if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
    421 	/*
    422 	 * We are supposed to handle UTF8, check it's valid
    423 	 * From rfc2044: encoding of the Unicode values on UTF-8:
    424 	 *
    425 	 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
    426 	 * 0000 0000-0000 007F   0xxxxxxx
    427 	 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
    428 	 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
    429 	 *
    430 	 * Check for the 0x110000 limit too
    431 	 */
    432 	const unsigned char *cur = ctxt->input->cur;
    433 	unsigned char c;
    434 	unsigned int val;
    435 
    436 	c = *cur;
    437 	if (c & 0x80) {
    438 	    if (cur[1] == 0) {
    439 		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
    440                 cur = ctxt->input->cur;
    441             }
    442 	    if ((cur[1] & 0xc0) != 0x80)
    443 		goto encoding_error;
    444 	    if ((c & 0xe0) == 0xe0) {
    445 
    446 		if (cur[2] == 0) {
    447 		    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
    448                     cur = ctxt->input->cur;
    449                 }
    450 		if ((cur[2] & 0xc0) != 0x80)
    451 		    goto encoding_error;
    452 		if ((c & 0xf0) == 0xf0) {
    453 		    if (cur[3] == 0) {
    454 			xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
    455                         cur = ctxt->input->cur;
    456                     }
    457 		    if (((c & 0xf8) != 0xf0) ||
    458 			((cur[3] & 0xc0) != 0x80))
    459 			goto encoding_error;
    460 		    /* 4-byte code */
    461 		    *len = 4;
    462 		    val = (cur[0] & 0x7) << 18;
    463 		    val |= (cur[1] & 0x3f) << 12;
    464 		    val |= (cur[2] & 0x3f) << 6;
    465 		    val |= cur[3] & 0x3f;
    466 		} else {
    467 		  /* 3-byte code */
    468 		    *len = 3;
    469 		    val = (cur[0] & 0xf) << 12;
    470 		    val |= (cur[1] & 0x3f) << 6;
    471 		    val |= cur[2] & 0x3f;
    472 		}
    473 	    } else {
    474 	      /* 2-byte code */
    475 		*len = 2;
    476 		val = (cur[0] & 0x1f) << 6;
    477 		val |= cur[1] & 0x3f;
    478 	    }
    479 	    if (!IS_CHAR(val)) {
    480 	        htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
    481 				"Char 0x%X out of allowed range\n", val);
    482 	    }
    483 	    return(val);
    484 	} else {
    485             if ((*ctxt->input->cur == 0) &&
    486                 (ctxt->input->cur < ctxt->input->end)) {
    487                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
    488 				"Char 0x%X out of allowed range\n", 0);
    489                 *len = 1;
    490                 return(' ');
    491             }
    492 	    /* 1-byte code */
    493 	    *len = 1;
    494 	    return((int) *ctxt->input->cur);
    495 	}
    496     }
    497     /*
    498      * Assume it's a fixed length encoding (1) with
    499      * a compatible encoding for the ASCII set, since
    500      * XML constructs only use < 128 chars
    501      */
    502     *len = 1;
    503     if ((int) *ctxt->input->cur < 0x80)
    504 	return((int) *ctxt->input->cur);
    505 
    506     /*
    507      * Humm this is bad, do an automatic flow conversion
    508      */
    509     {
    510         xmlChar * guess;
    511         xmlCharEncodingHandlerPtr handler;
    512 
    513         guess = htmlFindEncoding(ctxt);
    514         if (guess == NULL) {
    515             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
    516         } else {
    517             if (ctxt->input->encoding != NULL)
    518                 xmlFree((xmlChar *) ctxt->input->encoding);
    519             ctxt->input->encoding = guess;
    520             handler = xmlFindCharEncodingHandler((const char *) guess);
    521             if (handler != NULL) {
    522                 xmlSwitchToEncoding(ctxt, handler);
    523             } else {
    524                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
    525                              "Unsupported encoding %s", guess, NULL);
    526             }
    527         }
    528         ctxt->charset = XML_CHAR_ENCODING_UTF8;
    529     }
    530 
    531     return(xmlCurrentChar(ctxt, len));
    532 
    533 encoding_error:
    534     /*
    535      * If we detect an UTF8 error that probably mean that the
    536      * input encoding didn't get properly advertized in the
    537      * declaration header. Report the error and switch the encoding
    538      * to ISO-Latin-1 (if you don't like this policy, just declare the
    539      * encoding !)
    540      */
    541     {
    542         char buffer[150];
    543 
    544 	if (ctxt->input->end - ctxt->input->cur >= 4) {
    545 	    snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
    546 			    ctxt->input->cur[0], ctxt->input->cur[1],
    547 			    ctxt->input->cur[2], ctxt->input->cur[3]);
    548 	} else {
    549 	    snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
    550 	}
    551 	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
    552 		     "Input is not proper UTF-8, indicate encoding !\n",
    553 		     BAD_CAST buffer, NULL);
    554     }
    555 
    556     ctxt->charset = XML_CHAR_ENCODING_8859_1;
    557     *len = 1;
    558     return((int) *ctxt->input->cur);
    559 }
    560 
    561 /**
    562  * htmlSkipBlankChars:
    563  * @ctxt:  the HTML parser context
    564  *
    565  * skip all blanks character found at that point in the input streams.
    566  *
    567  * Returns the number of space chars skipped
    568  */
    569 
    570 static int
    571 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
    572     int res = 0;
    573 
    574     while (IS_BLANK_CH(*(ctxt->input->cur))) {
    575 	if ((*ctxt->input->cur == 0) &&
    576 	    (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
    577 		xmlPopInput(ctxt);
    578 	} else {
    579 	    if (*(ctxt->input->cur) == '\n') {
    580 		ctxt->input->line++; ctxt->input->col = 1;
    581 	    } else ctxt->input->col++;
    582 	    ctxt->input->cur++;
    583 	    ctxt->nbChars++;
    584 	    if (*ctxt->input->cur == 0)
    585 		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
    586 	}
    587 	res++;
    588     }
    589     return(res);
    590 }
    591 
    592 
    593 
    594 /************************************************************************
    595  *									*
    596  *	The list of HTML elements and their properties		*
    597  *									*
    598  ************************************************************************/
    599 
    600 /*
    601  *  Start Tag: 1 means the start tag can be ommited
    602  *  End Tag:   1 means the end tag can be ommited
    603  *             2 means it's forbidden (empty elements)
    604  *             3 means the tag is stylistic and should be closed easily
    605  *  Depr:      this element is deprecated
    606  *  DTD:       1 means that this element is valid only in the Loose DTD
    607  *             2 means that this element is valid only in the Frameset DTD
    608  *
    609  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
    610 	, subElements , impliedsubelt , Attributes, userdata
    611  */
    612 
    613 /* Definitions and a couple of vars for HTML Elements */
    614 
    615 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
    616 #define NB_FONTSTYLE 8
    617 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
    618 #define NB_PHRASE 10
    619 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
    620 #define NB_SPECIAL 16
    621 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
    622 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
    623 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
    624 #define NB_BLOCK NB_HEADING + NB_LIST + 14
    625 #define FORMCTRL "input", "select", "textarea", "label", "button"
    626 #define NB_FORMCTRL 5
    627 #define PCDATA
    628 #define NB_PCDATA 0
    629 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
    630 #define NB_HEADING 6
    631 #define LIST "ul", "ol", "dir", "menu"
    632 #define NB_LIST 4
    633 #define MODIFIER
    634 #define NB_MODIFIER 0
    635 #define FLOW BLOCK,INLINE
    636 #define NB_FLOW NB_BLOCK + NB_INLINE
    637 #define EMPTY NULL
    638 
    639 
    640 static const char* const html_flow[] = { FLOW, NULL } ;
    641 static const char* const html_inline[] = { INLINE, NULL } ;
    642 
    643 /* placeholders: elts with content but no subelements */
    644 static const char* const html_pcdata[] = { NULL } ;
    645 #define html_cdata html_pcdata
    646 
    647 
    648 /* ... and for HTML Attributes */
    649 
    650 #define COREATTRS "id", "class", "style", "title"
    651 #define NB_COREATTRS 4
    652 #define I18N "lang", "dir"
    653 #define NB_I18N 2
    654 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
    655 #define NB_EVENTS 9
    656 #define ATTRS COREATTRS,I18N,EVENTS
    657 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
    658 #define CELLHALIGN "align", "char", "charoff"
    659 #define NB_CELLHALIGN 3
    660 #define CELLVALIGN "valign"
    661 #define NB_CELLVALIGN 1
    662 
    663 static const char* const html_attrs[] = { ATTRS, NULL } ;
    664 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
    665 static const char* const core_attrs[] = { COREATTRS, NULL } ;
    666 static const char* const i18n_attrs[] = { I18N, NULL } ;
    667 
    668 
    669 /* Other declarations that should go inline ... */
    670 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
    671 	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
    672 	"tabindex", "onfocus", "onblur", NULL } ;
    673 static const char* const target_attr[] = { "target", NULL } ;
    674 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
    675 static const char* const alt_attr[] = { "alt", NULL } ;
    676 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
    677 static const char* const href_attrs[] = { "href", NULL } ;
    678 static const char* const clear_attrs[] = { "clear", NULL } ;
    679 static const char* const inline_p[] = { INLINE, "p", NULL } ;
    680 
    681 static const char* const flow_param[] = { FLOW, "param", NULL } ;
    682 static const char* const applet_attrs[] = { COREATTRS , "codebase",
    683 		"archive", "alt", "name", "height", "width", "align",
    684 		"hspace", "vspace", NULL } ;
    685 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
    686 	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
    687 static const char* const basefont_attrs[] =
    688 	{ "id", "size", "color", "face", NULL } ;
    689 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
    690 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
    691 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
    692 static const char* const body_depr[] = { "background", "bgcolor", "text",
    693 	"link", "vlink", "alink", NULL } ;
    694 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
    695 	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
    696 
    697 
    698 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
    699 static const char* const col_elt[] = { "col", NULL } ;
    700 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
    701 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
    702 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
    703 static const char* const compact_attr[] = { "compact", NULL } ;
    704 static const char* const label_attr[] = { "label", NULL } ;
    705 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
    706 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
    707 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
    708 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
    709 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
    710 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
    711 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
    712 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
    713 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
    714 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
    715 static const char* const version_attr[] = { "version", NULL } ;
    716 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
    717 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
    718 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
    719 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
    720 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
    721 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
    722 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
    723 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
    724 static const char* const align_attr[] = { "align", NULL } ;
    725 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
    726 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
    727 static const char* const name_attr[] = { "name", NULL } ;
    728 static const char* const action_attr[] = { "action", NULL } ;
    729 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
    730 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
    731 static const char* const content_attr[] = { "content", NULL } ;
    732 static const char* const type_attr[] = { "type", NULL } ;
    733 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
    734 static const char* const object_contents[] = { FLOW, "param", NULL } ;
    735 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
    736 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
    737 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
    738 static const char* const option_elt[] = { "option", NULL } ;
    739 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
    740 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
    741 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
    742 static const char* const width_attr[] = { "width", NULL } ;
    743 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
    744 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
    745 static const char* const language_attr[] = { "language", NULL } ;
    746 static const char* const select_content[] = { "optgroup", "option", NULL } ;
    747 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
    748 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
    749 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
    750 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
    751 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
    752 static const char* const tr_elt[] = { "tr", NULL } ;
    753 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
    754 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
    755 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
    756 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
    757 static const char* const tr_contents[] = { "th", "td", NULL } ;
    758 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
    759 static const char* const li_elt[] = { "li", NULL } ;
    760 static const char* const ul_depr[] = { "type", "compact", NULL} ;
    761 static const char* const dir_attr[] = { "dir", NULL} ;
    762 
    763 #define DECL (const char**)
    764 
    765 static const htmlElemDesc
    766 html40ElementTable[] = {
    767 { "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
    768 	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
    769 },
    770 { "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
    771 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    772 },
    773 { "acronym",	0, 0, 0, 0, 0, 0, 1, "",
    774 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    775 },
    776 { "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
    777 	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
    778 },
    779 { "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
    780 	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
    781 },
    782 { "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
    783 	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
    784 },
    785 { "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
    786 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    787 },
    788 { "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
    789 	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
    790 },
    791 { "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
    792 	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
    793 },
    794 { "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
    795 	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
    796 },
    797 { "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
    798 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    799 },
    800 { "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
    801 	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
    802 },
    803 { "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
    804 	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
    805 },
    806 { "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
    807 	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
    808 },
    809 { "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
    810 	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
    811 },
    812 { "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
    813 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    814 },
    815 { "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
    816 	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
    817 },
    818 { "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
    819 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    820 },
    821 { "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
    822 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    823 },
    824 { "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
    825 	EMPTY , NULL , DECL col_attrs , NULL, NULL
    826 },
    827 { "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
    828 	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
    829 },
    830 { "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
    831 	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
    832 },
    833 { "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
    834 	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
    835 },
    836 { "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
    837 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    838 },
    839 { "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
    840 	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
    841 },
    842 { "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
    843 	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
    844 },
    845 { "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
    846 	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
    847 },
    848 { "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
    849 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    850 },
    851 { "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
    852 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    853 },
    854 { "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
    855 	EMPTY, NULL, DECL embed_attrs, NULL, NULL
    856 },
    857 { "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
    858 	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
    859 },
    860 { "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
    861 	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
    862 },
    863 { "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
    864 	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
    865 },
    866 { "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
    867 	EMPTY, NULL, NULL, DECL frame_attrs, NULL
    868 },
    869 { "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
    870 	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
    871 },
    872 { "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
    873 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    874 },
    875 { "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
    876 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    877 },
    878 { "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
    879 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    880 },
    881 { "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
    882 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    883 },
    884 { "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
    885 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    886 },
    887 { "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
    888 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    889 },
    890 { "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
    891 	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
    892 },
    893 { "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
    894 	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
    895 },
    896 { "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
    897 	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
    898 },
    899 { "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
    900 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    901 },
    902 { "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
    903 	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
    904 },
    905 { "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
    906 	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
    907 },
    908 { "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
    909 	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
    910 },
    911 { "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
    912 	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
    913 },
    914 { "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
    915 	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
    916 },
    917 { "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
    918 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    919 },
    920 { "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
    921 	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
    922 },
    923 { "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
    924 	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
    925 },
    926 { "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
    927 	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
    928 },
    929 { "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
    930 	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
    931 },
    932 { "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
    933 	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
    934 },
    935 { "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
    936 	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
    937 },
    938 { "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
    939 	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
    940 },
    941 { "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
    942 	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
    943 },
    944 { "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
    945 	DECL html_flow, "div", DECL html_attrs, NULL, NULL
    946 },
    947 { "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
    948 	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
    949 },
    950 { "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
    951 	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
    952 },
    953 { "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
    954 	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
    955 },
    956 { "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
    957 	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
    958 },
    959 { "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
    960 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    961 },
    962 { "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
    963 	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
    964 },
    965 { "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
    966 	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
    967 },
    968 { "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
    969 	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
    970 },
    971 { "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
    972 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
    973 },
    974 { "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
    975 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    976 },
    977 { "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
    978 	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
    979 },
    980 { "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
    981 	DECL select_content, NULL, DECL select_attrs, NULL, NULL
    982 },
    983 { "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
    984 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    985 },
    986 { "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
    987 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    988 },
    989 { "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
    990 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
    991 },
    992 { "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
    993 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    994 },
    995 { "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
    996 	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
    997 },
    998 { "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
    999 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
   1000 },
   1001 { "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
   1002 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
   1003 },
   1004 { "table",	0, 0, 0, 0, 0, 0, 0, "",
   1005 	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
   1006 },
   1007 { "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
   1008 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
   1009 },
   1010 { "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
   1011 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
   1012 },
   1013 { "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
   1014 	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
   1015 },
   1016 { "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
   1017 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
   1018 },
   1019 { "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
   1020 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
   1021 },
   1022 { "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
   1023 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
   1024 },
   1025 { "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
   1026 	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
   1027 },
   1028 { "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
   1029 	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
   1030 },
   1031 { "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
   1032 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
   1033 },
   1034 { "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
   1035 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
   1036 },
   1037 { "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
   1038 	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
   1039 },
   1040 { "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
   1041 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
   1042 }
   1043 };
   1044 
   1045 /*
   1046  * start tags that imply the end of current element
   1047  */
   1048 static const char * const htmlStartClose[] = {
   1049 "form",		"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
   1050 		"dl", "ul", "ol", "menu", "dir", "address", "pre",
   1051 		"listing", "xmp", "head", NULL,
   1052 "head",		"p", NULL,
   1053 "title",	"p", NULL,
   1054 "body",		"head", "style", "link", "title", "p", NULL,
   1055 "frameset",	"head", "style", "link", "title", "p", NULL,
   1056 "li",		"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
   1057 		"pre", "listing", "xmp", "head", "li", NULL,
   1058 "hr",		"p", "head", NULL,
   1059 "h1",		"p", "head", NULL,
   1060 "h2",		"p", "head", NULL,
   1061 "h3",		"p", "head", NULL,
   1062 "h4",		"p", "head", NULL,
   1063 "h5",		"p", "head", NULL,
   1064 "h6",		"p", "head", NULL,
   1065 "dir",		"p", "head", NULL,
   1066 "address",	"p", "head", "ul", NULL,
   1067 "pre",		"p", "head", "ul", NULL,
   1068 "listing",	"p", "head", NULL,
   1069 "xmp",		"p", "head", NULL,
   1070 "blockquote",	"p", "head", NULL,
   1071 "dl",		"p", "dt", "menu", "dir", "address", "pre", "listing",
   1072 		"xmp", "head", NULL,
   1073 "dt",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
   1074                 "head", "dd", NULL,
   1075 "dd",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
   1076                 "head", "dt", NULL,
   1077 "ul",		"p", "head", "ol", "menu", "dir", "address", "pre",
   1078 		"listing", "xmp", NULL,
   1079 "ol",		"p", "head", "ul", NULL,
   1080 "menu",		"p", "head", "ul", NULL,
   1081 "p",		"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
   1082 "div",		"p", "head", NULL,
   1083 "noscript",	"p", "head", NULL,
   1084 "center",	"font", "b", "i", "p", "head", NULL,
   1085 "a",		"a", NULL,
   1086 "caption",	"p", NULL,
   1087 "colgroup",	"caption", "colgroup", "col", "p", NULL,
   1088 "col",		"caption", "col", "p", NULL,
   1089 "table",	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
   1090 		"listing", "xmp", "a", NULL,
   1091 "th",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
   1092 "td",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
   1093 "tr",		"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
   1094 "thead",	"caption", "col", "colgroup", NULL,
   1095 "tfoot",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
   1096 		"tbody", "p", NULL,
   1097 "tbody",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
   1098 		"tfoot", "tbody", "p", NULL,
   1099 "optgroup",	"option", NULL,
   1100 "option",	"option", NULL,
   1101 "fieldset",	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
   1102 		"pre", "listing", "xmp", "a", NULL,
   1103 NULL
   1104 };
   1105 
   1106 /*
   1107  * The list of HTML elements which are supposed not to have
   1108  * CDATA content and where a p element will be implied
   1109  *
   1110  * TODO: extend that list by reading the HTML SGML DTD on
   1111  *       implied paragraph
   1112  */
   1113 static const char *const htmlNoContentElements[] = {
   1114     "html",
   1115     "head",
   1116     NULL
   1117 };
   1118 
   1119 /*
   1120  * The list of HTML attributes which are of content %Script;
   1121  * NOTE: when adding ones, check htmlIsScriptAttribute() since
   1122  *       it assumes the name starts with 'on'
   1123  */
   1124 static const char *const htmlScriptAttributes[] = {
   1125     "onclick",
   1126     "ondblclick",
   1127     "onmousedown",
   1128     "onmouseup",
   1129     "onmouseover",
   1130     "onmousemove",
   1131     "onmouseout",
   1132     "onkeypress",
   1133     "onkeydown",
   1134     "onkeyup",
   1135     "onload",
   1136     "onunload",
   1137     "onfocus",
   1138     "onblur",
   1139     "onsubmit",
   1140     "onrest",
   1141     "onchange",
   1142     "onselect"
   1143 };
   1144 
   1145 /*
   1146  * This table is used by the htmlparser to know what to do with
   1147  * broken html pages. By assigning different priorities to different
   1148  * elements the parser can decide how to handle extra endtags.
   1149  * Endtags are only allowed to close elements with lower or equal
   1150  * priority.
   1151  */
   1152 
   1153 typedef struct {
   1154     const char *name;
   1155     int priority;
   1156 } elementPriority;
   1157 
   1158 static const elementPriority htmlEndPriority[] = {
   1159     {"div",   150},
   1160     {"td",    160},
   1161     {"th",    160},
   1162     {"tr",    170},
   1163     {"thead", 180},
   1164     {"tbody", 180},
   1165     {"tfoot", 180},
   1166     {"table", 190},
   1167     {"head",  200},
   1168     {"body",  200},
   1169     {"html",  220},
   1170     {NULL,    100} /* Default priority */
   1171 };
   1172 
   1173 static const char** htmlStartCloseIndex[100];
   1174 static int htmlStartCloseIndexinitialized = 0;
   1175 
   1176 /************************************************************************
   1177  *									*
   1178  *	functions to handle HTML specific data			*
   1179  *									*
   1180  ************************************************************************/
   1181 
   1182 /**
   1183  * htmlInitAutoClose:
   1184  *
   1185  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
   1186  * This is not reentrant. Call xmlInitParser() once before processing in
   1187  * case of use in multithreaded programs.
   1188  */
   1189 void
   1190 htmlInitAutoClose(void) {
   1191     int indx, i = 0;
   1192 
   1193     if (htmlStartCloseIndexinitialized) return;
   1194 
   1195     for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
   1196     indx = 0;
   1197     while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
   1198         htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
   1199 	while (htmlStartClose[i] != NULL) i++;
   1200 	i++;
   1201     }
   1202     htmlStartCloseIndexinitialized = 1;
   1203 }
   1204 
   1205 /**
   1206  * htmlTagLookup:
   1207  * @tag:  The tag name in lowercase
   1208  *
   1209  * Lookup the HTML tag in the ElementTable
   1210  *
   1211  * Returns the related htmlElemDescPtr or NULL if not found.
   1212  */
   1213 const htmlElemDesc *
   1214 htmlTagLookup(const xmlChar *tag) {
   1215     unsigned int i;
   1216 
   1217     for (i = 0; i < (sizeof(html40ElementTable) /
   1218                      sizeof(html40ElementTable[0]));i++) {
   1219         if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
   1220 	    return((htmlElemDescPtr) &html40ElementTable[i]);
   1221     }
   1222     return(NULL);
   1223 }
   1224 
   1225 /**
   1226  * htmlGetEndPriority:
   1227  * @name: The name of the element to look up the priority for.
   1228  *
   1229  * Return value: The "endtag" priority.
   1230  **/
   1231 static int
   1232 htmlGetEndPriority (const xmlChar *name) {
   1233     int i = 0;
   1234 
   1235     while ((htmlEndPriority[i].name != NULL) &&
   1236 	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
   1237 	i++;
   1238 
   1239     return(htmlEndPriority[i].priority);
   1240 }
   1241 
   1242 
   1243 /**
   1244  * htmlCheckAutoClose:
   1245  * @newtag:  The new tag name
   1246  * @oldtag:  The old tag name
   1247  *
   1248  * Checks whether the new tag is one of the registered valid tags for
   1249  * closing old.
   1250  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
   1251  *
   1252  * Returns 0 if no, 1 if yes.
   1253  */
   1254 static int
   1255 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
   1256 {
   1257     int i, indx;
   1258     const char **closed = NULL;
   1259 
   1260     if (htmlStartCloseIndexinitialized == 0)
   1261         htmlInitAutoClose();
   1262 
   1263     /* inefficient, but not a big deal */
   1264     for (indx = 0; indx < 100; indx++) {
   1265         closed = htmlStartCloseIndex[indx];
   1266         if (closed == NULL)
   1267             return (0);
   1268         if (xmlStrEqual(BAD_CAST * closed, newtag))
   1269             break;
   1270     }
   1271 
   1272     i = closed - htmlStartClose;
   1273     i++;
   1274     while (htmlStartClose[i] != NULL) {
   1275         if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
   1276             return (1);
   1277         }
   1278         i++;
   1279     }
   1280     return (0);
   1281 }
   1282 
   1283 /**
   1284  * htmlAutoCloseOnClose:
   1285  * @ctxt:  an HTML parser context
   1286  * @newtag:  The new tag name
   1287  * @force:  force the tag closure
   1288  *
   1289  * The HTML DTD allows an ending tag to implicitly close other tags.
   1290  */
   1291 static void
   1292 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
   1293 {
   1294     const htmlElemDesc *info;
   1295     int i, priority;
   1296 
   1297     priority = htmlGetEndPriority(newtag);
   1298 
   1299     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
   1300 
   1301         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
   1302             break;
   1303         /*
   1304          * A missplaced endtag can only close elements with lower
   1305          * or equal priority, so if we find an element with higher
   1306          * priority before we find an element with
   1307          * matching name, we just ignore this endtag
   1308          */
   1309         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
   1310             return;
   1311     }
   1312     if (i < 0)
   1313         return;
   1314 
   1315     while (!xmlStrEqual(newtag, ctxt->name)) {
   1316         info = htmlTagLookup(ctxt->name);
   1317         if ((info != NULL) && (info->endTag == 3)) {
   1318             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
   1319 	                 "Opening and ending tag mismatch: %s and %s\n",
   1320 			 newtag, ctxt->name);
   1321         }
   1322         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   1323             ctxt->sax->endElement(ctxt->userData, ctxt->name);
   1324 	htmlnamePop(ctxt);
   1325     }
   1326 }
   1327 
   1328 /**
   1329  * htmlAutoCloseOnEnd:
   1330  * @ctxt:  an HTML parser context
   1331  *
   1332  * Close all remaining tags at the end of the stream
   1333  */
   1334 static void
   1335 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
   1336 {
   1337     int i;
   1338 
   1339     if (ctxt->nameNr == 0)
   1340         return;
   1341     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
   1342         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   1343             ctxt->sax->endElement(ctxt->userData, ctxt->name);
   1344 	htmlnamePop(ctxt);
   1345     }
   1346 }
   1347 
   1348 /**
   1349  * htmlAutoClose:
   1350  * @ctxt:  an HTML parser context
   1351  * @newtag:  The new tag name or NULL
   1352  *
   1353  * The HTML DTD allows a tag to implicitly close other tags.
   1354  * The list is kept in htmlStartClose array. This function is
   1355  * called when a new tag has been detected and generates the
   1356  * appropriates closes if possible/needed.
   1357  * If newtag is NULL this mean we are at the end of the resource
   1358  * and we should check
   1359  */
   1360 static void
   1361 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
   1362 {
   1363     while ((newtag != NULL) && (ctxt->name != NULL) &&
   1364            (htmlCheckAutoClose(newtag, ctxt->name))) {
   1365         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   1366             ctxt->sax->endElement(ctxt->userData, ctxt->name);
   1367 	htmlnamePop(ctxt);
   1368     }
   1369     if (newtag == NULL) {
   1370         htmlAutoCloseOnEnd(ctxt);
   1371         return;
   1372     }
   1373     while ((newtag == NULL) && (ctxt->name != NULL) &&
   1374            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
   1375             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
   1376             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
   1377         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   1378             ctxt->sax->endElement(ctxt->userData, ctxt->name);
   1379 	htmlnamePop(ctxt);
   1380     }
   1381 }
   1382 
   1383 /**
   1384  * htmlAutoCloseTag:
   1385  * @doc:  the HTML document
   1386  * @name:  The tag name
   1387  * @elem:  the HTML element
   1388  *
   1389  * The HTML DTD allows a tag to implicitly close other tags.
   1390  * The list is kept in htmlStartClose array. This function checks
   1391  * if the element or one of it's children would autoclose the
   1392  * given tag.
   1393  *
   1394  * Returns 1 if autoclose, 0 otherwise
   1395  */
   1396 int
   1397 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
   1398     htmlNodePtr child;
   1399 
   1400     if (elem == NULL) return(1);
   1401     if (xmlStrEqual(name, elem->name)) return(0);
   1402     if (htmlCheckAutoClose(elem->name, name)) return(1);
   1403     child = elem->children;
   1404     while (child != NULL) {
   1405         if (htmlAutoCloseTag(doc, name, child)) return(1);
   1406 	child = child->next;
   1407     }
   1408     return(0);
   1409 }
   1410 
   1411 /**
   1412  * htmlIsAutoClosed:
   1413  * @doc:  the HTML document
   1414  * @elem:  the HTML element
   1415  *
   1416  * The HTML DTD allows a tag to implicitly close other tags.
   1417  * The list is kept in htmlStartClose array. This function checks
   1418  * if a tag is autoclosed by one of it's child
   1419  *
   1420  * Returns 1 if autoclosed, 0 otherwise
   1421  */
   1422 int
   1423 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
   1424     htmlNodePtr child;
   1425 
   1426     if (elem == NULL) return(1);
   1427     child = elem->children;
   1428     while (child != NULL) {
   1429 	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
   1430 	child = child->next;
   1431     }
   1432     return(0);
   1433 }
   1434 
   1435 /**
   1436  * htmlCheckImplied:
   1437  * @ctxt:  an HTML parser context
   1438  * @newtag:  The new tag name
   1439  *
   1440  * The HTML DTD allows a tag to exists only implicitly
   1441  * called when a new tag has been detected and generates the
   1442  * appropriates implicit tags if missing
   1443  */
   1444 static void
   1445 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
   1446     int i;
   1447 
   1448     if (ctxt->options & HTML_PARSE_NOIMPLIED)
   1449         return;
   1450     if (!htmlOmittedDefaultValue)
   1451 	return;
   1452     if (xmlStrEqual(newtag, BAD_CAST"html"))
   1453 	return;
   1454     if (ctxt->nameNr <= 0) {
   1455 	htmlnamePush(ctxt, BAD_CAST"html");
   1456 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1457 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
   1458     }
   1459     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
   1460         return;
   1461     if ((ctxt->nameNr <= 1) &&
   1462         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
   1463 	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
   1464 	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
   1465 	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
   1466 	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
   1467 	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
   1468         if (ctxt->html >= 3) {
   1469             /* we already saw or generated an <head> before */
   1470             return;
   1471         }
   1472         /*
   1473          * dropped OBJECT ... i you put it first BODY will be
   1474          * assumed !
   1475          */
   1476         htmlnamePush(ctxt, BAD_CAST"head");
   1477         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1478             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
   1479     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
   1480 	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
   1481 	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
   1482         if (ctxt->html >= 10) {
   1483             /* we already saw or generated a <body> before */
   1484             return;
   1485         }
   1486 	for (i = 0;i < ctxt->nameNr;i++) {
   1487 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
   1488 		return;
   1489 	    }
   1490 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
   1491 		return;
   1492 	    }
   1493 	}
   1494 
   1495 	htmlnamePush(ctxt, BAD_CAST"body");
   1496 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1497 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
   1498     }
   1499 }
   1500 
   1501 /**
   1502  * htmlCheckParagraph
   1503  * @ctxt:  an HTML parser context
   1504  *
   1505  * Check whether a p element need to be implied before inserting
   1506  * characters in the current element.
   1507  *
   1508  * Returns 1 if a paragraph has been inserted, 0 if not and -1
   1509  *         in case of error.
   1510  */
   1511 
   1512 static int
   1513 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
   1514     const xmlChar *tag;
   1515     int i;
   1516 
   1517     if (ctxt == NULL)
   1518 	return(-1);
   1519     tag = ctxt->name;
   1520     if (tag == NULL) {
   1521 	htmlAutoClose(ctxt, BAD_CAST"p");
   1522 	htmlCheckImplied(ctxt, BAD_CAST"p");
   1523 	htmlnamePush(ctxt, BAD_CAST"p");
   1524 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1525 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
   1526 	return(1);
   1527     }
   1528     if (!htmlOmittedDefaultValue)
   1529 	return(0);
   1530     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
   1531 	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
   1532 	    htmlAutoClose(ctxt, BAD_CAST"p");
   1533 	    htmlCheckImplied(ctxt, BAD_CAST"p");
   1534 	    htmlnamePush(ctxt, BAD_CAST"p");
   1535 	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1536 		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
   1537 	    return(1);
   1538 	}
   1539     }
   1540     return(0);
   1541 }
   1542 
   1543 /**
   1544  * htmlIsScriptAttribute:
   1545  * @name:  an attribute name
   1546  *
   1547  * Check if an attribute is of content type Script
   1548  *
   1549  * Returns 1 is the attribute is a script 0 otherwise
   1550  */
   1551 int
   1552 htmlIsScriptAttribute(const xmlChar *name) {
   1553     unsigned int i;
   1554 
   1555     if (name == NULL)
   1556       return(0);
   1557     /*
   1558      * all script attributes start with 'on'
   1559      */
   1560     if ((name[0] != 'o') || (name[1] != 'n'))
   1561       return(0);
   1562     for (i = 0;
   1563 	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
   1564 	 i++) {
   1565 	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
   1566 	    return(1);
   1567     }
   1568     return(0);
   1569 }
   1570 
   1571 /************************************************************************
   1572  *									*
   1573  *	The list of HTML predefined entities			*
   1574  *									*
   1575  ************************************************************************/
   1576 
   1577 
/*
 * Table of the predefined HTML entities. Each entry holds the numeric
 * value of the character, the entity name (without '&' and ';') and a
 * human-readable description.
 *
 * The entries are listed in ascending order of their numeric value.
 * htmlEntityValueLookup() stops scanning at the first entry whose value
 * is greater than the one searched for, so this ordering must be kept
 * when entries are added.
 */
static const htmlEntityDesc  html40EntitiesTable[] = {
/*
 * The 4 absolute ones, plus apostrophe.
 */
{ 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
{ 38,	"amp",	"ampersand, U+0026 ISOnum" },
{ 39,	"apos",	"single quote" },
{ 60,	"lt",	"less-than sign, U+003C ISOnum" },
{ 62,	"gt",	"greater-than sign, U+003E ISOnum" },

/*
 * A bunch still in the 128-255 range.
 * Replacing them really depends on the charset used.
 */
{ 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162,	"cent",	"cent sign, U+00A2 ISOnum" },
{ 163,	"pound","pound sign, U+00A3 ISOnum" },
{ 164,	"curren","currency sign, U+00A4 ISOnum" },
{ 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
{ 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167,	"sect",	"section sign, U+00A7 ISOnum" },
{ 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
{ 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
{ 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172,	"not",	"not sign, U+00AC ISOnum" },
{ 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176,	"deg",	"degree sign, U+00B0 ISOnum" },
{ 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181,	"micro","micro sign, U+00B5 ISOnum" },
{ 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
{ 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
{ 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215,	"times","multiplication sign, U+00D7 ISOnum" },
{ 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
{ 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247,	"divide","division sign, U+00F7 ISOnum" },
{ 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entity references.
 */
{ 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732,	"tilde","small tilde, U+02DC ISOdia" },

{ 913,	"Alpha","greek capital letter alpha, U+0391" },
{ 914,	"Beta",	"greek capital letter beta, U+0392" },
{ 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917,	"Epsilon","greek capital letter epsilon, U+0395" },
{ 918,	"Zeta",	"greek capital letter zeta, U+0396" },
{ 919,	"Eta",	"greek capital letter eta, U+0397" },
{ 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921,	"Iota",	"greek capital letter iota, U+0399" },
{ 922,	"Kappa","greek capital letter kappa, U+039A" },
{ 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924,	"Mu",	"greek capital letter mu, U+039C" },
{ 925,	"Nu",	"greek capital letter nu, U+039D" },
{ 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
{ 927,	"Omicron","greek capital letter omicron, U+039F" },
{ 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
{ 929,	"Rho",	"greek capital letter rho, U+03A1" },
{ 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932,	"Tau",	"greek capital letter tau, U+03A4" },
{ 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
{ 935,	"Chi",	"greek capital letter chi, U+03A7" },
{ 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
{ 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },

{ 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
{ 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
{ 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
{ 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
{ 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
{ 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
{ 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
{ 959,	"omicron","greek small letter omicron, U+03BF NEW" },
{ 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
{ 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
{ 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
{ 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
{ 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
{ 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
{ 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },

{ 8194,	"ensp",	"en space, U+2002 ISOpub" },
{ 8195,	"emsp",	"em space, U+2003 ISOpub" },
{ 8201,	"thinsp","thin space, U+2009 ISOpub" },
{ 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
{ 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
{ 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
{ 8211,	"ndash","en dash, U+2013 ISOpub" },
{ 8212,	"mdash","em dash, U+2014 ISOpub" },
{ 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224,	"dagger","dagger, U+2020 ISOpub" },
{ 8225,	"Dagger","double dagger, U+2021 ISOpub" },

{ 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
{ 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240,	"permil","per mille sign, U+2030 ISOtech" },

{ 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254,	"oline","overline = spacing overscore, U+203E NEW" },
{ 8260,	"frasl","fraction slash, U+2044 NEW" },

{ 8364,	"euro",	"euro sign, U+20AC NEW" },

{ 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482,	"trade","trade mark sign, U+2122 ISOnum" },
{ 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
{ 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
{ 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
{ 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
{ 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
{ 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
{ 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
{ 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
{ 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
{ 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },

{ 8704,	"forall","for all, U+2200 ISOtech" },
{ 8706,	"part",	"partial differential, U+2202 ISOtech" },
{ 8707,	"exist","there exists, U+2203 ISOtech" },
{ 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712,	"isin",	"element of, U+2208 ISOtech" },
{ 8713,	"notin","not an element of, U+2209 ISOtech" },
{ 8715,	"ni",	"contains as member, U+220B ISOtech" },
{ 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
{ 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
{ 8722,	"minus","minus sign, U+2212 ISOtech" },
{ 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
{ 8730,	"radic","square root = radical sign, U+221A ISOtech" },
{ 8733,	"prop",	"proportional to, U+221D ISOtech" },
{ 8734,	"infin","infinity, U+221E ISOtech" },
{ 8736,	"ang",	"angle, U+2220 ISOamso" },
{ 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
{ 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
{ 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
{ 8746,	"cup",	"union = cup, U+222A ISOtech" },
{ 8747,	"int",	"integral, U+222B ISOtech" },
{ 8756,	"there4","therefore, U+2234 ISOtech" },
{ 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
{ 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800,	"ne",	"not equal to, U+2260 ISOtech" },
{ 8801,	"equiv","identical to, U+2261 ISOtech" },
{ 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
{ 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
{ 8834,	"sub",	"subset of, U+2282 ISOtech" },
{ 8835,	"sup",	"superset of, U+2283 ISOtech" },
{ 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
{ 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
{ 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
{ 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
{ 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
{ 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971,	"rfloor","right floor, U+230B ISOamsc" },
{ 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674,	"loz",	"lozenge, U+25CA ISOpub" },

{ 9824,	"spades","black spade suit, U+2660 ISOpub" },
{ 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830,	"diams","black diamond suit, U+2666 ISOpub" },

};
   1860 
   1861 /************************************************************************
   1862  *									*
   1863  *		Commodity functions to handle entities			*
   1864  *									*
   1865  ************************************************************************/
   1866 
   1867 /*
   1868  * Macro used to grow the current buffer.
   1869  */
   1870 #define growBuffer(buffer) {						\
   1871     xmlChar *tmp;							\
   1872     buffer##_size *= 2;							\
   1873     tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
   1874     if (tmp == NULL) {						\
   1875 	htmlErrMemory(ctxt, "growing buffer\n");			\
   1876 	xmlFree(buffer);						\
   1877 	return(NULL);							\
   1878     }									\
   1879     buffer = tmp;							\
   1880 }
   1881 
   1882 /**
   1883  * htmlEntityLookup:
   1884  * @name: the entity name
   1885  *
   1886  * Lookup the given entity in EntitiesTable
   1887  *
   1888  * TODO: the linear scan is really ugly, an hash table is really needed.
   1889  *
   1890  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
   1891  */
   1892 const htmlEntityDesc *
   1893 htmlEntityLookup(const xmlChar *name) {
   1894     unsigned int i;
   1895 
   1896     for (i = 0;i < (sizeof(html40EntitiesTable)/
   1897                     sizeof(html40EntitiesTable[0]));i++) {
   1898         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
   1899             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
   1900 	}
   1901     }
   1902     return(NULL);
   1903 }
   1904 
   1905 /**
   1906  * htmlEntityValueLookup:
   1907  * @value: the entity's unicode value
   1908  *
   1909  * Lookup the given entity in EntitiesTable
   1910  *
   1911  * TODO: the linear scan is really ugly, an hash table is really needed.
   1912  *
   1913  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
   1914  */
   1915 const htmlEntityDesc *
   1916 htmlEntityValueLookup(unsigned int value) {
   1917     unsigned int i;
   1918 
   1919     for (i = 0;i < (sizeof(html40EntitiesTable)/
   1920                     sizeof(html40EntitiesTable[0]));i++) {
   1921         if (html40EntitiesTable[i].value >= value) {
   1922 	    if (html40EntitiesTable[i].value > value)
   1923 		break;
   1924             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
   1925 	}
   1926     }
   1927     return(NULL);
   1928 }
   1929 
   1930 /**
   1931  * UTF8ToHtml:
   1932  * @out:  a pointer to an array of bytes to store the result
   1933  * @outlen:  the length of @out
   1934  * @in:  a pointer to an array of UTF-8 chars
   1935  * @inlen:  the length of @in
   1936  *
   1937  * Take a block of UTF-8 chars in and try to convert it to an ASCII
   1938  * plus HTML entities block of chars out.
   1939  *
   1940  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
   1941  * The value of @inlen after return is the number of octets consumed
   1942  *     as the return value is positive, else unpredictable.
   1943  * The value of @outlen after return is the number of octets consumed.
   1944  */
   1945 int
   1946 UTF8ToHtml(unsigned char* out, int *outlen,
   1947               const unsigned char* in, int *inlen) {
   1948     const unsigned char* processed = in;
   1949     const unsigned char* outend;
   1950     const unsigned char* outstart = out;
   1951     const unsigned char* instart = in;
   1952     const unsigned char* inend;
   1953     unsigned int c, d;
   1954     int trailing;
   1955 
   1956     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
   1957     if (in == NULL) {
   1958         /*
   1959 	 * initialization nothing to do
   1960 	 */
   1961 	*outlen = 0;
   1962 	*inlen = 0;
   1963 	return(0);
   1964     }
   1965     inend = in + (*inlen);
   1966     outend = out + (*outlen);
   1967     while (in < inend) {
   1968 	d = *in++;
   1969 	if      (d < 0x80)  { c= d; trailing= 0; }
   1970 	else if (d < 0xC0) {
   1971 	    /* trailing byte in leading position */
   1972 	    *outlen = out - outstart;
   1973 	    *inlen = processed - instart;
   1974 	    return(-2);
   1975         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
   1976         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
   1977         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
   1978 	else {
   1979 	    /* no chance for this in Ascii */
   1980 	    *outlen = out - outstart;
   1981 	    *inlen = processed - instart;
   1982 	    return(-2);
   1983 	}
   1984 
   1985 	if (inend - in < trailing) {
   1986 	    break;
   1987 	}
   1988 
   1989 	for ( ; trailing; trailing--) {
   1990 	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
   1991 		break;
   1992 	    c <<= 6;
   1993 	    c |= d & 0x3F;
   1994 	}
   1995 
   1996 	/* assertion: c is a single UTF-4 value */
   1997 	if (c < 0x80) {
   1998 	    if (out + 1 >= outend)
   1999 		break;
   2000 	    *out++ = c;
   2001 	} else {
   2002 	    int len;
   2003 	    const htmlEntityDesc * ent;
   2004 	    const char *cp;
   2005 	    char nbuf[16];
   2006 
   2007 	    /*
   2008 	     * Try to lookup a predefined HTML entity for it
   2009 	     */
   2010 
   2011 	    ent = htmlEntityValueLookup(c);
   2012 	    if (ent == NULL) {
   2013 	      snprintf(nbuf, sizeof(nbuf), "#%u", c);
   2014 	      cp = nbuf;
   2015 	    }
   2016 	    else
   2017 	      cp = ent->name;
   2018 	    len = strlen(cp);
   2019 	    if (out + 2 + len >= outend)
   2020 		break;
   2021 	    *out++ = '&';
   2022 	    memcpy(out, cp, len);
   2023 	    out += len;
   2024 	    *out++ = ';';
   2025 	}
   2026 	processed = in;
   2027     }
   2028     *outlen = out - outstart;
   2029     *inlen = processed - instart;
   2030     return(0);
   2031 }
   2032 
   2033 /**
   2034  * htmlEncodeEntities:
   2035  * @out:  a pointer to an array of bytes to store the result
   2036  * @outlen:  the length of @out
   2037  * @in:  a pointer to an array of UTF-8 chars
   2038  * @inlen:  the length of @in
   2039  * @quoteChar: the quote character to escape (' or ") or zero.
   2040  *
   2041  * Take a block of UTF-8 chars in and try to convert it to an ASCII
   2042  * plus HTML entities block of chars out.
   2043  *
   2044  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
   2045  * The value of @inlen after return is the number of octets consumed
   2046  *     as the return value is positive, else unpredictable.
   2047  * The value of @outlen after return is the number of octets consumed.
   2048  */
   2049 int
   2050 htmlEncodeEntities(unsigned char* out, int *outlen,
   2051 		   const unsigned char* in, int *inlen, int quoteChar) {
   2052     const unsigned char* processed = in;
   2053     const unsigned char* outend;
   2054     const unsigned char* outstart = out;
   2055     const unsigned char* instart = in;
   2056     const unsigned char* inend;
   2057     unsigned int c, d;
   2058     int trailing;
   2059 
   2060     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
   2061         return(-1);
   2062     outend = out + (*outlen);
   2063     inend = in + (*inlen);
   2064     while (in < inend) {
   2065 	d = *in++;
   2066 	if      (d < 0x80)  { c= d; trailing= 0; }
   2067 	else if (d < 0xC0) {
   2068 	    /* trailing byte in leading position */
   2069 	    *outlen = out - outstart;
   2070 	    *inlen = processed - instart;
   2071 	    return(-2);
   2072         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
   2073         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
   2074         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
   2075 	else {
   2076 	    /* no chance for this in Ascii */
   2077 	    *outlen = out - outstart;
   2078 	    *inlen = processed - instart;
   2079 	    return(-2);
   2080 	}
   2081 
   2082 	if (inend - in < trailing)
   2083 	    break;
   2084 
   2085 	while (trailing--) {
   2086 	    if (((d= *in++) & 0xC0) != 0x80) {
   2087 		*outlen = out - outstart;
   2088 		*inlen = processed - instart;
   2089 		return(-2);
   2090 	    }
   2091 	    c <<= 6;
   2092 	    c |= d & 0x3F;
   2093 	}
   2094 
   2095 	/* assertion: c is a single UTF-4 value */
   2096 	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
   2097 	    (c != '&') && (c != '<') && (c != '>')) {
   2098 	    if (out >= outend)
   2099 		break;
   2100 	    *out++ = c;
   2101 	} else {
   2102 	    const htmlEntityDesc * ent;
   2103 	    const char *cp;
   2104 	    char nbuf[16];
   2105 	    int len;
   2106 
   2107 	    /*
   2108 	     * Try to lookup a predefined HTML entity for it
   2109 	     */
   2110 	    ent = htmlEntityValueLookup(c);
   2111 	    if (ent == NULL) {
   2112 		snprintf(nbuf, sizeof(nbuf), "#%u", c);
   2113 		cp = nbuf;
   2114 	    }
   2115 	    else
   2116 		cp = ent->name;
   2117 	    len = strlen(cp);
   2118 	    if (out + 2 + len > outend)
   2119 		break;
   2120 	    *out++ = '&';
   2121 	    memcpy(out, cp, len);
   2122 	    out += len;
   2123 	    *out++ = ';';
   2124 	}
   2125 	processed = in;
   2126     }
   2127     *outlen = out - outstart;
   2128     *inlen = processed - instart;
   2129     return(0);
   2130 }
   2131 
   2132 /************************************************************************
   2133  *									*
   2134  *		Commodity functions to handle streams			*
   2135  *									*
   2136  ************************************************************************/
   2137 
   2138 /**
   2139  * htmlNewInputStream:
   2140  * @ctxt:  an HTML parser context
   2141  *
   2142  * Create a new input stream structure
   2143  * Returns the new input stream or NULL
   2144  */
   2145 static htmlParserInputPtr
   2146 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
   2147     htmlParserInputPtr input;
   2148 
   2149     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
   2150     if (input == NULL) {
   2151         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
   2152 	return(NULL);
   2153     }
   2154     memset(input, 0, sizeof(htmlParserInput));
   2155     input->filename = NULL;
   2156     input->directory = NULL;
   2157     input->base = NULL;
   2158     input->cur = NULL;
   2159     input->buf = NULL;
   2160     input->line = 1;
   2161     input->col = 1;
   2162     input->buf = NULL;
   2163     input->free = NULL;
   2164     input->version = NULL;
   2165     input->consumed = 0;
   2166     input->length = 0;
   2167     return(input);
   2168 }
   2169 
   2170 
   2171 /************************************************************************
   2172  *									*
   2173  *		Commodity functions, cleanup needed ?			*
   2174  *									*
   2175  ************************************************************************/
   2176 /*
   2177  * all tags allowing pc data from the html 4.01 loose dtd
   2178  * NOTE: it might be more apropriate to integrate this information
   2179  * into the html40ElementTable array but I don't want to risk any
   2180  * binary incomptibility
   2181  */
   2182 static const char *allowPCData[] = {
   2183     "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
   2184     "blockquote", "body", "button", "caption", "center", "cite", "code",
   2185     "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
   2186     "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
   2187     "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
   2188     "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
   2189 };
   2190 
   2191 /**
   2192  * areBlanks:
   2193  * @ctxt:  an HTML parser context
   2194  * @str:  a xmlChar *
   2195  * @len:  the size of @str
   2196  *
   2197  * Is this a sequence of blank chars that one can ignore ?
   2198  *
   2199  * Returns 1 if ignorable 0 otherwise.
   2200  */
   2201 
   2202 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
   2203     unsigned int i;
   2204     int j;
   2205     xmlNodePtr lastChild;
   2206     xmlDtdPtr dtd;
   2207 
   2208     for (j = 0;j < len;j++)
   2209         if (!(IS_BLANK_CH(str[j]))) return(0);
   2210 
   2211     if (CUR == 0) return(1);
   2212     if (CUR != '<') return(0);
   2213     if (ctxt->name == NULL)
   2214 	return(1);
   2215     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
   2216 	return(1);
   2217     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
   2218 	return(1);
   2219 
   2220     /* Only strip CDATA children of the body tag for strict HTML DTDs */
   2221     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
   2222         dtd = xmlGetIntSubset(ctxt->myDoc);
   2223         if (dtd != NULL && dtd->ExternalID != NULL) {
   2224             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
   2225                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
   2226                 return(1);
   2227         }
   2228     }
   2229 
   2230     if (ctxt->node == NULL) return(0);
   2231     lastChild = xmlGetLastChild(ctxt->node);
   2232     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
   2233 	lastChild = lastChild->prev;
   2234     if (lastChild == NULL) {
   2235         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
   2236             (ctxt->node->content != NULL)) return(0);
   2237 	/* keep ws in constructs like ...<b> </b>...
   2238 	   for all tags "b" allowing PCDATA */
   2239 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
   2240 	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
   2241 		return(0);
   2242 	    }
   2243 	}
   2244     } else if (xmlNodeIsText(lastChild)) {
   2245         return(0);
   2246     } else {
   2247 	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
   2248 	   for all tags "p" allowing PCDATA */
   2249 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
   2250 	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
   2251 		return(0);
   2252 	    }
   2253 	}
   2254     }
   2255     return(1);
   2256 }
   2257 
   2258 /**
   2259  * htmlNewDocNoDtD:
   2260  * @URI:  URI for the dtd, or NULL
   2261  * @ExternalID:  the external ID of the DTD, or NULL
   2262  *
   2263  * Creates a new HTML document without a DTD node if @URI and @ExternalID
   2264  * are NULL
   2265  *
   2266  * Returns a new document, do not initialize the DTD if not provided
   2267  */
   2268 htmlDocPtr
   2269 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
   2270     xmlDocPtr cur;
   2271 
   2272     /*
   2273      * Allocate a new document and fill the fields.
   2274      */
   2275     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
   2276     if (cur == NULL) {
   2277 	htmlErrMemory(NULL, "HTML document creation failed\n");
   2278 	return(NULL);
   2279     }
   2280     memset(cur, 0, sizeof(xmlDoc));
   2281 
   2282     cur->type = XML_HTML_DOCUMENT_NODE;
   2283     cur->version = NULL;
   2284     cur->intSubset = NULL;
   2285     cur->doc = cur;
   2286     cur->name = NULL;
   2287     cur->children = NULL;
   2288     cur->extSubset = NULL;
   2289     cur->oldNs = NULL;
   2290     cur->encoding = NULL;
   2291     cur->standalone = 1;
   2292     cur->compression = 0;
   2293     cur->ids = NULL;
   2294     cur->refs = NULL;
   2295     cur->_private = NULL;
   2296     cur->charset = XML_CHAR_ENCODING_UTF8;
   2297     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
   2298     if ((ExternalID != NULL) ||
   2299 	(URI != NULL))
   2300 	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
   2301     return(cur);
   2302 }
   2303 
   2304 /**
   2305  * htmlNewDoc:
   2306  * @URI:  URI for the dtd, or NULL
   2307  * @ExternalID:  the external ID of the DTD, or NULL
   2308  *
   2309  * Creates a new HTML document
   2310  *
   2311  * Returns a new document
   2312  */
   2313 htmlDocPtr
   2314 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
   2315     if ((URI == NULL) && (ExternalID == NULL))
   2316 	return(htmlNewDocNoDtD(
   2317 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
   2318 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
   2319 
   2320     return(htmlNewDocNoDtD(URI, ExternalID));
   2321 }
   2322 
   2323 
   2324 /************************************************************************
   2325  *									*
   2326  *			The parser itself				*
   2327  *	Relates to http://www.w3.org/TR/html40				*
   2328  *									*
   2329  ************************************************************************/
   2330 
   2331 /************************************************************************
   2332  *									*
   2333  *			The parser itself				*
   2334  *									*
   2335  ************************************************************************/
   2336 
   2337 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
   2338 
   2339 /**
   2340  * htmlParseHTMLName:
   2341  * @ctxt:  an HTML parser context
   2342  *
   2343  * parse an HTML tag or attribute name, note that we convert it to lowercase
   2344  * since HTML names are not case-sensitive.
   2345  *
   2346  * Returns the Tag Name parsed or NULL
   2347  */
   2348 
   2349 static const xmlChar *
   2350 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
   2351     int i = 0;
   2352     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
   2353 
   2354     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
   2355         (CUR != ':') && (CUR != '.')) return(NULL);
   2356 
   2357     while ((i < HTML_PARSER_BUFFER_SIZE) &&
   2358            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
   2359 	   (CUR == ':') || (CUR == '-') || (CUR == '_') ||
   2360            (CUR == '.'))) {
   2361 	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
   2362         else loc[i] = CUR;
   2363 	i++;
   2364 
   2365 	NEXT;
   2366     }
   2367 
   2368     return(xmlDictLookup(ctxt->dict, loc, i));
   2369 }
   2370 
   2371 
   2372 /**
   2373  * htmlParseHTMLName_nonInvasive:
   2374  * @ctxt:  an HTML parser context
   2375  *
   2376  * parse an HTML tag or attribute name, note that we convert it to lowercase
   2377  * since HTML names are not case-sensitive, this doesn't consume the data
   2378  * from the stream, it's a look-ahead
   2379  *
   2380  * Returns the Tag Name parsed or NULL
   2381  */
   2382 
   2383 static const xmlChar *
   2384 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
   2385     int i = 0;
   2386     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
   2387 
   2388     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
   2389         (NXT(1) != ':')) return(NULL);
   2390 
   2391     while ((i < HTML_PARSER_BUFFER_SIZE) &&
   2392            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
   2393 	   (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
   2394 	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
   2395         else loc[i] = NXT(1+i);
   2396 	i++;
   2397     }
   2398 
   2399     return(xmlDictLookup(ctxt->dict, loc, i));
   2400 }
   2401 
   2402 
   2403 /**
   2404  * htmlParseName:
   2405  * @ctxt:  an HTML parser context
   2406  *
   2407  * parse an HTML name, this routine is case sensitive.
   2408  *
   2409  * Returns the Name parsed or NULL
   2410  */
   2411 
   2412 static const xmlChar *
   2413 htmlParseName(htmlParserCtxtPtr ctxt) {
   2414     const xmlChar *in;
   2415     const xmlChar *ret;
   2416     int count = 0;
   2417 
   2418     GROW;
   2419 
   2420     /*
   2421      * Accelerator for simple ASCII names
   2422      */
   2423     in = ctxt->input->cur;
   2424     if (((*in >= 0x61) && (*in <= 0x7A)) ||
   2425 	((*in >= 0x41) && (*in <= 0x5A)) ||
   2426 	(*in == '_') || (*in == ':')) {
   2427 	in++;
   2428 	while (((*in >= 0x61) && (*in <= 0x7A)) ||
   2429 	       ((*in >= 0x41) && (*in <= 0x5A)) ||
   2430 	       ((*in >= 0x30) && (*in <= 0x39)) ||
   2431 	       (*in == '_') || (*in == '-') ||
   2432 	       (*in == ':') || (*in == '.'))
   2433 	    in++;
   2434 	if ((*in > 0) && (*in < 0x80)) {
   2435 	    count = in - ctxt->input->cur;
   2436 	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
   2437 	    ctxt->input->cur = in;
   2438 	    ctxt->nbChars += count;
   2439 	    ctxt->input->col += count;
   2440 	    return(ret);
   2441 	}
   2442     }
   2443     return(htmlParseNameComplex(ctxt));
   2444 }
   2445 
   2446 static const xmlChar *
   2447 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
   2448     int len = 0, l;
   2449     int c;
   2450     int count = 0;
   2451 
   2452     /*
   2453      * Handler for more complex cases
   2454      */
   2455     GROW;
   2456     c = CUR_CHAR(l);
   2457     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
   2458 	(!IS_LETTER(c) && (c != '_') &&
   2459          (c != ':'))) {
   2460 	return(NULL);
   2461     }
   2462 
   2463     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
   2464 	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
   2465             (c == '.') || (c == '-') ||
   2466 	    (c == '_') || (c == ':') ||
   2467 	    (IS_COMBINING(c)) ||
   2468 	    (IS_EXTENDER(c)))) {
   2469 	if (count++ > 100) {
   2470 	    count = 0;
   2471 	    GROW;
   2472 	}
   2473 	len += l;
   2474 	NEXTL(l);
   2475 	c = CUR_CHAR(l);
   2476     }
   2477     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
   2478 }
   2479 
   2480 
   2481 /**
   2482  * htmlParseHTMLAttribute:
   2483  * @ctxt:  an HTML parser context
   2484  * @stop:  a char stop value
   2485  *
   2486  * parse an HTML attribute value till the stop (quote), if
   2487  * stop is 0 then it stops at the first space
   2488  *
   2489  * Returns the attribute parsed or NULL
   2490  */
   2491 
   2492 static xmlChar *
   2493 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
   2494     xmlChar *buffer = NULL;
   2495     int buffer_size = 0;
   2496     xmlChar *out = NULL;
   2497     const xmlChar *name = NULL;
   2498     const xmlChar *cur = NULL;
   2499     const htmlEntityDesc * ent;
   2500 
   2501     /*
   2502      * allocate a translation buffer.
   2503      */
   2504     buffer_size = HTML_PARSER_BUFFER_SIZE;
   2505     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
   2506     if (buffer == NULL) {
   2507 	htmlErrMemory(ctxt, "buffer allocation failed\n");
   2508 	return(NULL);
   2509     }
   2510     out = buffer;
   2511 
   2512     /*
   2513      * Ok loop until we reach one of the ending chars
   2514      */
   2515     while ((CUR != 0) && (CUR != stop)) {
   2516 	if ((stop == 0) && (CUR == '>')) break;
   2517 	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
   2518         if (CUR == '&') {
   2519 	    if (NXT(1) == '#') {
   2520 		unsigned int c;
   2521 		int bits;
   2522 
   2523 		c = htmlParseCharRef(ctxt);
   2524 		if      (c <    0x80)
   2525 		        { *out++  = c;                bits= -6; }
   2526 		else if (c <   0x800)
   2527 		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   2528 		else if (c < 0x10000)
   2529 		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   2530 		else
   2531 		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   2532 
   2533 		for ( ; bits >= 0; bits-= 6) {
   2534 		    *out++  = ((c >> bits) & 0x3F) | 0x80;
   2535 		}
   2536 
   2537 		if (out - buffer > buffer_size - 100) {
   2538 			int indx = out - buffer;
   2539 
   2540 			growBuffer(buffer);
   2541 			out = &buffer[indx];
   2542 		}
   2543 	    } else {
   2544 		ent = htmlParseEntityRef(ctxt, &name);
   2545 		if (name == NULL) {
   2546 		    *out++ = '&';
   2547 		    if (out - buffer > buffer_size - 100) {
   2548 			int indx = out - buffer;
   2549 
   2550 			growBuffer(buffer);
   2551 			out = &buffer[indx];
   2552 		    }
   2553 		} else if (ent == NULL) {
   2554 		    *out++ = '&';
   2555 		    cur = name;
   2556 		    while (*cur != 0) {
   2557 			if (out - buffer > buffer_size - 100) {
   2558 			    int indx = out - buffer;
   2559 
   2560 			    growBuffer(buffer);
   2561 			    out = &buffer[indx];
   2562 			}
   2563 			*out++ = *cur++;
   2564 		    }
   2565 		} else {
   2566 		    unsigned int c;
   2567 		    int bits;
   2568 
   2569 		    if (out - buffer > buffer_size - 100) {
   2570 			int indx = out - buffer;
   2571 
   2572 			growBuffer(buffer);
   2573 			out = &buffer[indx];
   2574 		    }
   2575 		    c = ent->value;
   2576 		    if      (c <    0x80)
   2577 			{ *out++  = c;                bits= -6; }
   2578 		    else if (c <   0x800)
   2579 			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   2580 		    else if (c < 0x10000)
   2581 			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   2582 		    else
   2583 			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   2584 
   2585 		    for ( ; bits >= 0; bits-= 6) {
   2586 			*out++  = ((c >> bits) & 0x3F) | 0x80;
   2587 		    }
   2588 		}
   2589 	    }
   2590 	} else {
   2591 	    unsigned int c;
   2592 	    int bits, l;
   2593 
   2594 	    if (out - buffer > buffer_size - 100) {
   2595 		int indx = out - buffer;
   2596 
   2597 		growBuffer(buffer);
   2598 		out = &buffer[indx];
   2599 	    }
   2600 	    c = CUR_CHAR(l);
   2601 	    if      (c <    0x80)
   2602 		    { *out++  = c;                bits= -6; }
   2603 	    else if (c <   0x800)
   2604 		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   2605 	    else if (c < 0x10000)
   2606 		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   2607 	    else
   2608 		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   2609 
   2610 	    for ( ; bits >= 0; bits-= 6) {
   2611 		*out++  = ((c >> bits) & 0x3F) | 0x80;
   2612 	    }
   2613 	    NEXT;
   2614 	}
   2615     }
   2616     *out = 0;
   2617     return(buffer);
   2618 }
   2619 
   2620 /**
   2621  * htmlParseEntityRef:
   2622  * @ctxt:  an HTML parser context
   2623  * @str:  location to store the entity name
   2624  *
   2625  * parse an HTML ENTITY references
   2626  *
   2627  * [68] EntityRef ::= '&' Name ';'
   2628  *
   2629  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
   2630  *         if non-NULL *str will have to be freed by the caller.
   2631  */
   2632 const htmlEntityDesc *
   2633 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
   2634     const xmlChar *name;
   2635     const htmlEntityDesc * ent = NULL;
   2636 
   2637     if (str != NULL) *str = NULL;
   2638     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
   2639 
   2640     if (CUR == '&') {
   2641         NEXT;
   2642         name = htmlParseName(ctxt);
   2643 	if (name == NULL) {
   2644 	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   2645 	                 "htmlParseEntityRef: no name\n", NULL, NULL);
   2646 	} else {
   2647 	    GROW;
   2648 	    if (CUR == ';') {
   2649 	        if (str != NULL)
   2650 		    *str = name;
   2651 
   2652 		/*
   2653 		 * Lookup the entity in the table.
   2654 		 */
   2655 		ent = htmlEntityLookup(name);
   2656 		if (ent != NULL) /* OK that's ugly !!! */
   2657 		    NEXT;
   2658 	    } else {
   2659 		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
   2660 		             "htmlParseEntityRef: expecting ';'\n",
   2661 			     NULL, NULL);
   2662 	        if (str != NULL)
   2663 		    *str = name;
   2664 	    }
   2665 	}
   2666     }
   2667     return(ent);
   2668 }
   2669 
   2670 /**
   2671  * htmlParseAttValue:
   2672  * @ctxt:  an HTML parser context
   2673  *
   2674  * parse a value for an attribute
   2675  * Note: the parser won't do substitution of entities here, this
   2676  * will be handled later in xmlStringGetNodeList, unless it was
   2677  * asked for ctxt->replaceEntities != 0
   2678  *
   2679  * Returns the AttValue parsed or NULL.
   2680  */
   2681 
   2682 static xmlChar *
   2683 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
   2684     xmlChar *ret = NULL;
   2685 
   2686     if (CUR == '"') {
   2687         NEXT;
   2688 	ret = htmlParseHTMLAttribute(ctxt, '"');
   2689         if (CUR != '"') {
   2690 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
   2691 	                 "AttValue: \" expected\n", NULL, NULL);
   2692 	} else
   2693 	    NEXT;
   2694     } else if (CUR == '\'') {
   2695         NEXT;
   2696 	ret = htmlParseHTMLAttribute(ctxt, '\'');
   2697         if (CUR != '\'') {
   2698 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
   2699 	                 "AttValue: ' expected\n", NULL, NULL);
   2700 	} else
   2701 	    NEXT;
   2702     } else {
   2703         /*
   2704 	 * That's an HTMLism, the attribute value may not be quoted
   2705 	 */
   2706 	ret = htmlParseHTMLAttribute(ctxt, 0);
   2707 	if (ret == NULL) {
   2708 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
   2709 	                 "AttValue: no value found\n", NULL, NULL);
   2710 	}
   2711     }
   2712     return(ret);
   2713 }
   2714 
   2715 /**
   2716  * htmlParseSystemLiteral:
   2717  * @ctxt:  an HTML parser context
   2718  *
   2719  * parse an HTML Literal
   2720  *
   2721  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
   2722  *
   2723  * Returns the SystemLiteral parsed or NULL
   2724  */
   2725 
   2726 static xmlChar *
   2727 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
   2728     const xmlChar *q;
   2729     xmlChar *ret = NULL;
   2730 
   2731     if (CUR == '"') {
   2732         NEXT;
   2733 	q = CUR_PTR;
   2734 	while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
   2735 	    NEXT;
   2736 	if (!IS_CHAR_CH(CUR)) {
   2737 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
   2738 			 "Unfinished SystemLiteral\n", NULL, NULL);
   2739 	} else {
   2740 	    ret = xmlStrndup(q, CUR_PTR - q);
   2741 	    NEXT;
   2742         }
   2743     } else if (CUR == '\'') {
   2744         NEXT;
   2745 	q = CUR_PTR;
   2746 	while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
   2747 	    NEXT;
   2748 	if (!IS_CHAR_CH(CUR)) {
   2749 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
   2750 			 "Unfinished SystemLiteral\n", NULL, NULL);
   2751 	} else {
   2752 	    ret = xmlStrndup(q, CUR_PTR - q);
   2753 	    NEXT;
   2754         }
   2755     } else {
   2756 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
   2757 	             " or ' expected\n", NULL, NULL);
   2758     }
   2759 
   2760     return(ret);
   2761 }
   2762 
   2763 /**
   2764  * htmlParsePubidLiteral:
   2765  * @ctxt:  an HTML parser context
   2766  *
   2767  * parse an HTML public literal
   2768  *
   2769  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
   2770  *
   2771  * Returns the PubidLiteral parsed or NULL.
   2772  */
   2773 
   2774 static xmlChar *
   2775 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
   2776     const xmlChar *q;
   2777     xmlChar *ret = NULL;
   2778     /*
   2779      * Name ::= (Letter | '_') (NameChar)*
   2780      */
   2781     if (CUR == '"') {
   2782         NEXT;
   2783 	q = CUR_PTR;
   2784 	while (IS_PUBIDCHAR_CH(CUR)) NEXT;
   2785 	if (CUR != '"') {
   2786 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
   2787 	                 "Unfinished PubidLiteral\n", NULL, NULL);
   2788 	} else {
   2789 	    ret = xmlStrndup(q, CUR_PTR - q);
   2790 	    NEXT;
   2791 	}
   2792     } else if (CUR == '\'') {
   2793         NEXT;
   2794 	q = CUR_PTR;
   2795 	while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
   2796 	    NEXT;
   2797 	if (CUR != '\'') {
   2798 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
   2799 	                 "Unfinished PubidLiteral\n", NULL, NULL);
   2800 	} else {
   2801 	    ret = xmlStrndup(q, CUR_PTR - q);
   2802 	    NEXT;
   2803 	}
   2804     } else {
   2805 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
   2806 	             "PubidLiteral \" or ' expected\n", NULL, NULL);
   2807     }
   2808 
   2809     return(ret);
   2810 }
   2811 
   2812 /**
   2813  * htmlParseScript:
   2814  * @ctxt:  an HTML parser context
   2815  *
   2816  * parse the content of an HTML SCRIPT or STYLE element
   2817  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
   2818  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
   2819  * http://www.w3.org/TR/html4/types.html#type-script
   2820  * http://www.w3.org/TR/html4/types.html#h-6.15
   2821  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
   2822  *
   2823  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
   2824  * element and the value of intrinsic event attributes. User agents must
   2825  * not evaluate script data as HTML markup but instead must pass it on as
   2826  * data to a script engine.
   2827  * NOTES:
   2828  * - The content is passed like CDATA
   2829  * - the attributes for style and scripting "onXXX" are also described
   2830  *   as CDATA but SGML allows entities references in attributes so their
   2831  *   processing is identical as other attributes
   2832  */
   2833 static void
   2834 htmlParseScript(htmlParserCtxtPtr ctxt) {
   2835     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
   2836     int nbchar = 0;
   2837     int cur,l;
   2838 
   2839     SHRINK;
   2840     cur = CUR_CHAR(l);
   2841     while (IS_CHAR_CH(cur)) {
   2842 	if ((cur == '<') && (NXT(1) == '/')) {
   2843             /*
   2844              * One should break here, the specification is clear:
   2845              * Authors should therefore escape "</" within the content.
   2846              * Escape mechanisms are specific to each scripting or
   2847              * style sheet language.
   2848              *
   2849              * In recovery mode, only break if end tag match the
   2850              * current tag, effectively ignoring all tags inside the
   2851              * script/style block and treating the entire block as
   2852              * CDATA.
   2853              */
   2854             if (ctxt->recovery) {
   2855                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
   2856 				   xmlStrlen(ctxt->name)) == 0)
   2857                 {
   2858                     break; /* while */
   2859                 } else {
   2860 		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
   2861 				 "Element %s embeds close tag\n",
   2862 		                 ctxt->name, NULL);
   2863 		}
   2864             } else {
   2865                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
   2866                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
   2867                 {
   2868                     break; /* while */
   2869                 }
   2870             }
   2871 	}
   2872 	COPY_BUF(l,buf,nbchar,cur);
   2873 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
   2874 	    if (ctxt->sax->cdataBlock!= NULL) {
   2875 		/*
   2876 		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
   2877 		 */
   2878 		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
   2879 	    } else if (ctxt->sax->characters != NULL) {
   2880 		ctxt->sax->characters(ctxt->userData, buf, nbchar);
   2881 	    }
   2882 	    nbchar = 0;
   2883 	}
   2884 	GROW;
   2885 	NEXTL(l);
   2886 	cur = CUR_CHAR(l);
   2887     }
   2888 
   2889     if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
   2890         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
   2891                     "Invalid char in CDATA 0x%X\n", cur);
   2892         if (ctxt->input->cur < ctxt->input->end) {
   2893             NEXT;
   2894         }
   2895     }
   2896 
   2897     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
   2898 	if (ctxt->sax->cdataBlock!= NULL) {
   2899 	    /*
   2900 	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
   2901 	     */
   2902 	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
   2903 	} else if (ctxt->sax->characters != NULL) {
   2904 	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
   2905 	}
   2906     }
   2907 }
   2908 
   2909 
   2910 /**
   2911  * htmlParseCharData:
   2912  * @ctxt:  an HTML parser context
   2913  *
   2914  * parse a CharData section.
   2915  * if we are within a CDATA section ']]>' marks an end of section.
   2916  *
   2917  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
   2918  */
   2919 
   2920 static void
   2921 htmlParseCharData(htmlParserCtxtPtr ctxt) {
   2922     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
   2923     int nbchar = 0;
   2924     int cur, l;
   2925     int chunk = 0;
   2926 
   2927     SHRINK;
   2928     cur = CUR_CHAR(l);
   2929     while (((cur != '<') || (ctxt->token == '<')) &&
   2930            ((cur != '&') || (ctxt->token == '&')) &&
   2931 	   (cur != 0)) {
   2932 	if (!(IS_CHAR(cur))) {
   2933 	    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
   2934 	                "Invalid char in CDATA 0x%X\n", cur);
   2935 	} else {
   2936 	    COPY_BUF(l,buf,nbchar,cur);
   2937 	}
   2938 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
   2939 	    /*
   2940 	     * Ok the segment is to be consumed as chars.
   2941 	     */
   2942 	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
   2943 		if (areBlanks(ctxt, buf, nbchar)) {
   2944 		    if (ctxt->sax->ignorableWhitespace != NULL)
   2945 			ctxt->sax->ignorableWhitespace(ctxt->userData,
   2946 			                               buf, nbchar);
   2947 		} else {
   2948 		    htmlCheckParagraph(ctxt);
   2949 		    if (ctxt->sax->characters != NULL)
   2950 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
   2951 		}
   2952 	    }
   2953 	    nbchar = 0;
   2954 	}
   2955 	NEXTL(l);
   2956         chunk++;
   2957         if (chunk > HTML_PARSER_BUFFER_SIZE) {
   2958             chunk = 0;
   2959             SHRINK;
   2960             GROW;
   2961         }
   2962 	cur = CUR_CHAR(l);
   2963 	if (cur == 0) {
   2964 	    SHRINK;
   2965 	    GROW;
   2966 	    cur = CUR_CHAR(l);
   2967 	}
   2968     }
   2969     if (nbchar != 0) {
   2970         buf[nbchar] = 0;
   2971 
   2972 	/*
   2973 	 * Ok the segment is to be consumed as chars.
   2974 	 */
   2975 	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
   2976 	    if (areBlanks(ctxt, buf, nbchar)) {
   2977 		if (ctxt->sax->ignorableWhitespace != NULL)
   2978 		    ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
   2979 	    } else {
   2980 		htmlCheckParagraph(ctxt);
   2981 		if (ctxt->sax->characters != NULL)
   2982 		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
   2983 	    }
   2984 	}
   2985     } else {
   2986 	/*
   2987 	 * Loop detection
   2988 	 */
   2989 	if (cur == 0)
   2990 	    ctxt->instate = XML_PARSER_EOF;
   2991     }
   2992 }
   2993 
   2994 /**
   2995  * htmlParseExternalID:
   2996  * @ctxt:  an HTML parser context
   2997  * @publicID:  a xmlChar** receiving PubidLiteral
   2998  *
   2999  * Parse an External ID or a Public ID
   3000  *
   3001  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
   3002  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
   3003  *
   3004  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
   3005  *
   3006  * Returns the function returns SystemLiteral and in the second
   3007  *                case publicID receives PubidLiteral, is strict is off
   3008  *                it is possible to return NULL and have publicID set.
   3009  */
   3010 
   3011 static xmlChar *
   3012 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
   3013     xmlChar *URI = NULL;
   3014 
   3015     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
   3016          (UPP(2) == 'S') && (UPP(3) == 'T') &&
   3017 	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
   3018         SKIP(6);
   3019 	if (!IS_BLANK_CH(CUR)) {
   3020 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
   3021 	                 "Space required after 'SYSTEM'\n", NULL, NULL);
   3022 	}
   3023         SKIP_BLANKS;
   3024 	URI = htmlParseSystemLiteral(ctxt);
   3025 	if (URI == NULL) {
   3026 	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
   3027 	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
   3028         }
   3029     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
   3030 	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
   3031 	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
   3032         SKIP(6);
   3033 	if (!IS_BLANK_CH(CUR)) {
   3034 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
   3035 	                 "Space required after 'PUBLIC'\n", NULL, NULL);
   3036 	}
   3037         SKIP_BLANKS;
   3038 	*publicID = htmlParsePubidLiteral(ctxt);
   3039 	if (*publicID == NULL) {
   3040 	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
   3041 	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
   3042 			 NULL, NULL);
   3043 	}
   3044         SKIP_BLANKS;
   3045         if ((CUR == '"') || (CUR == '\'')) {
   3046 	    URI = htmlParseSystemLiteral(ctxt);
   3047 	}
   3048     }
   3049     return(URI);
   3050 }
   3051 
   3052 /**
 * htmlParsePI:
 * @ctxt:  an HTML parser context
   3055  *
   3056  * parse an XML Processing Instruction.
   3057  *
   3058  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
   3059  */
   3060 static void
   3061 htmlParsePI(htmlParserCtxtPtr ctxt) {
   3062     xmlChar *buf = NULL;
   3063     int len = 0;
   3064     int size = HTML_PARSER_BUFFER_SIZE;
   3065     int cur, l;
   3066     const xmlChar *target;
   3067     xmlParserInputState state;
   3068     int count = 0;
   3069 
   3070     if ((RAW == '<') && (NXT(1) == '?')) {
   3071 	state = ctxt->instate;
   3072         ctxt->instate = XML_PARSER_PI;
   3073 	/*
   3074 	 * this is a Processing Instruction.
   3075 	 */
   3076 	SKIP(2);
   3077 	SHRINK;
   3078 
   3079 	/*
   3080 	 * Parse the target name and check for special support like
   3081 	 * namespace.
   3082 	 */
   3083         target = htmlParseName(ctxt);
   3084 	if (target != NULL) {
   3085 	    if (RAW == '>') {
   3086 		SKIP(1);
   3087 
   3088 		/*
   3089 		 * SAX: PI detected.
   3090 		 */
   3091 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
   3092 		    (ctxt->sax->processingInstruction != NULL))
   3093 		    ctxt->sax->processingInstruction(ctxt->userData,
   3094 		                                     target, NULL);
   3095 		ctxt->instate = state;
   3096 		return;
   3097 	    }
   3098 	    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
   3099 	    if (buf == NULL) {
   3100 		htmlErrMemory(ctxt, NULL);
   3101 		ctxt->instate = state;
   3102 		return;
   3103 	    }
   3104 	    cur = CUR;
   3105 	    if (!IS_BLANK(cur)) {
   3106 		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
   3107 			  "ParsePI: PI %s space expected\n", target, NULL);
   3108 	    }
   3109             SKIP_BLANKS;
   3110 	    cur = CUR_CHAR(l);
   3111 	    while (IS_CHAR(cur) && (cur != '>')) {
   3112 		if (len + 5 >= size) {
   3113 		    xmlChar *tmp;
   3114 
   3115 		    size *= 2;
   3116 		    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
   3117 		    if (tmp == NULL) {
   3118 			htmlErrMemory(ctxt, NULL);
   3119 			xmlFree(buf);
   3120 			ctxt->instate = state;
   3121 			return;
   3122 		    }
   3123 		    buf = tmp;
   3124 		}
   3125 		count++;
   3126 		if (count > 50) {
   3127 		    GROW;
   3128 		    count = 0;
   3129 		}
   3130 		COPY_BUF(l,buf,len,cur);
   3131 		NEXTL(l);
   3132 		cur = CUR_CHAR(l);
   3133 		if (cur == 0) {
   3134 		    SHRINK;
   3135 		    GROW;
   3136 		    cur = CUR_CHAR(l);
   3137 		}
   3138 	    }
   3139 	    buf[len] = 0;
   3140 	    if (cur != '>') {
   3141 		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
   3142 		      "ParsePI: PI %s never end ...\n", target, NULL);
   3143 	    } else {
   3144 		SKIP(1);
   3145 
   3146 		/*
   3147 		 * SAX: PI detected.
   3148 		 */
   3149 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
   3150 		    (ctxt->sax->processingInstruction != NULL))
   3151 		    ctxt->sax->processingInstruction(ctxt->userData,
   3152 		                                     target, buf);
   3153 	    }
   3154 	    xmlFree(buf);
   3155 	} else {
   3156 	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
   3157                          "PI is not started correctly", NULL, NULL);
   3158 	}
   3159 	ctxt->instate = state;
   3160     }
   3161 }
   3162 
   3163 /**
   3164  * htmlParseComment:
   3165  * @ctxt:  an HTML parser context
   3166  *
   3167  * Parse an XML (SGML) comment <!-- .... -->
   3168  *
   3169  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
   3170  */
   3171 static void
   3172 htmlParseComment(htmlParserCtxtPtr ctxt) {
   3173     xmlChar *buf = NULL;
   3174     int len;
   3175     int size = HTML_PARSER_BUFFER_SIZE;
   3176     int q, ql;
   3177     int r, rl;
   3178     int cur, l;
   3179     xmlParserInputState state;
   3180 
   3181     /*
   3182      * Check that there is a comment right here.
   3183      */
   3184     if ((RAW != '<') || (NXT(1) != '!') ||
   3185         (NXT(2) != '-') || (NXT(3) != '-')) return;
   3186 
   3187     state = ctxt->instate;
   3188     ctxt->instate = XML_PARSER_COMMENT;
   3189     SHRINK;
   3190     SKIP(4);
   3191     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
   3192     if (buf == NULL) {
   3193         htmlErrMemory(ctxt, "buffer allocation failed\n");
   3194 	ctxt->instate = state;
   3195 	return;
   3196     }
   3197     q = CUR_CHAR(ql);
   3198     NEXTL(ql);
   3199     r = CUR_CHAR(rl);
   3200     NEXTL(rl);
   3201     cur = CUR_CHAR(l);
   3202     len = 0;
   3203     while (IS_CHAR(cur) &&
   3204            ((cur != '>') ||
   3205 	    (r != '-') || (q != '-'))) {
   3206 	if (len + 5 >= size) {
   3207 	    xmlChar *tmp;
   3208 
   3209 	    size *= 2;
   3210 	    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
   3211 	    if (tmp == NULL) {
   3212 	        xmlFree(buf);
   3213 	        htmlErrMemory(ctxt, "growing buffer failed\n");
   3214 		ctxt->instate = state;
   3215 		return;
   3216 	    }
   3217 	    buf = tmp;
   3218 	}
   3219 	COPY_BUF(ql,buf,len,q);
   3220 	q = r;
   3221 	ql = rl;
   3222 	r = cur;
   3223 	rl = l;
   3224 	NEXTL(l);
   3225 	cur = CUR_CHAR(l);
   3226 	if (cur == 0) {
   3227 	    SHRINK;
   3228 	    GROW;
   3229 	    cur = CUR_CHAR(l);
   3230 	}
   3231     }
   3232     buf[len] = 0;
   3233     if (!IS_CHAR(cur)) {
   3234 	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
   3235 	             "Comment not terminated \n<!--%.50s\n", buf, NULL);
   3236 	xmlFree(buf);
   3237     } else {
   3238         NEXT;
   3239 	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
   3240 	    (!ctxt->disableSAX))
   3241 	    ctxt->sax->comment(ctxt->userData, buf);
   3242 	xmlFree(buf);
   3243     }
   3244     ctxt->instate = state;
   3245 }
   3246 
   3247 /**
   3248  * htmlParseCharRef:
   3249  * @ctxt:  an HTML parser context
   3250  *
   3251  * parse Reference declarations
   3252  *
   3253  * [66] CharRef ::= '&#' [0-9]+ ';' |
   3254  *                  '&#x' [0-9a-fA-F]+ ';'
   3255  *
   3256  * Returns the value parsed (as an int)
   3257  */
   3258 int
   3259 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
   3260     int val = 0;
   3261 
   3262     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   3263 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   3264 		     "htmlParseCharRef: context error\n",
   3265 		     NULL, NULL);
   3266         return(0);
   3267     }
   3268     if ((CUR == '&') && (NXT(1) == '#') &&
   3269         ((NXT(2) == 'x') || NXT(2) == 'X')) {
   3270 	SKIP(3);
   3271 	while (CUR != ';') {
   3272 	    if ((CUR >= '0') && (CUR <= '9'))
   3273 	        val = val * 16 + (CUR - '0');
   3274 	    else if ((CUR >= 'a') && (CUR <= 'f'))
   3275 	        val = val * 16 + (CUR - 'a') + 10;
   3276 	    else if ((CUR >= 'A') && (CUR <= 'F'))
   3277 	        val = val * 16 + (CUR - 'A') + 10;
   3278 	    else {
   3279 	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
   3280 		             "htmlParseCharRef: missing semicolon\n",
   3281 			     NULL, NULL);
   3282 		break;
   3283 	    }
   3284 	    NEXT;
   3285 	}
   3286 	if (CUR == ';')
   3287 	    NEXT;
   3288     } else if  ((CUR == '&') && (NXT(1) == '#')) {
   3289 	SKIP(2);
   3290 	while (CUR != ';') {
   3291 	    if ((CUR >= '0') && (CUR <= '9'))
   3292 	        val = val * 10 + (CUR - '0');
   3293 	    else {
   3294 	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
   3295 		             "htmlParseCharRef: missing semicolon\n",
   3296 			     NULL, NULL);
   3297 		break;
   3298 	    }
   3299 	    NEXT;
   3300 	}
   3301 	if (CUR == ';')
   3302 	    NEXT;
   3303     } else {
   3304 	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
   3305 	             "htmlParseCharRef: invalid value\n", NULL, NULL);
   3306     }
   3307     /*
   3308      * Check the value IS_CHAR ...
   3309      */
   3310     if (IS_CHAR(val)) {
   3311         return(val);
   3312     } else {
   3313 	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
   3314 			"htmlParseCharRef: invalid xmlChar value %d\n",
   3315 			val);
   3316     }
   3317     return(0);
   3318 }
   3319 
   3320 
   3321 /**
   3322  * htmlParseDocTypeDecl:
   3323  * @ctxt:  an HTML parser context
   3324  *
   3325  * parse a DOCTYPE declaration
   3326  *
   3327  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
   3328  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
   3329  */
   3330 
   3331 static void
   3332 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
   3333     const xmlChar *name;
   3334     xmlChar *ExternalID = NULL;
   3335     xmlChar *URI = NULL;
   3336 
   3337     /*
   3338      * We know that '<!DOCTYPE' has been detected.
   3339      */
   3340     SKIP(9);
   3341 
   3342     SKIP_BLANKS;
   3343 
   3344     /*
   3345      * Parse the DOCTYPE name.
   3346      */
   3347     name = htmlParseName(ctxt);
   3348     if (name == NULL) {
   3349 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   3350 	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
   3351 		     NULL, NULL);
   3352     }
   3353     /*
   3354      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
   3355      */
   3356 
   3357     SKIP_BLANKS;
   3358 
   3359     /*
   3360      * Check for SystemID and ExternalID
   3361      */
   3362     URI = htmlParseExternalID(ctxt, &ExternalID);
   3363     SKIP_BLANKS;
   3364 
   3365     /*
   3366      * We should be at the end of the DOCTYPE declaration.
   3367      */
   3368     if (CUR != '>') {
   3369 	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
   3370 	             "DOCTYPE improperly terminated\n", NULL, NULL);
   3371         /* We shouldn't try to resynchronize ... */
   3372     }
   3373     NEXT;
   3374 
   3375     /*
   3376      * Create or update the document accordingly to the DOCTYPE
   3377      */
   3378     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
   3379 	(!ctxt->disableSAX))
   3380 	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
   3381 
   3382     /*
   3383      * Cleanup, since we don't use all those identifiers
   3384      */
   3385     if (URI != NULL) xmlFree(URI);
   3386     if (ExternalID != NULL) xmlFree(ExternalID);
   3387 }
   3388 
   3389 /**
   3390  * htmlParseAttribute:
   3391  * @ctxt:  an HTML parser context
   3392  * @value:  a xmlChar ** used to store the value of the attribute
   3393  *
   3394  * parse an attribute
   3395  *
   3396  * [41] Attribute ::= Name Eq AttValue
   3397  *
   3398  * [25] Eq ::= S? '=' S?
   3399  *
   3400  * With namespace:
   3401  *
   3402  * [NS 11] Attribute ::= QName Eq AttValue
   3403  *
   3404  * Also the case QName == xmlns:??? is handled independently as a namespace
   3405  * definition.
   3406  *
   3407  * Returns the attribute name, and the value in *value.
   3408  */
   3409 
   3410 static const xmlChar *
   3411 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
   3412     const xmlChar *name;
   3413     xmlChar *val = NULL;
   3414 
   3415     *value = NULL;
   3416     name = htmlParseHTMLName(ctxt);
   3417     if (name == NULL) {
   3418 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   3419 	             "error parsing attribute name\n", NULL, NULL);
   3420         return(NULL);
   3421     }
   3422 
   3423     /*
   3424      * read the value
   3425      */
   3426     SKIP_BLANKS;
   3427     if (CUR == '=') {
   3428         NEXT;
   3429 	SKIP_BLANKS;
   3430 	val = htmlParseAttValue(ctxt);
   3431     }
   3432 
   3433     *value = val;
   3434     return(name);
   3435 }
   3436 
   3437 /**
   3438  * htmlCheckEncoding:
   3439  * @ctxt:  an HTML parser context
   3440  * @attvalue: the attribute value
   3441  *
   3442  * Checks an http-equiv attribute from a Meta tag to detect
   3443  * the encoding
   3444  * If a new encoding is detected the parser is switched to decode
   3445  * it and pass UTF8
   3446  */
   3447 static void
   3448 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
   3449     const xmlChar *encoding;
   3450 
   3451     if ((ctxt == NULL) || (attvalue == NULL) ||
   3452         (ctxt->options & HTML_PARSE_IGNORE_ENC))
   3453 	return;
   3454 
   3455     /* do not change encoding */
   3456     if (ctxt->input->encoding != NULL)
   3457         return;
   3458 
   3459     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
   3460     if (encoding != NULL) {
   3461 	encoding += 8;
   3462     } else {
   3463 	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
   3464 	if (encoding != NULL)
   3465 	    encoding += 9;
   3466     }
   3467     if (encoding != NULL) {
   3468 	xmlCharEncoding enc;
   3469 	xmlCharEncodingHandlerPtr handler;
   3470 
   3471 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
   3472 
   3473 	if (ctxt->input->encoding != NULL)
   3474 	    xmlFree((xmlChar *) ctxt->input->encoding);
   3475 	ctxt->input->encoding = xmlStrdup(encoding);
   3476 
   3477 	enc = xmlParseCharEncoding((const char *) encoding);
   3478 	/*
   3479 	 * registered set of known encodings
   3480 	 */
   3481 	if (enc != XML_CHAR_ENCODING_ERROR) {
   3482 	    if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
   3483 	         (enc == XML_CHAR_ENCODING_UTF16BE) ||
   3484 		 (enc == XML_CHAR_ENCODING_UCS4LE) ||
   3485 		 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
   3486 		(ctxt->input->buf != NULL) &&
   3487 		(ctxt->input->buf->encoder == NULL)) {
   3488 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
   3489 		             "htmlCheckEncoding: wrong encoding meta\n",
   3490 			     NULL, NULL);
   3491 	    } else {
   3492 		xmlSwitchEncoding(ctxt, enc);
   3493 	    }
   3494 	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
   3495 	} else {
   3496 	    /*
   3497 	     * fallback for unknown encodings
   3498 	     */
   3499 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
   3500 	    if (handler != NULL) {
   3501 		xmlSwitchToEncoding(ctxt, handler);
   3502 		ctxt->charset = XML_CHAR_ENCODING_UTF8;
   3503 	    } else {
   3504 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
   3505 		             "htmlCheckEncoding: unknown encoding %s\n",
   3506 			     encoding, NULL);
   3507 	    }
   3508 	}
   3509 
   3510 	if ((ctxt->input->buf != NULL) &&
   3511 	    (ctxt->input->buf->encoder != NULL) &&
   3512 	    (ctxt->input->buf->raw != NULL) &&
   3513 	    (ctxt->input->buf->buffer != NULL)) {
   3514 	    int nbchars;
   3515 	    int processed;
   3516 
   3517 	    /*
   3518 	     * convert as much as possible to the parser reading buffer.
   3519 	     */
   3520 	    processed = ctxt->input->cur - ctxt->input->base;
   3521 	    xmlBufferShrink(ctxt->input->buf->buffer, processed);
   3522 	    nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
   3523 		                       ctxt->input->buf->buffer,
   3524 				       ctxt->input->buf->raw);
   3525 	    if (nbchars < 0) {
   3526 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
   3527 		             "htmlCheckEncoding: encoder error\n",
   3528 			     NULL, NULL);
   3529 	    }
   3530 	    ctxt->input->base =
   3531 	    ctxt->input->cur = ctxt->input->buf->buffer->content;
   3532             ctxt->input->end =
   3533                           &ctxt->input->base[ctxt->input->buf->buffer->use];
   3534 	}
   3535     }
   3536 }
   3537 
   3538 /**
   3539  * htmlCheckMeta:
   3540  * @ctxt:  an HTML parser context
   3541  * @atts:  the attributes values
   3542  *
   3543  * Checks an attributes from a Meta tag
   3544  */
   3545 static void
   3546 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
   3547     int i;
   3548     const xmlChar *att, *value;
   3549     int http = 0;
   3550     const xmlChar *content = NULL;
   3551 
   3552     if ((ctxt == NULL) || (atts == NULL))
   3553 	return;
   3554 
   3555     i = 0;
   3556     att = atts[i++];
   3557     while (att != NULL) {
   3558 	value = atts[i++];
   3559 	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
   3560 	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
   3561 	    http = 1;
   3562 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
   3563 	    content = value;
   3564 	att = atts[i++];
   3565     }
   3566     if ((http) && (content != NULL))
   3567 	htmlCheckEncoding(ctxt, content);
   3568 
   3569 }
   3570 
   3571 /**
   3572  * htmlParseStartTag:
   3573  * @ctxt:  an HTML parser context
   3574  *
   3575  * parse a start of tag either for rule element or
   3576  * EmptyElement. In both case we don't parse the tag closing chars.
   3577  *
   3578  * [40] STag ::= '<' Name (S Attribute)* S? '>'
   3579  *
   3580  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
   3581  *
   3582  * With namespace:
   3583  *
   3584  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
   3585  *
   3586  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
   3587  *
   3588  * Returns 0 in case of success, -1 in case of error and 1 if discarded
   3589  */
   3590 
   3591 static int
   3592 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
   3593     const xmlChar *name;
   3594     const xmlChar *attname;
   3595     xmlChar *attvalue;
   3596     const xmlChar **atts;
   3597     int nbatts = 0;
   3598     int maxatts;
   3599     int meta = 0;
   3600     int i;
   3601     int discardtag = 0;
   3602 
   3603     if (ctxt->instate == XML_PARSER_EOF)
   3604         return(-1);
   3605     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   3606 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   3607 		     "htmlParseStartTag: context error\n", NULL, NULL);
   3608 	return -1;
   3609     }
   3610     if (CUR != '<') return -1;
   3611     NEXT;
   3612 
   3613     atts = ctxt->atts;
   3614     maxatts = ctxt->maxatts;
   3615 
   3616     GROW;
   3617     name = htmlParseHTMLName(ctxt);
   3618     if (name == NULL) {
   3619 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   3620 	             "htmlParseStartTag: invalid element name\n",
   3621 		     NULL, NULL);
   3622 	/* Dump the bogus tag like browsers do */
   3623 	while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
   3624                (ctxt->instate != XML_PARSER_EOF))
   3625 	    NEXT;
   3626         return -1;
   3627     }
   3628     if (xmlStrEqual(name, BAD_CAST"meta"))
   3629 	meta = 1;
   3630 
   3631     /*
   3632      * Check for auto-closure of HTML elements.
   3633      */
   3634     htmlAutoClose(ctxt, name);
   3635 
   3636     /*
   3637      * Check for implied HTML elements.
   3638      */
   3639     htmlCheckImplied(ctxt, name);
   3640 
   3641     /*
   3642      * Avoid html at any level > 0, head at any level != 1
   3643      * or any attempt to recurse body
   3644      */
   3645     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
   3646 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   3647 	             "htmlParseStartTag: misplaced <html> tag\n",
   3648 		     name, NULL);
   3649 	discardtag = 1;
   3650 	ctxt->depth++;
   3651     }
   3652     if ((ctxt->nameNr != 1) &&
   3653 	(xmlStrEqual(name, BAD_CAST"head"))) {
   3654 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   3655 	             "htmlParseStartTag: misplaced <head> tag\n",
   3656 		     name, NULL);
   3657 	discardtag = 1;
   3658 	ctxt->depth++;
   3659     }
   3660     if (xmlStrEqual(name, BAD_CAST"body")) {
   3661 	int indx;
   3662 	for (indx = 0;indx < ctxt->nameNr;indx++) {
   3663 	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
   3664 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   3665 		             "htmlParseStartTag: misplaced <body> tag\n",
   3666 			     name, NULL);
   3667 		discardtag = 1;
   3668 		ctxt->depth++;
   3669 	    }
   3670 	}
   3671     }
   3672 
   3673     /*
   3674      * Now parse the attributes, it ends up with the ending
   3675      *
   3676      * (S Attribute)* S?
   3677      */
   3678     SKIP_BLANKS;
   3679     while ((IS_CHAR_CH(CUR)) &&
   3680            (CUR != '>') &&
   3681 	   ((CUR != '/') || (NXT(1) != '>'))) {
   3682 	long cons = ctxt->nbChars;
   3683 
   3684 	GROW;
   3685 	attname = htmlParseAttribute(ctxt, &attvalue);
   3686         if (attname != NULL) {
   3687 
   3688 	    /*
   3689 	     * Well formedness requires at most one declaration of an attribute
   3690 	     */
   3691 	    for (i = 0; i < nbatts;i += 2) {
   3692 	        if (xmlStrEqual(atts[i], attname)) {
   3693 		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
   3694 		                 "Attribute %s redefined\n", attname, NULL);
   3695 		    if (attvalue != NULL)
   3696 			xmlFree(attvalue);
   3697 		    goto failed;
   3698 		}
   3699 	    }
   3700 
   3701 	    /*
   3702 	     * Add the pair to atts
   3703 	     */
   3704 	    if (atts == NULL) {
   3705 	        maxatts = 22; /* allow for 10 attrs by default */
   3706 	        atts = (const xmlChar **)
   3707 		       xmlMalloc(maxatts * sizeof(xmlChar *));
   3708 		if (atts == NULL) {
   3709 		    htmlErrMemory(ctxt, NULL);
   3710 		    if (attvalue != NULL)
   3711 			xmlFree(attvalue);
   3712 		    goto failed;
   3713 		}
   3714 		ctxt->atts = atts;
   3715 		ctxt->maxatts = maxatts;
   3716 	    } else if (nbatts + 4 > maxatts) {
   3717 	        const xmlChar **n;
   3718 
   3719 	        maxatts *= 2;
   3720 	        n = (const xmlChar **) xmlRealloc((void *) atts,
   3721 					     maxatts * sizeof(const xmlChar *));
   3722 		if (n == NULL) {
   3723 		    htmlErrMemory(ctxt, NULL);
   3724 		    if (attvalue != NULL)
   3725 			xmlFree(attvalue);
   3726 		    goto failed;
   3727 		}
   3728 		atts = n;
   3729 		ctxt->atts = atts;
   3730 		ctxt->maxatts = maxatts;
   3731 	    }
   3732 	    atts[nbatts++] = attname;
   3733 	    atts[nbatts++] = attvalue;
   3734 	    atts[nbatts] = NULL;
   3735 	    atts[nbatts + 1] = NULL;
   3736 	}
   3737 	else {
   3738 	    if (attvalue != NULL)
   3739 	        xmlFree(attvalue);
   3740 	    /* Dump the bogus attribute string up to the next blank or
   3741 	     * the end of the tag. */
   3742 	    while ((IS_CHAR_CH(CUR)) &&
   3743 	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
   3744 		   ((CUR != '/') || (NXT(1) != '>')))
   3745 		NEXT;
   3746 	}
   3747 
   3748 failed:
   3749 	SKIP_BLANKS;
   3750         if (cons == ctxt->nbChars) {
   3751 	    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   3752 	                 "htmlParseStartTag: problem parsing attributes\n",
   3753 			 NULL, NULL);
   3754 	    break;
   3755 	}
   3756     }
   3757 
   3758     /*
   3759      * Handle specific association to the META tag
   3760      */
   3761     if (meta && (nbatts != 0))
   3762 	htmlCheckMeta(ctxt, atts);
   3763 
   3764     /*
   3765      * SAX: Start of Element !
   3766      */
   3767     if (!discardtag) {
   3768 	htmlnamePush(ctxt, name);
   3769 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
   3770 	    if (nbatts != 0)
   3771 		ctxt->sax->startElement(ctxt->userData, name, atts);
   3772 	    else
   3773 		ctxt->sax->startElement(ctxt->userData, name, NULL);
   3774 	}
   3775     }
   3776 
   3777     if (atts != NULL) {
   3778         for (i = 1;i < nbatts;i += 2) {
   3779 	    if (atts[i] != NULL)
   3780 		xmlFree((xmlChar *) atts[i]);
   3781 	}
   3782     }
   3783 
   3784     return(discardtag);
   3785 }
   3786 
   3787 /**
   3788  * htmlParseEndTag:
   3789  * @ctxt:  an HTML parser context
   3790  *
   3791  * parse an end of tag
   3792  *
   3793  * [42] ETag ::= '</' Name S? '>'
   3794  *
   3795  * With namespace
   3796  *
   3797  * [NS 9] ETag ::= '</' QName S? '>'
   3798  *
   3799  * Returns 1 if the current level should be closed.
   3800  */
   3801 
   3802 static int
   3803 htmlParseEndTag(htmlParserCtxtPtr ctxt)
   3804 {
   3805     const xmlChar *name;
   3806     const xmlChar *oldname;
   3807     int i, ret;
   3808 
   3809     if ((CUR != '<') || (NXT(1) != '/')) {
   3810         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
   3811 	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
   3812         return (0);
   3813     }
   3814     SKIP(2);
   3815 
   3816     name = htmlParseHTMLName(ctxt);
   3817     if (name == NULL)
   3818         return (0);
   3819     /*
   3820      * We should definitely be at the ending "S? '>'" part
   3821      */
   3822     SKIP_BLANKS;
   3823     if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
   3824         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
   3825 	             "End tag : expected '>'\n", NULL, NULL);
   3826 	if (ctxt->recovery) {
   3827 	    /*
   3828 	     * We're not at the ending > !!
   3829 	     * Error, unless in recover mode where we search forwards
   3830 	     * until we find a >
   3831 	     */
   3832 	    while (CUR != '\0' && CUR != '>') NEXT;
   3833 	    NEXT;
   3834 	}
   3835     } else
   3836         NEXT;
   3837 
   3838     /*
   3839      * if we ignored misplaced tags in htmlParseStartTag don't pop them
   3840      * out now.
   3841      */
   3842     if ((ctxt->depth > 0) &&
   3843         (xmlStrEqual(name, BAD_CAST "html") ||
   3844          xmlStrEqual(name, BAD_CAST "body") ||
   3845 	 xmlStrEqual(name, BAD_CAST "head"))) {
   3846 	ctxt->depth--;
   3847 	return (0);
   3848     }
   3849 
   3850     /*
   3851      * If the name read is not one of the element in the parsing stack
   3852      * then return, it's just an error.
   3853      */
   3854     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
   3855         if (xmlStrEqual(name, ctxt->nameTab[i]))
   3856             break;
   3857     }
   3858     if (i < 0) {
   3859         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
   3860 	             "Unexpected end tag : %s\n", name, NULL);
   3861         return (0);
   3862     }
   3863 
   3864 
   3865     /*
   3866      * Check for auto-closure of HTML elements.
   3867      */
   3868 
   3869     htmlAutoCloseOnClose(ctxt, name);
   3870 
   3871     /*
   3872      * Well formedness constraints, opening and closing must match.
   3873      * With the exception that the autoclose may have popped stuff out
   3874      * of the stack.
   3875      */
   3876     if (!xmlStrEqual(name, ctxt->name)) {
   3877         if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
   3878             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
   3879 	                 "Opening and ending tag mismatch: %s and %s\n",
   3880 			 name, ctxt->name);
   3881         }
   3882     }
   3883 
   3884     /*
   3885      * SAX: End of Tag
   3886      */
   3887     oldname = ctxt->name;
   3888     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
   3889         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   3890             ctxt->sax->endElement(ctxt->userData, name);
   3891         htmlnamePop(ctxt);
   3892         ret = 1;
   3893     } else {
   3894         ret = 0;
   3895     }
   3896 
   3897     return (ret);
   3898 }
   3899 
   3900 
   3901 /**
   3902  * htmlParseReference:
   3903  * @ctxt:  an HTML parser context
   3904  *
   3905  * parse and handle entity references in content,
   3906  * this will end-up in a call to character() since this is either a
   3907  * CharRef, or a predefined entity.
   3908  */
   3909 static void
   3910 htmlParseReference(htmlParserCtxtPtr ctxt) {
   3911     const htmlEntityDesc * ent;
   3912     xmlChar out[6];
   3913     const xmlChar *name;
   3914     if (CUR != '&') return;
   3915 
   3916     if (NXT(1) == '#') {
   3917 	unsigned int c;
   3918 	int bits, i = 0;
   3919 
   3920 	c = htmlParseCharRef(ctxt);
   3921 	if (c == 0)
   3922 	    return;
   3923 
   3924         if      (c <    0x80) { out[i++]= c;                bits= -6; }
   3925         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   3926         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   3927         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   3928 
   3929         for ( ; bits >= 0; bits-= 6) {
   3930             out[i++]= ((c >> bits) & 0x3F) | 0x80;
   3931         }
   3932 	out[i] = 0;
   3933 
   3934 	htmlCheckParagraph(ctxt);
   3935 	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
   3936 	    ctxt->sax->characters(ctxt->userData, out, i);
   3937     } else {
   3938 	ent = htmlParseEntityRef(ctxt, &name);
   3939 	if (name == NULL) {
   3940 	    htmlCheckParagraph(ctxt);
   3941 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
   3942 	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
   3943 	    return;
   3944 	}
   3945 	if ((ent == NULL) || !(ent->value > 0)) {
   3946 	    htmlCheckParagraph(ctxt);
   3947 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
   3948 		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
   3949 		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
   3950 		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
   3951 	    }
   3952 	} else {
   3953 	    unsigned int c;
   3954 	    int bits, i = 0;
   3955 
   3956 	    c = ent->value;
   3957 	    if      (c <    0x80)
   3958 	            { out[i++]= c;                bits= -6; }
   3959 	    else if (c <   0x800)
   3960 	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   3961 	    else if (c < 0x10000)
   3962 	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   3963 	    else
   3964 	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   3965 
   3966 	    for ( ; bits >= 0; bits-= 6) {
   3967 		out[i++]= ((c >> bits) & 0x3F) | 0x80;
   3968 	    }
   3969 	    out[i] = 0;
   3970 
   3971 	    htmlCheckParagraph(ctxt);
   3972 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
   3973 		ctxt->sax->characters(ctxt->userData, out, i);
   3974 	}
   3975     }
   3976 }
   3977 
   3978 /**
   3979  * htmlParseContent:
   3980  * @ctxt:  an HTML parser context
   3981  *
   3982  * Parse a content: comment, sub-element, reference or text.
   3983  * Kept for compatibility with old code
   3984  */
   3985 
   3986 static void
   3987 htmlParseContent(htmlParserCtxtPtr ctxt) {
   3988     xmlChar *currentNode;
   3989     int depth;
   3990     const xmlChar *name;
   3991 
   3992     currentNode = xmlStrdup(ctxt->name);
   3993     depth = ctxt->nameNr;
   3994     while (1) {
   3995 	long cons = ctxt->nbChars;
   3996 
   3997         GROW;
   3998 
   3999         if (ctxt->instate == XML_PARSER_EOF)
   4000             break;
   4001 
   4002 	/*
   4003 	 * Our tag or one of it's parent or children is ending.
   4004 	 */
   4005         if ((CUR == '<') && (NXT(1) == '/')) {
   4006 	    if (htmlParseEndTag(ctxt) &&
   4007 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
   4008 		if (currentNode != NULL)
   4009 		    xmlFree(currentNode);
   4010 		return;
   4011 	    }
   4012 	    continue; /* while */
   4013         }
   4014 
   4015 	else if ((CUR == '<') &&
   4016 	         ((IS_ASCII_LETTER(NXT(1))) ||
   4017 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
   4018 	    name = htmlParseHTMLName_nonInvasive(ctxt);
   4019 	    if (name == NULL) {
   4020 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   4021 			 "htmlParseStartTag: invalid element name\n",
   4022 			 NULL, NULL);
   4023 	        /* Dump the bogus tag like browsers do */
   4024         while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
   4025 	            NEXT;
   4026 
   4027 	        if (currentNode != NULL)
   4028 	            xmlFree(currentNode);
   4029 	        return;
   4030 	    }
   4031 
   4032 	    if (ctxt->name != NULL) {
   4033 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
   4034 	            htmlAutoClose(ctxt, name);
   4035 	            continue;
   4036 	        }
   4037 	    }
   4038 	}
   4039 
   4040 	/*
   4041 	 * Has this node been popped out during parsing of
   4042 	 * the next element
   4043 	 */
   4044         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
   4045 	    (!xmlStrEqual(currentNode, ctxt->name)))
   4046 	     {
   4047 	    if (currentNode != NULL) xmlFree(currentNode);
   4048 	    return;
   4049 	}
   4050 
   4051 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
   4052 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
   4053 	    /*
   4054 	     * Handle SCRIPT/STYLE separately
   4055 	     */
   4056 	    htmlParseScript(ctxt);
   4057 	} else {
   4058 	    /*
   4059 	     * Sometimes DOCTYPE arrives in the middle of the document
   4060 	     */
   4061 	    if ((CUR == '<') && (NXT(1) == '!') &&
   4062 		(UPP(2) == 'D') && (UPP(3) == 'O') &&
   4063 		(UPP(4) == 'C') && (UPP(5) == 'T') &&
   4064 		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
   4065 		(UPP(8) == 'E')) {
   4066 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   4067 		             "Misplaced DOCTYPE declaration\n",
   4068 			     BAD_CAST "DOCTYPE" , NULL);
   4069 		htmlParseDocTypeDecl(ctxt);
   4070 	    }
   4071 
   4072 	    /*
   4073 	     * First case :  a comment
   4074 	     */
   4075 	    if ((CUR == '<') && (NXT(1) == '!') &&
   4076 		(NXT(2) == '-') && (NXT(3) == '-')) {
   4077 		htmlParseComment(ctxt);
   4078 	    }
   4079 
   4080 	    /*
   4081 	     * Second case : a Processing Instruction.
   4082 	     */
   4083 	    else if ((CUR == '<') && (NXT(1) == '?')) {
   4084 		htmlParsePI(ctxt);
   4085 	    }
   4086 
   4087 	    /*
   4088 	     * Third case :  a sub-element.
   4089 	     */
   4090 	    else if (CUR == '<') {
   4091 		htmlParseElement(ctxt);
   4092 	    }
   4093 
   4094 	    /*
   4095 	     * Fourth case : a reference. If if has not been resolved,
   4096 	     *    parsing returns it's Name, create the node
   4097 	     */
   4098 	    else if (CUR == '&') {
   4099 		htmlParseReference(ctxt);
   4100 	    }
   4101 
   4102 	    /*
   4103 	     * Fifth case : end of the resource
   4104 	     */
   4105 	    else if (CUR == 0) {
   4106 		htmlAutoCloseOnEnd(ctxt);
   4107 		break;
   4108 	    }
   4109 
   4110 	    /*
   4111 	     * Last case, text. Note that References are handled directly.
   4112 	     */
   4113 	    else {
   4114 		htmlParseCharData(ctxt);
   4115 	    }
   4116 
   4117 	    if (cons == ctxt->nbChars) {
   4118 		if (ctxt->node != NULL) {
   4119 		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   4120 		                 "detected an error in element content\n",
   4121 				 NULL, NULL);
   4122 		}
   4123 		break;
   4124 	    }
   4125 	}
   4126         GROW;
   4127     }
   4128     if (currentNode != NULL) xmlFree(currentNode);
   4129 }
   4130 
   4131 /**
   4132  * htmlParseElement:
   4133  * @ctxt:  an HTML parser context
   4134  *
   4135  * parse an HTML element, this is highly recursive
   4136  * this is kept for compatibility with previous code versions
   4137  *
   4138  * [39] element ::= EmptyElemTag | STag content ETag
   4139  *
   4140  * [41] Attribute ::= Name Eq AttValue
   4141  */
   4142 
   4143 void
   4144 htmlParseElement(htmlParserCtxtPtr ctxt) {
   4145     const xmlChar *name;
   4146     xmlChar *currentNode = NULL;
   4147     const htmlElemDesc * info;
   4148     htmlParserNodeInfo node_info;
   4149     int failed;
   4150     int depth;
   4151     const xmlChar *oldptr;
   4152 
   4153     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   4154 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   4155 		     "htmlParseElement: context error\n", NULL, NULL);
   4156 	return;
   4157     }
   4158 
   4159     if (ctxt->instate == XML_PARSER_EOF)
   4160         return;
   4161 
   4162     /* Capture start position */
   4163     if (ctxt->record_info) {
   4164         node_info.begin_pos = ctxt->input->consumed +
   4165                           (CUR_PTR - ctxt->input->base);
   4166 	node_info.begin_line = ctxt->input->line;
   4167     }
   4168 
   4169     failed = htmlParseStartTag(ctxt);
   4170     name = ctxt->name;
   4171     if ((failed == -1) || (name == NULL)) {
   4172 	if (CUR == '>')
   4173 	    NEXT;
   4174         return;
   4175     }
   4176 
   4177     /*
   4178      * Lookup the info for that element.
   4179      */
   4180     info = htmlTagLookup(name);
   4181     if (info == NULL) {
   4182 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
   4183 	             "Tag %s invalid\n", name, NULL);
   4184     }
   4185 
   4186     /*
   4187      * Check for an Empty Element labeled the XML/SGML way
   4188      */
   4189     if ((CUR == '/') && (NXT(1) == '>')) {
   4190         SKIP(2);
   4191 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   4192 	    ctxt->sax->endElement(ctxt->userData, name);
   4193 	htmlnamePop(ctxt);
   4194 	return;
   4195     }
   4196 
   4197     if (CUR == '>') {
   4198         NEXT;
   4199     } else {
   4200 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
   4201 	             "Couldn't find end of Start Tag %s\n", name, NULL);
   4202 
   4203 	/*
   4204 	 * end of parsing of this node.
   4205 	 */
   4206 	if (xmlStrEqual(name, ctxt->name)) {
   4207 	    nodePop(ctxt);
   4208 	    htmlnamePop(ctxt);
   4209 	}
   4210 
   4211 	/*
   4212 	 * Capture end position and add node
   4213 	 */
   4214 	if (ctxt->record_info) {
   4215 	   node_info.end_pos = ctxt->input->consumed +
   4216 			      (CUR_PTR - ctxt->input->base);
   4217 	   node_info.end_line = ctxt->input->line;
   4218 	   node_info.node = ctxt->node;
   4219 	   xmlParserAddNodeInfo(ctxt, &node_info);
   4220 	}
   4221 	return;
   4222     }
   4223 
   4224     /*
   4225      * Check for an Empty Element from DTD definition
   4226      */
   4227     if ((info != NULL) && (info->empty)) {
   4228 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   4229 	    ctxt->sax->endElement(ctxt->userData, name);
   4230 	htmlnamePop(ctxt);
   4231 	return;
   4232     }
   4233 
   4234     /*
   4235      * Parse the content of the element:
   4236      */
   4237     currentNode = xmlStrdup(ctxt->name);
   4238     depth = ctxt->nameNr;
   4239     while (IS_CHAR_CH(CUR)) {
   4240 	oldptr = ctxt->input->cur;
   4241 	htmlParseContent(ctxt);
   4242 	if (oldptr==ctxt->input->cur) break;
   4243 	if (ctxt->nameNr < depth) break;
   4244     }
   4245 
   4246     /*
   4247      * Capture end position and add node
   4248      */
   4249     if ( currentNode != NULL && ctxt->record_info ) {
   4250        node_info.end_pos = ctxt->input->consumed +
   4251                           (CUR_PTR - ctxt->input->base);
   4252        node_info.end_line = ctxt->input->line;
   4253        node_info.node = ctxt->node;
   4254        xmlParserAddNodeInfo(ctxt, &node_info);
   4255     }
   4256     if (!IS_CHAR_CH(CUR)) {
   4257 	htmlAutoCloseOnEnd(ctxt);
   4258     }
   4259 
   4260     if (currentNode != NULL)
   4261 	xmlFree(currentNode);
   4262 }
   4263 
   4264 static void
   4265 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
   4266     /*
   4267      * Capture end position and add node
   4268      */
   4269     if ( ctxt->node != NULL && ctxt->record_info ) {
   4270        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
   4271                                 (CUR_PTR - ctxt->input->base);
   4272        ctxt->nodeInfo->end_line = ctxt->input->line;
   4273        ctxt->nodeInfo->node = ctxt->node;
   4274        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
   4275        htmlNodeInfoPop(ctxt);
   4276     }
   4277     if (!IS_CHAR_CH(CUR)) {
   4278        htmlAutoCloseOnEnd(ctxt);
   4279     }
   4280 }
   4281 
   4282 /**
   4283  * htmlParseElementInternal:
   4284  * @ctxt:  an HTML parser context
   4285  *
   4286  * parse an HTML element, new version, non recursive
   4287  *
   4288  * [39] element ::= EmptyElemTag | STag content ETag
   4289  *
   4290  * [41] Attribute ::= Name Eq AttValue
   4291  */
   4292 
   4293 static void
   4294 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
   4295     const xmlChar *name;
   4296     const htmlElemDesc * info;
   4297     htmlParserNodeInfo node_info;
   4298     int failed;
   4299 
   4300     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   4301 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   4302 		     "htmlParseElementInternal: context error\n", NULL, NULL);
   4303 	return;
   4304     }
   4305 
   4306     if (ctxt->instate == XML_PARSER_EOF)
   4307         return;
   4308 
   4309     /* Capture start position */
   4310     if (ctxt->record_info) {
   4311         node_info.begin_pos = ctxt->input->consumed +
   4312                           (CUR_PTR - ctxt->input->base);
   4313 	node_info.begin_line = ctxt->input->line;
   4314     }
   4315 
   4316     failed = htmlParseStartTag(ctxt);
   4317     name = ctxt->name;
   4318     if ((failed == -1) || (name == NULL)) {
   4319 	if (CUR == '>')
   4320 	    NEXT;
   4321         return;
   4322     }
   4323 
   4324     /*
   4325      * Lookup the info for that element.
   4326      */
   4327     info = htmlTagLookup(name);
   4328     if (info == NULL) {
   4329 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
   4330 	             "Tag %s invalid\n", name, NULL);
   4331     }
   4332 
   4333     /*
   4334      * Check for an Empty Element labeled the XML/SGML way
   4335      */
   4336     if ((CUR == '/') && (NXT(1) == '>')) {
   4337         SKIP(2);
   4338 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   4339 	    ctxt->sax->endElement(ctxt->userData, name);
   4340 	htmlnamePop(ctxt);
   4341 	return;
   4342     }
   4343 
   4344     if (CUR == '>') {
   4345         NEXT;
   4346     } else {
   4347 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
   4348 	             "Couldn't find end of Start Tag %s\n", name, NULL);
   4349 
   4350 	/*
   4351 	 * end of parsing of this node.
   4352 	 */
   4353 	if (xmlStrEqual(name, ctxt->name)) {
   4354 	    nodePop(ctxt);
   4355 	    htmlnamePop(ctxt);
   4356 	}
   4357 
   4358         if (ctxt->record_info)
   4359             htmlNodeInfoPush(ctxt, &node_info);
   4360         htmlParserFinishElementParsing(ctxt);
   4361 	return;
   4362     }
   4363 
   4364     /*
   4365      * Check for an Empty Element from DTD definition
   4366      */
   4367     if ((info != NULL) && (info->empty)) {
   4368 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   4369 	    ctxt->sax->endElement(ctxt->userData, name);
   4370 	htmlnamePop(ctxt);
   4371 	return;
   4372     }
   4373 
   4374     if (ctxt->record_info)
   4375         htmlNodeInfoPush(ctxt, &node_info);
   4376 }
   4377 
   4378 /**
   4379  * htmlParseContentInternal:
   4380  * @ctxt:  an HTML parser context
   4381  *
   4382  * Parse a content: comment, sub-element, reference or text.
   4383  * New version for non recursive htmlParseElementInternal
   4384  */
   4385 
   4386 static void
   4387 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
   4388     xmlChar *currentNode;
   4389     int depth;
   4390     const xmlChar *name;
   4391 
   4392     currentNode = xmlStrdup(ctxt->name);
   4393     depth = ctxt->nameNr;
   4394     while (1) {
   4395 	long cons = ctxt->nbChars;
   4396 
   4397         GROW;
   4398 
   4399         if (ctxt->instate == XML_PARSER_EOF)
   4400             break;
   4401 
   4402 	/*
   4403 	 * Our tag or one of it's parent or children is ending.
   4404 	 */
   4405         if ((CUR == '<') && (NXT(1) == '/')) {
   4406 	    if (htmlParseEndTag(ctxt) &&
   4407 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
   4408 		if (currentNode != NULL)
   4409 		    xmlFree(currentNode);
   4410 
   4411 	        currentNode = xmlStrdup(ctxt->name);
   4412 	        depth = ctxt->nameNr;
   4413 	    }
   4414 	    continue; /* while */
   4415         }
   4416 
   4417 	else if ((CUR == '<') &&
   4418 	         ((IS_ASCII_LETTER(NXT(1))) ||
   4419 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
   4420 	    name = htmlParseHTMLName_nonInvasive(ctxt);
   4421 	    if (name == NULL) {
   4422 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   4423 			 "htmlParseStartTag: invalid element name\n",
   4424 			 NULL, NULL);
   4425 	        /* Dump the bogus tag like browsers do */
   4426 	        while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
   4427 	            NEXT;
   4428 
   4429 	        htmlParserFinishElementParsing(ctxt);
   4430 	        if (currentNode != NULL)
   4431 	            xmlFree(currentNode);
   4432 
   4433 	        currentNode = xmlStrdup(ctxt->name);
   4434 	        depth = ctxt->nameNr;
   4435 	        continue;
   4436 	    }
   4437 
   4438 	    if (ctxt->name != NULL) {
   4439 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
   4440 	            htmlAutoClose(ctxt, name);
   4441 	            continue;
   4442 	        }
   4443 	    }
   4444 	}
   4445 
   4446 	/*
   4447 	 * Has this node been popped out during parsing of
   4448 	 * the next element
   4449 	 */
   4450         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
   4451 	    (!xmlStrEqual(currentNode, ctxt->name)))
   4452 	     {
   4453 	    htmlParserFinishElementParsing(ctxt);
   4454 	    if (currentNode != NULL) xmlFree(currentNode);
   4455 
   4456 	    currentNode = xmlStrdup(ctxt->name);
   4457 	    depth = ctxt->nameNr;
   4458 	    continue;
   4459 	}
   4460 
   4461 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
   4462 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
   4463 	    /*
   4464 	     * Handle SCRIPT/STYLE separately
   4465 	     */
   4466 	    htmlParseScript(ctxt);
   4467 	} else {
   4468 	    /*
   4469 	     * Sometimes DOCTYPE arrives in the middle of the document
   4470 	     */
   4471 	    if ((CUR == '<') && (NXT(1) == '!') &&
   4472 		(UPP(2) == 'D') && (UPP(3) == 'O') &&
   4473 		(UPP(4) == 'C') && (UPP(5) == 'T') &&
   4474 		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
   4475 		(UPP(8) == 'E')) {
   4476 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   4477 		             "Misplaced DOCTYPE declaration\n",
   4478 			     BAD_CAST "DOCTYPE" , NULL);
   4479 		htmlParseDocTypeDecl(ctxt);
   4480 	    }
   4481 
   4482 	    /*
   4483 	     * First case :  a comment
   4484 	     */
   4485 	    if ((CUR == '<') && (NXT(1) == '!') &&
   4486 		(NXT(2) == '-') && (NXT(3) == '-')) {
   4487 		htmlParseComment(ctxt);
   4488 	    }
   4489 
   4490 	    /*
   4491 	     * Second case : a Processing Instruction.
   4492 	     */
   4493 	    else if ((CUR == '<') && (NXT(1) == '?')) {
   4494 		htmlParsePI(ctxt);
   4495 	    }
   4496 
   4497 	    /*
   4498 	     * Third case :  a sub-element.
   4499 	     */
   4500 	    else if (CUR == '<') {
   4501 		htmlParseElementInternal(ctxt);
   4502 		if (currentNode != NULL) xmlFree(currentNode);
   4503 
   4504 		currentNode = xmlStrdup(ctxt->name);
   4505 		depth = ctxt->nameNr;
   4506 	    }
   4507 
   4508 	    /*
   4509 	     * Fourth case : a reference. If if has not been resolved,
   4510 	     *    parsing returns it's Name, create the node
   4511 	     */
   4512 	    else if (CUR == '&') {
   4513 		htmlParseReference(ctxt);
   4514 	    }
   4515 
   4516 	    /*
   4517 	     * Fifth case : end of the resource
   4518 	     */
   4519 	    else if (CUR == 0) {
   4520 		htmlAutoCloseOnEnd(ctxt);
   4521 		break;
   4522 	    }
   4523 
   4524 	    /*
   4525 	     * Last case, text. Note that References are handled directly.
   4526 	     */
   4527 	    else {
   4528 		htmlParseCharData(ctxt);
   4529 	    }
   4530 
   4531 	    if (cons == ctxt->nbChars) {
   4532 		if (ctxt->node != NULL) {
   4533 		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   4534 		                 "detected an error in element content\n",
   4535 				 NULL, NULL);
   4536 		}
   4537 		break;
   4538 	    }
   4539 	}
   4540         GROW;
   4541     }
   4542     if (currentNode != NULL) xmlFree(currentNode);
   4543 }
   4544 
   4545 /**
   4546  * htmlParseContent:
   4547  * @ctxt:  an HTML parser context
   4548  *
   4549  * Parse a content: comment, sub-element, reference or text.
   4550  * This is the entry point when called from parser.c
   4551  */
   4552 
   4553 void
   4554 __htmlParseContent(void *ctxt) {
   4555     if (ctxt != NULL)
   4556 	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
   4557 }
   4558 
   4559 /**
   4560  * htmlParseDocument:
   4561  * @ctxt:  an HTML parser context
   4562  *
   4563  * parse an HTML document (and build a tree if using the standard SAX
   4564  * interface).
   4565  *
   4566  * Returns 0, -1 in case of error. the parser context is augmented
   4567  *                as a result of the parsing.
   4568  */
   4569 
   4570 int
   4571 htmlParseDocument(htmlParserCtxtPtr ctxt) {
   4572     xmlChar start[4];
   4573     xmlCharEncoding enc;
   4574     xmlDtdPtr dtd;
   4575 
   4576     xmlInitParser();
   4577 
   4578     htmlDefaultSAXHandlerInit();
   4579 
   4580     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   4581 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   4582 		     "htmlParseDocument: context error\n", NULL, NULL);
   4583 	return(XML_ERR_INTERNAL_ERROR);
   4584     }
   4585     ctxt->html = 1;
   4586     ctxt->linenumbers = 1;
   4587     GROW;
   4588     /*
   4589      * SAX: beginning of the document processing.
   4590      */
   4591     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
   4592         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
   4593 
   4594     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
   4595         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
   4596 	/*
   4597 	 * Get the 4 first bytes and decode the charset
   4598 	 * if enc != XML_CHAR_ENCODING_NONE
   4599 	 * plug some encoding conversion routines.
   4600 	 */
   4601 	start[0] = RAW;
   4602 	start[1] = NXT(1);
   4603 	start[2] = NXT(2);
   4604 	start[3] = NXT(3);
   4605 	enc = xmlDetectCharEncoding(&start[0], 4);
   4606 	if (enc != XML_CHAR_ENCODING_NONE) {
   4607 	    xmlSwitchEncoding(ctxt, enc);
   4608 	}
   4609     }
   4610 
   4611     /*
   4612      * Wipe out everything which is before the first '<'
   4613      */
   4614     SKIP_BLANKS;
   4615     if (CUR == 0) {
   4616 	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
   4617 	             "Document is empty\n", NULL, NULL);
   4618     }
   4619 
   4620     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
   4621 	ctxt->sax->startDocument(ctxt->userData);
   4622 
   4623 
   4624     /*
   4625      * Parse possible comments and PIs before any content
   4626      */
   4627     while (((CUR == '<') && (NXT(1) == '!') &&
   4628             (NXT(2) == '-') && (NXT(3) == '-')) ||
   4629 	   ((CUR == '<') && (NXT(1) == '?'))) {
   4630         htmlParseComment(ctxt);
   4631         htmlParsePI(ctxt);
   4632 	SKIP_BLANKS;
   4633     }
   4634 
   4635 
   4636     /*
   4637      * Then possibly doc type declaration(s) and more Misc
   4638      * (doctypedecl Misc*)?
   4639      */
   4640     if ((CUR == '<') && (NXT(1) == '!') &&
   4641 	(UPP(2) == 'D') && (UPP(3) == 'O') &&
   4642 	(UPP(4) == 'C') && (UPP(5) == 'T') &&
   4643 	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
   4644 	(UPP(8) == 'E')) {
   4645 	htmlParseDocTypeDecl(ctxt);
   4646     }
   4647     SKIP_BLANKS;
   4648 
   4649     /*
   4650      * Parse possible comments and PIs before any content
   4651      */
   4652     while (((CUR == '<') && (NXT(1) == '!') &&
   4653             (NXT(2) == '-') && (NXT(3) == '-')) ||
   4654 	   ((CUR == '<') && (NXT(1) == '?'))) {
   4655         htmlParseComment(ctxt);
   4656         htmlParsePI(ctxt);
   4657 	SKIP_BLANKS;
   4658     }
   4659 
   4660     /*
   4661      * Time to start parsing the tree itself
   4662      */
   4663     htmlParseContentInternal(ctxt);
   4664 
   4665     /*
   4666      * autoclose
   4667      */
   4668     if (CUR == 0)
   4669 	htmlAutoCloseOnEnd(ctxt);
   4670 
   4671 
   4672     /*
   4673      * SAX: end of the document processing.
   4674      */
   4675     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
   4676         ctxt->sax->endDocument(ctxt->userData);
   4677 
   4678     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
   4679 	dtd = xmlGetIntSubset(ctxt->myDoc);
   4680 	if (dtd == NULL)
   4681 	    ctxt->myDoc->intSubset =
   4682 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
   4683 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
   4684 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
   4685     }
   4686     if (! ctxt->wellFormed) return(-1);
   4687     return(0);
   4688 }
   4689 
   4690 
   4691 /************************************************************************
   4692  *									*
   4693  *			Parser contexts handling			*
   4694  *									*
   4695  ************************************************************************/
   4696 
   4697 /**
   4698  * htmlInitParserCtxt:
   4699  * @ctxt:  an HTML parser context
   4700  *
   4701  * Initialize a parser context
   4702  *
   4703  * Returns 0 in case of success and -1 in case of error
   4704  */
   4705 
   4706 static int
   4707 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
   4708 {
   4709     htmlSAXHandler *sax;
   4710 
   4711     if (ctxt == NULL) return(-1);
   4712     memset(ctxt, 0, sizeof(htmlParserCtxt));
   4713 
   4714     ctxt->dict = xmlDictCreate();
   4715     if (ctxt->dict == NULL) {
   4716         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
   4717 	return(-1);
   4718     }
   4719     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
   4720     if (sax == NULL) {
   4721         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
   4722 	return(-1);
   4723     }
   4724     else
   4725         memset(sax, 0, sizeof(htmlSAXHandler));
   4726 
   4727     /* Allocate the Input stack */
   4728     ctxt->inputTab = (htmlParserInputPtr *)
   4729                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
   4730     if (ctxt->inputTab == NULL) {
   4731         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
   4732 	ctxt->inputNr = 0;
   4733 	ctxt->inputMax = 0;
   4734 	ctxt->input = NULL;
   4735 	return(-1);
   4736     }
   4737     ctxt->inputNr = 0;
   4738     ctxt->inputMax = 5;
   4739     ctxt->input = NULL;
   4740     ctxt->version = NULL;
   4741     ctxt->encoding = NULL;
   4742     ctxt->standalone = -1;
   4743     ctxt->instate = XML_PARSER_START;
   4744 
   4745     /* Allocate the Node stack */
   4746     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
   4747     if (ctxt->nodeTab == NULL) {
   4748         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
   4749 	ctxt->nodeNr = 0;
   4750 	ctxt->nodeMax = 0;
   4751 	ctxt->node = NULL;
   4752 	ctxt->inputNr = 0;
   4753 	ctxt->inputMax = 0;
   4754 	ctxt->input = NULL;
   4755 	return(-1);
   4756     }
   4757     ctxt->nodeNr = 0;
   4758     ctxt->nodeMax = 10;
   4759     ctxt->node = NULL;
   4760 
   4761     /* Allocate the Name stack */
   4762     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
   4763     if (ctxt->nameTab == NULL) {
   4764         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
   4765 	ctxt->nameNr = 0;
   4766 	ctxt->nameMax = 0;
   4767 	ctxt->name = NULL;
   4768 	ctxt->nodeNr = 0;
   4769 	ctxt->nodeMax = 0;
   4770 	ctxt->node = NULL;
   4771 	ctxt->inputNr = 0;
   4772 	ctxt->inputMax = 0;
   4773 	ctxt->input = NULL;
   4774 	return(-1);
   4775     }
   4776     ctxt->nameNr = 0;
   4777     ctxt->nameMax = 10;
   4778     ctxt->name = NULL;
   4779 
   4780     ctxt->nodeInfoTab = NULL;
   4781     ctxt->nodeInfoNr  = 0;
   4782     ctxt->nodeInfoMax = 0;
   4783 
   4784     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
   4785     else {
   4786         ctxt->sax = sax;
   4787 	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
   4788     }
   4789     ctxt->userData = ctxt;
   4790     ctxt->myDoc = NULL;
   4791     ctxt->wellFormed = 1;
   4792     ctxt->replaceEntities = 0;
   4793     ctxt->linenumbers = xmlLineNumbersDefaultValue;
   4794     ctxt->html = 1;
   4795     ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
   4796     ctxt->vctxt.userData = ctxt;
   4797     ctxt->vctxt.error = xmlParserValidityError;
   4798     ctxt->vctxt.warning = xmlParserValidityWarning;
   4799     ctxt->record_info = 0;
   4800     ctxt->validate = 0;
   4801     ctxt->nbChars = 0;
   4802     ctxt->checkIndex = 0;
   4803     ctxt->catalogs = NULL;
   4804     xmlInitNodeInfoSeq(&ctxt->node_seq);
   4805     return(0);
   4806 }
   4807 
   4808 /**
   4809  * htmlFreeParserCtxt:
   4810  * @ctxt:  an HTML parser context
   4811  *
   4812  * Free all the memory used by a parser context. However the parsed
   4813  * document in ctxt->myDoc is not freed.
   4814  */
   4815 
   4816 void
   4817 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
   4818 {
   4819     xmlFreeParserCtxt(ctxt);
   4820 }
   4821 
   4822 /**
   4823  * htmlNewParserCtxt:
   4824  *
   4825  * Allocate and initialize a new parser context.
   4826  *
   4827  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
   4828  */
   4829 
   4830 htmlParserCtxtPtr
   4831 htmlNewParserCtxt(void)
   4832 {
   4833     xmlParserCtxtPtr ctxt;
   4834 
   4835     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
   4836     if (ctxt == NULL) {
   4837         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
   4838 	return(NULL);
   4839     }
   4840     memset(ctxt, 0, sizeof(xmlParserCtxt));
   4841     if (htmlInitParserCtxt(ctxt) < 0) {
   4842         htmlFreeParserCtxt(ctxt);
   4843 	return(NULL);
   4844     }
   4845     return(ctxt);
   4846 }
   4847 
   4848 /**
   4849  * htmlCreateMemoryParserCtxt:
   4850  * @buffer:  a pointer to a char array
   4851  * @size:  the size of the array
   4852  *
   4853  * Create a parser context for an HTML in-memory document.
   4854  *
   4855  * Returns the new parser context or NULL
   4856  */
   4857 htmlParserCtxtPtr
   4858 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
   4859     xmlParserCtxtPtr ctxt;
   4860     xmlParserInputPtr input;
   4861     xmlParserInputBufferPtr buf;
   4862 
   4863     if (buffer == NULL)
   4864 	return(NULL);
   4865     if (size <= 0)
   4866 	return(NULL);
   4867 
   4868     ctxt = htmlNewParserCtxt();
   4869     if (ctxt == NULL)
   4870 	return(NULL);
   4871 
   4872     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
   4873     if (buf == NULL) return(NULL);
   4874 
   4875     input = xmlNewInputStream(ctxt);
   4876     if (input == NULL) {
   4877 	xmlFreeParserCtxt(ctxt);
   4878 	return(NULL);
   4879     }
   4880 
   4881     input->filename = NULL;
   4882     input->buf = buf;
   4883     input->base = input->buf->buffer->content;
   4884     input->cur = input->buf->buffer->content;
   4885     input->end = &input->buf->buffer->content[input->buf->buffer->use];
   4886 
   4887     inputPush(ctxt, input);
   4888     return(ctxt);
   4889 }
   4890 
   4891 /**
   4892  * htmlCreateDocParserCtxt:
   4893  * @cur:  a pointer to an array of xmlChar
   4894  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   4895  *
   4896  * Create a parser context for an HTML document.
   4897  *
   4898  * TODO: check the need to add encoding handling there
   4899  *
   4900  * Returns the new parser context or NULL
   4901  */
   4902 static htmlParserCtxtPtr
   4903 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
   4904     int len;
   4905     htmlParserCtxtPtr ctxt;
   4906 
   4907     if (cur == NULL)
   4908 	return(NULL);
   4909     len = xmlStrlen(cur);
   4910     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
   4911     if (ctxt == NULL)
   4912 	return(NULL);
   4913 
   4914     if (encoding != NULL) {
   4915 	xmlCharEncoding enc;
   4916 	xmlCharEncodingHandlerPtr handler;
   4917 
   4918 	if (ctxt->input->encoding != NULL)
   4919 	    xmlFree((xmlChar *) ctxt->input->encoding);
   4920 	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
   4921 
   4922 	enc = xmlParseCharEncoding(encoding);
   4923 	/*
   4924 	 * registered set of known encodings
   4925 	 */
   4926 	if (enc != XML_CHAR_ENCODING_ERROR) {
   4927 	    xmlSwitchEncoding(ctxt, enc);
   4928 	    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
   4929 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
   4930 		             "Unsupported encoding %s\n",
   4931 			     (const xmlChar *) encoding, NULL);
   4932 	    }
   4933 	} else {
   4934 	    /*
   4935 	     * fallback for unknown encodings
   4936 	     */
   4937 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
   4938 	    if (handler != NULL) {
   4939 		xmlSwitchToEncoding(ctxt, handler);
   4940 	    } else {
   4941 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
   4942 		             "Unsupported encoding %s\n",
   4943 			     (const xmlChar *) encoding, NULL);
   4944 	    }
   4945 	}
   4946     }
   4947     return(ctxt);
   4948 }
   4949 
   4950 #ifdef LIBXML_PUSH_ENABLED
   4951 /************************************************************************
   4952  *									*
   4953  *	Progressive parsing interfaces				*
   4954  *									*
   4955  ************************************************************************/
   4956 
   4957 /**
   4958  * htmlParseLookupSequence:
   4959  * @ctxt:  an HTML parser context
   4960  * @first:  the first char to lookup
   4961  * @next:  the next char to lookup or zero
   4962  * @third:  the next char to lookup or zero
   4963  * @comment: flag to force checking inside comments
   4964  *
   4965  * Try to find if a sequence (first, next, third) or  just (first next) or
   4966  * (first) is available in the input stream.
   4967  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
   4968  * to avoid rescanning sequences of bytes, it DOES change the state of the
   4969  * parser, do not use liberally.
   4970  * This is basically similar to xmlParseLookupSequence()
   4971  *
   4972  * Returns the index to the current parsing point if the full sequence
   4973  *      is available, -1 otherwise.
   4974  */
   4975 static int
   4976 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
   4977                         xmlChar next, xmlChar third, int iscomment,
   4978                         int ignoreattrval)
   4979 {
   4980     int base, len;
   4981     htmlParserInputPtr in;
   4982     const xmlChar *buf;
   4983     int incomment = 0;
   4984     int invalue = 0;
   4985     char valdellim = 0x0;
   4986 
   4987     in = ctxt->input;
   4988     if (in == NULL)
   4989         return (-1);
   4990 
   4991     base = in->cur - in->base;
   4992     if (base < 0)
   4993         return (-1);
   4994 
   4995     if (ctxt->checkIndex > base)
   4996         base = ctxt->checkIndex;
   4997 
   4998     if (in->buf == NULL) {
   4999         buf = in->base;
   5000         len = in->length;
   5001     } else {
   5002         buf = in->buf->buffer->content;
   5003         len = in->buf->buffer->use;
   5004     }
   5005 
   5006     /* take into account the sequence length */
   5007     if (third)
   5008         len -= 2;
   5009     else if (next)
   5010         len--;
   5011     for (; base < len; base++) {
   5012         if ((!incomment) && (base + 4 < len) && (!iscomment)) {
   5013             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
   5014                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
   5015                 incomment = 1;
   5016                 /* do not increment past <! - some people use <!--> */
   5017                 base += 2;
   5018             }
   5019         }
   5020         if (ignoreattrval) {
   5021             if (buf[base] == '"' || buf[base] == '\'') {
   5022                 if (invalue) {
   5023                     if (buf[base] == valdellim) {
   5024                         invalue = 0;
   5025                         continue;
   5026                     }
   5027                 } else {
   5028                     valdellim = buf[base];
   5029                     invalue = 1;
   5030                     continue;
   5031                 }
   5032             } else if (invalue) {
   5033                 continue;
   5034             }
   5035         }
   5036         if (incomment) {
   5037             if (base + 3 > len)
   5038                 return (-1);
   5039             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
   5040                 (buf[base + 2] == '>')) {
   5041                 incomment = 0;
   5042                 base += 2;
   5043             }
   5044             continue;
   5045         }
   5046         if (buf[base] == first) {
   5047             if (third != 0) {
   5048                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
   5049                     continue;
   5050             } else if (next != 0) {
   5051                 if (buf[base + 1] != next)
   5052                     continue;
   5053             }
   5054             ctxt->checkIndex = 0;
   5055 #ifdef DEBUG_PUSH
   5056             if (next == 0)
   5057                 xmlGenericError(xmlGenericErrorContext,
   5058                                 "HPP: lookup '%c' found at %d\n",
   5059                                 first, base);
   5060             else if (third == 0)
   5061                 xmlGenericError(xmlGenericErrorContext,
   5062                                 "HPP: lookup '%c%c' found at %d\n",
   5063                                 first, next, base);
   5064             else
   5065                 xmlGenericError(xmlGenericErrorContext,
   5066                                 "HPP: lookup '%c%c%c' found at %d\n",
   5067                                 first, next, third, base);
   5068 #endif
   5069             return (base - (in->cur - in->base));
   5070         }
   5071     }
   5072     if ((!incomment) && (!invalue))
   5073         ctxt->checkIndex = base;
   5074 #ifdef DEBUG_PUSH
   5075     if (next == 0)
   5076         xmlGenericError(xmlGenericErrorContext,
   5077                         "HPP: lookup '%c' failed\n", first);
   5078     else if (third == 0)
   5079         xmlGenericError(xmlGenericErrorContext,
   5080                         "HPP: lookup '%c%c' failed\n", first, next);
   5081     else
   5082         xmlGenericError(xmlGenericErrorContext,
   5083                         "HPP: lookup '%c%c%c' failed\n", first, next,
   5084                         third);
   5085 #endif
   5086     return (-1);
   5087 }
   5088 
   5089 /**
   5090  * htmlParseLookupChars:
   5091  * @ctxt: an HTML parser context
   5092  * @stop: Array of chars, which stop the lookup.
   5093  * @stopLen: Length of stop-Array
   5094  *
   5095  * Try to find if any char of the stop-Array is available in the input
   5096  * stream.
         * The scan starts at the current parsing point, or at ctxt->checkIndex
         * when that index is further along, and runs to the end of the stored
         * input. Bytes located between "<!--" and "-->" are skipped and are never
         * compared with the stop-Array.
   5097  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
   5098  * to avoid rescanning sequences of bytes, it DOES change the state of the
   5099  * parser, do not use liberally.
         * On a match ctxt->checkIndex is reset to 0; when the loop runs to the end
         * of the input without a match it is set to the position reached.
   5100  *
   5101  * Returns the offset, counted from the current parsing point, of the first
   5102  *      stopChar found, -1 if none is available.
         *      -1 is also returned when ctxt->input is NULL, when the current
         *      pointer lies before the input base, or when the input ends inside
         *      a comment (ctxt->checkIndex is left unchanged in those cases).
   5103  */
   5104 static int
   5105 htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
   5106                      int stopLen)
   5107 {
   5108     int base, len;
   5109     htmlParserInputPtr in;
   5110     const xmlChar *buf;
   5111     int incomment = 0;
   5112     int i;
   5113 
   5114     in = ctxt->input;
   5115     if (in == NULL)
   5116         return (-1);
   5117 
            /* Offset of the current parsing point from the start of the input. */
   5118     base = in->cur - in->base;
   5119     if (base < 0)
   5120         return (-1);
   5121 
            /* Resume from the saved index when it is ahead of the current point. */
   5122     if (ctxt->checkIndex > base)
   5123         base = ctxt->checkIndex;
   5124 
            /*
             * Select the byte array to scan and its length: in->base/in->length
             * when no input buffer is attached, otherwise the attached buffer's
             * content/use.
             */
   5125     if (in->buf == NULL) {
   5126         buf = in->base;
   5127         len = in->length;
   5128     } else {
   5129         buf = in->buf->buffer->content;
   5130         len = in->buf->buffer->use;
   5131     }
   5132 
   5133     for (; base < len; base++) {
                /*
                 * Look for a comment opener; the test is only made while more
                 * than four bytes remain after base.
                 */
   5134         if (!incomment && (base + 4 < len)) {
   5135             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
   5136                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
   5137                 incomment = 1;
   5138                 /* do not increment past <! - some people use <!--> */
   5139                 base += 2;
   5140             }
   5141         }
   5142         if (incomment) {
                    /* Fewer than three bytes left: "-->" cannot be matched here. */
   5143             if (base + 3 > len)
   5144                 return (-1);
   5145             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
   5146                 (buf[base + 2] == '>')) {
   5147                 incomment = 0;
                        /* Land on the '>'; the loop increment then steps past it. */
   5148                 base += 2;
   5149             }
                    /* Bytes that belong to a comment are never tested against stop[]. */
   5150             continue;
   5151         }
                /* Outside comments: stop at the first byte equal to any stop char. */
   5152         for (i = 0; i < stopLen; ++i) {
   5153             if (buf[base] == stop[i]) {
   5154                 ctxt->checkIndex = 0;
                        /* Convert the absolute index into an offset from in->cur. */
   5155                 return (base - (in->cur - in->base));
   5156             }
   5157         }
   5158     }
            /* Nothing found: record how far the scan got. */
   5159     ctxt->checkIndex = base;
   5160     return (-1);
   5161 }
   5162 
   5163 /**
   5164  * htmlParseTryOrFinish:
   5165  * @ctxt:  an HTML parser context
   5166  * @terminate:  last chunk indicator
   5167  *
   5168  * Try to progress on parsing
         *
         * Runs a loop that dispatches on ctxt->instate. Each pass recomputes how
         * many bytes are stored after ctxt->input->cur; a state that does not see
         * enough input jumps to "done" so that the caller can supply more data.
         * Unless @terminate is set, htmlParseLookupSequence() or
         * htmlParseLookupChars() is called first and the matching htmlParse*()
         * routine only runs when the lookup succeeds.
         *
         * At "done", when @terminate is set and no input is left,
         * htmlAutoCloseOnEnd() is called and endDocument is reported once the
         * name stack is empty. When ctxt->myDoc exists without an internal subset
         * and @terminate is set or the state is EOF or EPILOG, an
         * "-//W3C//DTD HTML 4.0 Transitional//EN" internal subset is created.
   5169  *
   5170  * Returns zero if no parsing was possible
         * NOTE(review): no statement visible in this function assigns ret after
         * its initialisation to 0, so as written the returned value is always 0.
   5171  */
   5172 static int
   5173 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
   5174     int ret = 0;
   5175     htmlParserInputPtr in;
   5176     int avail = 0;
   5177     xmlChar cur, next;
   5178 
            /* DEBUG_PUSH builds only: trace the parser state on entry. */
   5179 #ifdef DEBUG_PUSH
   5180     switch (ctxt->instate) {
   5181 	case XML_PARSER_EOF:
   5182 	    xmlGenericError(xmlGenericErrorContext,
   5183 		    "HPP: try EOF\n"); break;
   5184 	case XML_PARSER_START:
   5185 	    xmlGenericError(xmlGenericErrorContext,
   5186 		    "HPP: try START\n"); break;
   5187 	case XML_PARSER_MISC:
   5188 	    xmlGenericError(xmlGenericErrorContext,
   5189 		    "HPP: try MISC\n");break;
   5190 	case XML_PARSER_COMMENT:
   5191 	    xmlGenericError(xmlGenericErrorContext,
   5192 		    "HPP: try COMMENT\n");break;
   5193 	case XML_PARSER_PROLOG:
   5194 	    xmlGenericError(xmlGenericErrorContext,
   5195 		    "HPP: try PROLOG\n");break;
   5196 	case XML_PARSER_START_TAG:
   5197 	    xmlGenericError(xmlGenericErrorContext,
   5198 		    "HPP: try START_TAG\n");break;
   5199 	case XML_PARSER_CONTENT:
   5200 	    xmlGenericError(xmlGenericErrorContext,
   5201 		    "HPP: try CONTENT\n");break;
   5202 	case XML_PARSER_CDATA_SECTION:
   5203 	    xmlGenericError(xmlGenericErrorContext,
   5204 		    "HPP: try CDATA_SECTION\n");break;
   5205 	case XML_PARSER_END_TAG:
   5206 	    xmlGenericError(xmlGenericErrorContext,
   5207 		    "HPP: try END_TAG\n");break;
   5208 	case XML_PARSER_ENTITY_DECL:
   5209 	    xmlGenericError(xmlGenericErrorContext,
   5210 		    "HPP: try ENTITY_DECL\n");break;
   5211 	case XML_PARSER_ENTITY_VALUE:
   5212 	    xmlGenericError(xmlGenericErrorContext,
   5213 		    "HPP: try ENTITY_VALUE\n");break;
   5214 	case XML_PARSER_ATTRIBUTE_VALUE:
   5215 	    xmlGenericError(xmlGenericErrorContext,
   5216 		    "HPP: try ATTRIBUTE_VALUE\n");break;
   5217 	case XML_PARSER_DTD:
   5218 	    xmlGenericError(xmlGenericErrorContext,
   5219 		    "HPP: try DTD\n");break;
   5220 	case XML_PARSER_EPILOG:
   5221 	    xmlGenericError(xmlGenericErrorContext,
   5222 		    "HPP: try EPILOG\n");break;
   5223 	case XML_PARSER_PI:
   5224 	    xmlGenericError(xmlGenericErrorContext,
   5225 		    "HPP: try PI\n");break;
   5226 	case XML_PARSER_SYSTEM_LITERAL:
   5227 	    xmlGenericError(xmlGenericErrorContext,
   5228 		    "HPP: try SYSTEM_LITERAL\n");break;
   5229     }
   5230 #endif
   5231 
   5232     while (1) {
   5233 
   5234 	in = ctxt->input;
   5235 	if (in == NULL) break;
                /* Number of bytes stored after the current position. */
   5236 	if (in->buf == NULL)
   5237 	    avail = in->length - (in->cur - in->base);
   5238 	else
   5239 	    avail = in->buf->buffer->use - (in->cur - in->base);
                /*
                 * Last chunk and nothing left to read: htmlAutoCloseOnEnd() is
                 * called, then the end of the document is reported if no element
                 * name remains on the stack and the state is not already EOF.
                 */
   5240 	if ((avail == 0) && (terminate)) {
   5241 	    htmlAutoCloseOnEnd(ctxt);
   5242 	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
   5243 		/*
   5244 		 * SAX: end of the document processing.
   5245 		 */
   5246 		ctxt->instate = XML_PARSER_EOF;
   5247 		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
   5248 		    ctxt->sax->endDocument(ctxt->userData);
   5249 	    }
   5250 	}
   5251         if (avail < 1)
   5252 	    goto done;
   5253 	cur = in->cur[0];
                /* A zero byte at the current position is stepped over with SKIP(1). */
   5254 	if (cur == 0) {
   5255 	    SKIP(1);
   5256 	    continue;
   5257 	}
   5258 
   5259         switch (ctxt->instate) {
   5260             case XML_PARSER_EOF:
   5261 	        /*
   5262 		 * Document parsing is done !
   5263 		 */
   5264 	        goto done;
   5265             case XML_PARSER_START:
   5266 	        /*
   5267 		 * Very first chars read from the document flow.
   5268 		 */
   5269 		cur = in->cur[0];
                        /* Leading blanks are skipped and avail is refreshed afterwards. */
   5270 		if (IS_BLANK_CH(cur)) {
   5271 		    SKIP_BLANKS;
   5272 		    if (in->buf == NULL)
   5273 			avail = in->length - (in->cur - in->base);
   5274 		    else
   5275 			avail = in->buf->buffer->use - (in->cur - in->base);
   5276 		}
   5277 		if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
   5278 		    ctxt->sax->setDocumentLocator(ctxt->userData,
   5279 						  &xmlDefaultSAXLocator);
   5280 		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
   5281 	            (!ctxt->disableSAX))
   5282 		    ctxt->sax->startDocument(ctxt->userData);
   5283 
                        /*
                         * "<!DOCTYPE" (letters read through UPP(), presumably an
                         * upper-casing accessor - confirm against its definition)
                         * leads to PROLOG once the declaration has been parsed;
                         * any other start is left to the MISC state.
                         */
   5284 		cur = in->cur[0];
   5285 		next = in->cur[1];
   5286 		if ((cur == '<') && (next == '!') &&
   5287 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
   5288 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
   5289 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
   5290 		    (UPP(8) == 'E')) {
                            /* Unless this is the last chunk, wait until a '>' is in the input. */
   5291 		    if ((!terminate) &&
   5292 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5293 			goto done;
   5294 #ifdef DEBUG_PUSH
   5295 		    xmlGenericError(xmlGenericErrorContext,
   5296 			    "HPP: Parsing internal subset\n");
   5297 #endif
   5298 		    htmlParseDocTypeDecl(ctxt);
   5299 		    ctxt->instate = XML_PARSER_PROLOG;
   5300 #ifdef DEBUG_PUSH
   5301 		    xmlGenericError(xmlGenericErrorContext,
   5302 			    "HPP: entering PROLOG\n");
   5303 #endif
   5304                 } else {
   5305 		    ctxt->instate = XML_PARSER_MISC;
   5306 #ifdef DEBUG_PUSH
   5307 		    xmlGenericError(xmlGenericErrorContext,
   5308 			    "HPP: entering MISC\n");
   5309 #endif
   5310 		}
   5311 		break;
                    /*
                     * MISC: before any DOCTYPE. Comments and PIs are parsed in
                     * place, a DOCTYPE moves to PROLOG, anything else to START_TAG.
                     */
   5312             case XML_PARSER_MISC:
   5313 		SKIP_BLANKS;
   5314 		if (in->buf == NULL)
   5315 		    avail = in->length - (in->cur - in->base);
   5316 		else
   5317 		    avail = in->buf->buffer->use - (in->cur - in->base);
   5318 		if (avail < 2)
   5319 		    goto done;
   5320 		cur = in->cur[0];
   5321 		next = in->cur[1];
   5322 	        if ((cur == '<') && (next == '!') &&
   5323 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
   5324 		    if ((!terminate) &&
   5325 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
   5326 			goto done;
   5327 #ifdef DEBUG_PUSH
   5328 		    xmlGenericError(xmlGenericErrorContext,
   5329 			    "HPP: Parsing Comment\n");
   5330 #endif
   5331 		    htmlParseComment(ctxt);
   5332 		    ctxt->instate = XML_PARSER_MISC;
   5333 	        } else if ((cur == '<') && (next == '?')) {
   5334 		    if ((!terminate) &&
   5335 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5336 			goto done;
   5337 #ifdef DEBUG_PUSH
   5338 		    xmlGenericError(xmlGenericErrorContext,
   5339 			    "HPP: Parsing PI\n");
   5340 #endif
   5341 		    htmlParsePI(ctxt);
   5342 		    ctxt->instate = XML_PARSER_MISC;
   5343 		} else if ((cur == '<') && (next == '!') &&
   5344 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
   5345 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
   5346 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
   5347 		    (UPP(8) == 'E')) {
   5348 		    if ((!terminate) &&
   5349 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5350 			goto done;
   5351 #ifdef DEBUG_PUSH
   5352 		    xmlGenericError(xmlGenericErrorContext,
   5353 			    "HPP: Parsing internal subset\n");
   5354 #endif
   5355 		    htmlParseDocTypeDecl(ctxt);
   5356 		    ctxt->instate = XML_PARSER_PROLOG;
   5357 #ifdef DEBUG_PUSH
   5358 		    xmlGenericError(xmlGenericErrorContext,
   5359 			    "HPP: entering PROLOG\n");
   5360 #endif
                        /*
                         * "<!" seen with fewer than 9 bytes available (the length of
                         * "<!DOCTYPE"): wait for more input before deciding.
                         */
   5361 		} else if ((cur == '<') && (next == '!') &&
   5362 		           (avail < 9)) {
   5363 		    goto done;
   5364 		} else {
   5365 		    ctxt->instate = XML_PARSER_START_TAG;
   5366 #ifdef DEBUG_PUSH
   5367 		    xmlGenericError(xmlGenericErrorContext,
   5368 			    "HPP: entering START_TAG\n");
   5369 #endif
   5370 		}
   5371 		break;
                    /*
                     * PROLOG: same handling as MISC, except that a DOCTYPE is no
                     * longer looked for and "<!" only waits for 4 bytes.
                     */
   5372             case XML_PARSER_PROLOG:
   5373 		SKIP_BLANKS;
   5374 		if (in->buf == NULL)
   5375 		    avail = in->length - (in->cur - in->base);
   5376 		else
   5377 		    avail = in->buf->buffer->use - (in->cur - in->base);
   5378 		if (avail < 2)
   5379 		    goto done;
   5380 		cur = in->cur[0];
   5381 		next = in->cur[1];
   5382 		if ((cur == '<') && (next == '!') &&
   5383 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
   5384 		    if ((!terminate) &&
   5385 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
   5386 			goto done;
   5387 #ifdef DEBUG_PUSH
   5388 		    xmlGenericError(xmlGenericErrorContext,
   5389 			    "HPP: Parsing Comment\n");
   5390 #endif
   5391 		    htmlParseComment(ctxt);
   5392 		    ctxt->instate = XML_PARSER_PROLOG;
   5393 	        } else if ((cur == '<') && (next == '?')) {
   5394 		    if ((!terminate) &&
   5395 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5396 			goto done;
   5397 #ifdef DEBUG_PUSH
   5398 		    xmlGenericError(xmlGenericErrorContext,
   5399 			    "HPP: Parsing PI\n");
   5400 #endif
   5401 		    htmlParsePI(ctxt);
   5402 		    ctxt->instate = XML_PARSER_PROLOG;
   5403 		} else if ((cur == '<') && (next == '!') &&
   5404 		           (avail < 4)) {
   5405 		    goto done;
   5406 		} else {
   5407 		    ctxt->instate = XML_PARSER_START_TAG;
   5408 #ifdef DEBUG_PUSH
   5409 		    xmlGenericError(xmlGenericErrorContext,
   5410 			    "HPP: entering START_TAG\n");
   5411 #endif
   5412 		}
   5413 		break;
                    /*
                     * EPILOG: entered from END_TAG when no element name remains on
                     * the stack. Blanks, comments and PIs are accepted; anything
                     * else ends the document and clears wellFormed.
                     */
   5414             case XML_PARSER_EPILOG:
   5415 		if (in->buf == NULL)
   5416 		    avail = in->length - (in->cur - in->base);
   5417 		else
   5418 		    avail = in->buf->buffer->use - (in->cur - in->base);
   5419 		if (avail < 1)
   5420 		    goto done;
   5421 		cur = in->cur[0];
   5422 		if (IS_BLANK_CH(cur)) {
   5423 		    htmlParseCharData(ctxt);
   5424 		    goto done;
   5425 		}
   5426 		if (avail < 2)
   5427 		    goto done;
   5428 		next = in->cur[1];
   5429 	        if ((cur == '<') && (next == '!') &&
   5430 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
   5431 		    if ((!terminate) &&
   5432 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
   5433 			goto done;
   5434 #ifdef DEBUG_PUSH
   5435 		    xmlGenericError(xmlGenericErrorContext,
   5436 			    "HPP: Parsing Comment\n");
   5437 #endif
   5438 		    htmlParseComment(ctxt);
   5439 		    ctxt->instate = XML_PARSER_EPILOG;
   5440 	        } else if ((cur == '<') && (next == '?')) {
   5441 		    if ((!terminate) &&
   5442 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5443 			goto done;
   5444 #ifdef DEBUG_PUSH
   5445 		    xmlGenericError(xmlGenericErrorContext,
   5446 			    "HPP: Parsing PI\n");
   5447 #endif
   5448 		    htmlParsePI(ctxt);
   5449 		    ctxt->instate = XML_PARSER_EPILOG;
   5450 		} else if ((cur == '<') && (next == '!') &&
   5451 		           (avail < 4)) {
   5452 		    goto done;
   5453 		} else {
   5454 		    ctxt->errNo = XML_ERR_DOCUMENT_END;
   5455 		    ctxt->wellFormed = 0;
   5456 		    ctxt->instate = XML_PARSER_EOF;
   5457 #ifdef DEBUG_PUSH
   5458 		    xmlGenericError(xmlGenericErrorContext,
   5459 			    "HPP: entering EOF\n");
   5460 #endif
   5461 		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
   5462 			ctxt->sax->endDocument(ctxt->userData);
   5463 		    goto done;
   5464 		}
   5465 		break;
   5466             case XML_PARSER_START_TAG: {
   5467 	        const xmlChar *name;
   5468 		int failed;
   5469 		const htmlElemDesc * info;
   5470 
   5471 		if (avail < 2)
   5472 		    goto done;
   5473 		cur = in->cur[0];
                        /* Not markup: let the CONTENT state handle it. */
   5474 	        if (cur != '<') {
   5475 		    ctxt->instate = XML_PARSER_CONTENT;
   5476 #ifdef DEBUG_PUSH
   5477 		    xmlGenericError(xmlGenericErrorContext,
   5478 			    "HPP: entering CONTENT\n");
   5479 #endif
   5480 		    break;
   5481 		}
                        /* "</" belongs to the END_TAG state. */
   5482 		if (in->cur[1] == '/') {
   5483 		    ctxt->instate = XML_PARSER_END_TAG;
   5484 		    ctxt->checkIndex = 0;
   5485 #ifdef DEBUG_PUSH
   5486 		    xmlGenericError(xmlGenericErrorContext,
   5487 			    "HPP: entering END_TAG\n");
   5488 #endif
   5489 		    break;
   5490 		}
   5491 		if ((!terminate) &&
   5492 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5493 		    goto done;
   5494 
   5495 		failed = htmlParseStartTag(ctxt);
   5496 		name = ctxt->name;
                        /*
                         * The start tag could not be parsed or there is no current
                         * element name: step over a '>' if one is next and go
                         * round the loop again.
                         */
   5497 		if ((failed == -1) ||
   5498 		    (name == NULL)) {
   5499 		    if (CUR == '>')
   5500 			NEXT;
   5501 		    break;
   5502 		}
   5503 
   5504 		/*
   5505 		 * Lookup the info for that element.
   5506 		 */
   5507 		info = htmlTagLookup(name);
   5508 		if (info == NULL) {
   5509 		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
   5510 		                 "Tag %s invalid\n", name, NULL);
   5511 		}
   5512 
   5513 		/*
   5514 		 * Check for an Empty Element labeled the XML/SGML way
   5515 		 */
   5516 		if ((CUR == '/') && (NXT(1) == '>')) {
   5517 		    SKIP(2);
   5518 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   5519 			ctxt->sax->endElement(ctxt->userData, name);
   5520 		    htmlnamePop(ctxt);
   5521 		    ctxt->instate = XML_PARSER_CONTENT;
   5522 #ifdef DEBUG_PUSH
   5523 		    xmlGenericError(xmlGenericErrorContext,
   5524 			    "HPP: entering CONTENT\n");
   5525 #endif
   5526 		    break;
   5527 		}
   5528 
   5529 		if (CUR == '>') {
   5530 		    NEXT;
   5531 		} else {
   5532 		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
   5533 		                 "Couldn't find end of Start Tag %s\n",
   5534 				 name, NULL);
   5535 
   5536 		    /*
   5537 		     * end of parsing of this node.
   5538 		     */
   5539 		    if (xmlStrEqual(name, ctxt->name)) {
   5540 			nodePop(ctxt);
   5541 			htmlnamePop(ctxt);
   5542 		    }
   5543 
   5544 		    ctxt->instate = XML_PARSER_CONTENT;
   5545 #ifdef DEBUG_PUSH
   5546 		    xmlGenericError(xmlGenericErrorContext,
   5547 			    "HPP: entering CONTENT\n");
   5548 #endif
   5549 		    break;
   5550 		}
   5551 
   5552 		/*
   5553 		 * Check for an Empty Element from DTD definition
   5554 		 */
   5555 		if ((info != NULL) && (info->empty)) {
   5556 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   5557 			ctxt->sax->endElement(ctxt->userData, name);
   5558 		    htmlnamePop(ctxt);
   5559 		}
   5560 		ctxt->instate = XML_PARSER_CONTENT;
   5561 #ifdef DEBUG_PUSH
   5562 		xmlGenericError(xmlGenericErrorContext,
   5563 			"HPP: entering CONTENT\n");
   5564 #endif
   5565                 break;
   5566 	    }
   5567             case XML_PARSER_CONTENT: {
                        /* Value of ctxt->nbChars before this pass, compared again below. */
   5568 		long cons;
   5569                 /*
   5570 		 * Handle preparsed entities and charRef
   5571 		 */
   5572 		if (ctxt->token != 0) {
   5573 		    xmlChar chr[2] = { 0 , 0 } ;
   5574 
   5575 		    chr[0] = (xmlChar) ctxt->token;
   5576 		    htmlCheckParagraph(ctxt);
   5577 		    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
   5578 			ctxt->sax->characters(ctxt->userData, chr, 1);
   5579 		    ctxt->token = 0;
   5580 		    ctxt->checkIndex = 0;
   5581 		}
                        /*
                         * Exactly one byte left on the last chunk: when it starts
                         * neither markup nor a reference it is handed to SAX
                         * directly and consumed.
                         */
   5582 		if ((avail == 1) && (terminate)) {
   5583 		    cur = in->cur[0];
   5584 		    if ((cur != '<') && (cur != '&')) {
   5585 			if (ctxt->sax != NULL) {
   5586 			    if (IS_BLANK_CH(cur)) {
   5587 				if (ctxt->sax->ignorableWhitespace != NULL)
   5588 				    ctxt->sax->ignorableWhitespace(
   5589 					    ctxt->userData, &cur, 1);
   5590 			    } else {
   5591 				htmlCheckParagraph(ctxt);
   5592 				if (ctxt->sax->characters != NULL)
   5593 				    ctxt->sax->characters(
   5594 					    ctxt->userData, &cur, 1);
   5595 			    }
   5596 			}
   5597 			ctxt->token = 0;
   5598 			ctxt->checkIndex = 0;
   5599 			in->cur++;
   5600 			break;
   5601 		    }
   5602 		}
   5603 		if (avail < 2)
   5604 		    goto done;
   5605 		cur = in->cur[0];
   5606 		next = in->cur[1];
   5607 		cons = ctxt->nbChars;
   5608 		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
   5609 		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
   5610 		    /*
   5611 		     * Handle SCRIPT/STYLE separately
   5612 		     */
                            /*
                             * Unless this is the last chunk, wait until "</" is in
                             * the input and the byte following it is not zero.
                             */
   5613 		    if (!terminate) {
   5614 		        int idx;
   5615 			xmlChar val;
   5616 
   5617 			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
   5618 			if (idx < 0)
   5619 			    goto done;
   5620 		        val = in->cur[idx + 2];
   5621 			if (val == 0) /* bad cut of input */
   5622 			    goto done;
   5623 		    }
   5624 		    htmlParseScript(ctxt);
                            /* cur and next still hold the bytes read before htmlParseScript(). */
   5625 		    if ((cur == '<') && (next == '/')) {
   5626 			ctxt->instate = XML_PARSER_END_TAG;
   5627 			ctxt->checkIndex = 0;
   5628 #ifdef DEBUG_PUSH
   5629 			xmlGenericError(xmlGenericErrorContext,
   5630 				"HPP: entering END_TAG\n");
   5631 #endif
   5632 			break;
   5633 		    }
   5634 		} else {
   5635 		    /*
   5636 		     * Sometimes DOCTYPE arrives in the middle of the document
   5637 		     */
   5638 		    if ((cur == '<') && (next == '!') &&
   5639 			(UPP(2) == 'D') && (UPP(3) == 'O') &&
   5640 			(UPP(4) == 'C') && (UPP(5) == 'T') &&
   5641 			(UPP(6) == 'Y') && (UPP(7) == 'P') &&
   5642 			(UPP(8) == 'E')) {
   5643 			if ((!terminate) &&
   5644 			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5645 			    goto done;
   5646 			htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   5647 			             "Misplaced DOCTYPE declaration\n",
   5648 				     BAD_CAST "DOCTYPE" , NULL);
   5649 			htmlParseDocTypeDecl(ctxt);
   5650 		    } else if ((cur == '<') && (next == '!') &&
   5651 			(in->cur[2] == '-') && (in->cur[3] == '-')) {
   5652 			if ((!terminate) &&
   5653 			    (htmlParseLookupSequence(
   5654 				ctxt, '-', '-', '>', 1, 1) < 0))
   5655 			    goto done;
   5656 #ifdef DEBUG_PUSH
   5657 			xmlGenericError(xmlGenericErrorContext,
   5658 				"HPP: Parsing Comment\n");
   5659 #endif
   5660 			htmlParseComment(ctxt);
   5661 			ctxt->instate = XML_PARSER_CONTENT;
   5662 		    } else if ((cur == '<') && (next == '?')) {
   5663 			if ((!terminate) &&
   5664 			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5665 			    goto done;
   5666 #ifdef DEBUG_PUSH
   5667 			xmlGenericError(xmlGenericErrorContext,
   5668 				"HPP: Parsing PI\n");
   5669 #endif
   5670 			htmlParsePI(ctxt);
   5671 			ctxt->instate = XML_PARSER_CONTENT;
   5672 		    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
   5673 			goto done;
   5674 		    } else if ((cur == '<') && (next == '/')) {
   5675 			ctxt->instate = XML_PARSER_END_TAG;
   5676 			ctxt->checkIndex = 0;
   5677 #ifdef DEBUG_PUSH
   5678 			xmlGenericError(xmlGenericErrorContext,
   5679 				"HPP: entering END_TAG\n");
   5680 #endif
   5681 			break;
   5682 		    } else if (cur == '<') {
   5683 			ctxt->instate = XML_PARSER_START_TAG;
   5684 			ctxt->checkIndex = 0;
   5685 #ifdef DEBUG_PUSH
   5686 			xmlGenericError(xmlGenericErrorContext,
   5687 				"HPP: entering START_TAG\n");
   5688 #endif
   5689 			break;
   5690 		    } else if (cur == '&') {
                                /*
                                 * Unless this is the last chunk, wait until one of
                                 * ';', ' ', '>' or '/' is present in the input.
                                 */
   5691 			if ((!terminate) &&
   5692 			    (htmlParseLookupChars(ctxt,
   5693                                                   BAD_CAST "; >/", 4) < 0))
   5694 			    goto done;
   5695 #ifdef DEBUG_PUSH
   5696 			xmlGenericError(xmlGenericErrorContext,
   5697 				"HPP: Parsing Reference\n");
   5698 #endif
   5699 			/* TODO: check generation of subtrees if noent !!! */
   5700 			htmlParseReference(ctxt);
   5701 		    } else {
   5702 		        /*
   5703 			 * check that the text sequence is complete
   5704 			 * before handing out the data to the parser
   5705 			 * to avoid problems with erroneous end of
   5706 			 * data detection.
   5707 			 */
   5708 			if ((!terminate) &&
   5709                             (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
   5710 			    goto done;
   5711 			ctxt->checkIndex = 0;
   5712 #ifdef DEBUG_PUSH
   5713 			xmlGenericError(xmlGenericErrorContext,
   5714 				"HPP: Parsing char data\n");
   5715 #endif
   5716 			htmlParseCharData(ctxt);
   5717 		    }
   5718 		}
                        /*
                         * ctxt->nbChars did not change during this pass: report an
                         * error when a node is open and advance with NEXT
                         * (presumably so the loop cannot stall - confirm).
                         */
   5719 		if (cons == ctxt->nbChars) {
   5720 		    if (ctxt->node != NULL) {
   5721 			htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5722 			             "detected an error in element content\n",
   5723 				     NULL, NULL);
   5724 		    }
   5725 		    NEXT;
   5726 		    break;
   5727 		}
   5728 
   5729 		break;
   5730 	    }
   5731             case XML_PARSER_END_TAG:
   5732 		if (avail < 2)
   5733 		    goto done;
   5734 		if ((!terminate) &&
   5735 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
   5736 		    goto done;
   5737 		htmlParseEndTag(ctxt);
                        /* No element name left on the stack: EPILOG, otherwise CONTENT. */
   5738 		if (ctxt->nameNr == 0) {
   5739 		    ctxt->instate = XML_PARSER_EPILOG;
   5740 		} else {
   5741 		    ctxt->instate = XML_PARSER_CONTENT;
   5742 		}
   5743 		ctxt->checkIndex = 0;
   5744 #ifdef DEBUG_PUSH
   5745 		xmlGenericError(xmlGenericErrorContext,
   5746 			"HPP: entering CONTENT\n");
   5747 #endif
   5748 	        break;
                    /*
                     * Each of the remaining states is reported as an internal
                     * error; the parser then continues in CONTENT (START_TAG for
                     * ATTRIBUTE_VALUE) with checkIndex reset.
                     */
   5749             case XML_PARSER_CDATA_SECTION:
   5750 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5751 			"HPP: internal error, state == CDATA\n",
   5752 			     NULL, NULL);
   5753 		ctxt->instate = XML_PARSER_CONTENT;
   5754 		ctxt->checkIndex = 0;
   5755 #ifdef DEBUG_PUSH
   5756 		xmlGenericError(xmlGenericErrorContext,
   5757 			"HPP: entering CONTENT\n");
   5758 #endif
   5759 		break;
   5760             case XML_PARSER_DTD:
   5761 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5762 			"HPP: internal error, state == DTD\n",
   5763 			     NULL, NULL);
   5764 		ctxt->instate = XML_PARSER_CONTENT;
   5765 		ctxt->checkIndex = 0;
   5766 #ifdef DEBUG_PUSH
   5767 		xmlGenericError(xmlGenericErrorContext,
   5768 			"HPP: entering CONTENT\n");
   5769 #endif
   5770 		break;
   5771             case XML_PARSER_COMMENT:
   5772 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5773 			"HPP: internal error, state == COMMENT\n",
   5774 			     NULL, NULL);
   5775 		ctxt->instate = XML_PARSER_CONTENT;
   5776 		ctxt->checkIndex = 0;
   5777 #ifdef DEBUG_PUSH
   5778 		xmlGenericError(xmlGenericErrorContext,
   5779 			"HPP: entering CONTENT\n");
   5780 #endif
   5781 		break;
   5782             case XML_PARSER_PI:
   5783 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5784 			"HPP: internal error, state == PI\n",
   5785 			     NULL, NULL);
   5786 		ctxt->instate = XML_PARSER_CONTENT;
   5787 		ctxt->checkIndex = 0;
   5788 #ifdef DEBUG_PUSH
   5789 		xmlGenericError(xmlGenericErrorContext,
   5790 			"HPP: entering CONTENT\n");
   5791 #endif
   5792 		break;
   5793             case XML_PARSER_ENTITY_DECL:
   5794 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5795 			"HPP: internal error, state == ENTITY_DECL\n",
   5796 			     NULL, NULL);
   5797 		ctxt->instate = XML_PARSER_CONTENT;
   5798 		ctxt->checkIndex = 0;
   5799 #ifdef DEBUG_PUSH
   5800 		xmlGenericError(xmlGenericErrorContext,
   5801 			"HPP: entering CONTENT\n");
   5802 #endif
   5803 		break;
   5804             case XML_PARSER_ENTITY_VALUE:
   5805 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5806 			"HPP: internal error, state == ENTITY_VALUE\n",
   5807 			     NULL, NULL);
   5808 		ctxt->instate = XML_PARSER_CONTENT;
   5809 		ctxt->checkIndex = 0;
                        /* NOTE(review): the trace below says DTD although the state set above is CONTENT. */
   5810 #ifdef DEBUG_PUSH
   5811 		xmlGenericError(xmlGenericErrorContext,
   5812 			"HPP: entering DTD\n");
   5813 #endif
   5814 		break;
   5815             case XML_PARSER_ATTRIBUTE_VALUE:
   5816 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5817 			"HPP: internal error, state == ATTRIBUTE_VALUE\n",
   5818 			     NULL, NULL);
   5819 		ctxt->instate = XML_PARSER_START_TAG;
   5820 		ctxt->checkIndex = 0;
   5821 #ifdef DEBUG_PUSH
   5822 		xmlGenericError(xmlGenericErrorContext,
   5823 			"HPP: entering START_TAG\n");
   5824 #endif
   5825 		break;
   5826 	    case XML_PARSER_SYSTEM_LITERAL:
   5827 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5828 		    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
   5829 			     NULL, NULL);
   5830 		ctxt->instate = XML_PARSER_CONTENT;
   5831 		ctxt->checkIndex = 0;
   5832 #ifdef DEBUG_PUSH
   5833 		xmlGenericError(xmlGenericErrorContext,
   5834 			"HPP: entering CONTENT\n");
   5835 #endif
   5836 		break;
   5837 	    case XML_PARSER_IGNORE:
   5838 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5839 			"HPP: internal error, state == XML_PARSER_IGNORE\n",
   5840 			     NULL, NULL);
   5841 		ctxt->instate = XML_PARSER_CONTENT;
   5842 		ctxt->checkIndex = 0;
   5843 #ifdef DEBUG_PUSH
   5844 		xmlGenericError(xmlGenericErrorContext,
   5845 			"HPP: entering CONTENT\n");
   5846 #endif
   5847 		break;
   5848 	    case XML_PARSER_PUBLIC_LITERAL:
   5849 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5850 			"HPP: internal error, state == XML_PARSER_LITERAL\n",
   5851 			     NULL, NULL);
   5852 		ctxt->instate = XML_PARSER_CONTENT;
   5853 		ctxt->checkIndex = 0;
   5854 #ifdef DEBUG_PUSH
   5855 		xmlGenericError(xmlGenericErrorContext,
   5856 			"HPP: entering CONTENT\n");
   5857 #endif
   5858 		break;
   5859 
   5860 	}
   5861     }
   5862 done:
            /*
             * Same end-of-document handling as at the top of the loop, repeated
             * here for the paths that reach "done" with no input left.
             */
   5863     if ((avail == 0) && (terminate)) {
   5864 	htmlAutoCloseOnEnd(ctxt);
   5865 	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
   5866 	    /*
   5867 	     * SAX: end of the document processing.
   5868 	     */
   5869 	    ctxt->instate = XML_PARSER_EOF;
   5870 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
   5871 		ctxt->sax->endDocument(ctxt->userData);
   5872 	}
   5873     }
            /*
             * A document tree without an internal subset gets a default
             * HTML 4.0 Transitional one when parsing ends or EPILOG is reached.
             */
   5874     if ((ctxt->myDoc != NULL) &&
   5875 	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
   5876 	 (ctxt->instate == XML_PARSER_EPILOG))) {
   5877 	xmlDtdPtr dtd;
   5878 	dtd = xmlGetIntSubset(ctxt->myDoc);
   5879 	if (dtd == NULL)
   5880 	    ctxt->myDoc->intSubset =
   5881 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
   5882 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
   5883 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
   5884     }
   5885 #ifdef DEBUG_PUSH
   5886     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
   5887 #endif
   5888     return(ret);
   5889 }
   5890 
   5891 /**
   5892  * htmlParseChunk:
   5893  * @ctxt:  an HTML parser context
   5894  * @chunk:  a char array
   5895  * @size:  the size in bytes of the chunk
   5896  * @terminate:  last chunk indicator
   5897  *
   5898  * Parse a Chunk of memory
         *
         * When @size is positive, @chunk is not NULL, an input buffer is attached
         * and the parser is not in the EOF state, the chunk is appended with
         * xmlParserInputBufferPush() and the input's base, cur and end pointers
         * are recomputed from the buffer content. Otherwise, if the parser is not
         * in the EOF state and the input buffer has an encoder together with
         * both its buffer and raw members, xmlCharEncInFunc() is run on them.
         * htmlParseTryOrFinish() is then called. When @terminate is set the
         * parser ends in the EOF state: endDocument is reported if that state had
         * not been reached yet, and the document is flagged as not well-formed
         * unless parsing stopped in the EOF, EPILOG or MISC state.
   5899  *
   5900  * Returns zero if no error, the xmlParserErrors otherwise.
         *      XML_ERR_INTERNAL_ERROR is returned when @ctxt or its input is NULL,
         *      XML_ERR_INVALID_ENCODING when the encoder call fails, and
         *      XML_PARSER_EOF when the push into the input buffer fails.
   5901  */
   5902 int
   5903 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
   5904               int terminate) {
   5905     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   5906 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   5907 		     "htmlParseChunk: context error\n", NULL, NULL);
   5908 	return(XML_ERR_INTERNAL_ERROR);
   5909     }
   5910     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
   5911         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
                /*
                 * Save base and cur as offsets so they can be rebuilt after the
                 * push (presumably because the push may move the buffer content
                 * - confirm against xmlParserInputBufferPush).
                 */
   5912 	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
   5913 	int cur = ctxt->input->cur - ctxt->input->base;
   5914 	int res;
   5915 
   5916 	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
                /*
                 * NOTE(review): XML_PARSER_EOF is compared with ctxt->instate
                 * elsewhere in this function, yet here it is stored in errNo and
                 * returned as the result - confirm this is intended.
                 */
   5917 	if (res < 0) {
   5918 	    ctxt->errNo = XML_PARSER_EOF;
   5919 	    ctxt->disableSAX = 1;
   5920 	    return (XML_PARSER_EOF);
   5921 	}
   5922 	ctxt->input->base = ctxt->input->buf->buffer->content + base;
   5923 	ctxt->input->cur = ctxt->input->base + cur;
   5924 	ctxt->input->end =
   5925 	  &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
   5926 #ifdef DEBUG_PUSH
   5927 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
   5928 #endif
   5929 
   5930 #if 0
   5931 	if ((terminate) || (ctxt->input->buf->buffer->use > 80))
   5932 	    htmlParseTryOrFinish(ctxt, terminate);
   5933 #endif
   5934     } else if (ctxt->instate != XML_PARSER_EOF) {
                /* Nothing was pushed: run the encoder on the raw data if one is set. */
   5935 	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
   5936 	    xmlParserInputBufferPtr in = ctxt->input->buf;
   5937 	    if ((in->encoder != NULL) && (in->buffer != NULL) &&
   5938 		    (in->raw != NULL)) {
   5939 		int nbchars;
   5940 
   5941 		nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
   5942 		if (nbchars < 0) {
   5943 		    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
   5944 			         "encoder error\n", NULL, NULL);
   5945 		    return(XML_ERR_INVALID_ENCODING);
   5946 		}
   5947 	    }
   5948 	}
   5949     }
   5950     htmlParseTryOrFinish(ctxt, terminate);
   5951     if (terminate) {
                /* Ending in any state other than EOF, EPILOG or MISC is flagged as an error. */
   5952 	if ((ctxt->instate != XML_PARSER_EOF) &&
   5953 	    (ctxt->instate != XML_PARSER_EPILOG) &&
   5954 	    (ctxt->instate != XML_PARSER_MISC)) {
   5955 	    ctxt->errNo = XML_ERR_DOCUMENT_END;
   5956 	    ctxt->wellFormed = 0;
   5957 	}
                /* endDocument is only reported here when the EOF state was not reached yet. */
   5958 	if (ctxt->instate != XML_PARSER_EOF) {
   5959 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
   5960 		ctxt->sax->endDocument(ctxt->userData);
   5961 	}
   5962 	ctxt->instate = XML_PARSER_EOF;
   5963     }
   5964     return((xmlParserErrors) ctxt->errNo);
   5965 }
   5966 
   5967 /************************************************************************
   5968  *									*
   5969  *			User entry points				*
   5970  *									*
   5971  ************************************************************************/
   5972 
   5973 /**
   5974  * htmlCreatePushParserCtxt:
   5975  * @sax:  a SAX handler
   5976  * @user_data:  The user data returned on SAX callbacks
   5977  * @chunk:  a pointer to an array of chars
   5978  * @size:  number of chars in the array
   5979  * @filename:  an optional file name or URI
   5980  * @enc:  an optional encoding
   5981  *
   5982  * Create a parser context for using the HTML parser in push mode
   5983  * The value of @filename is used for fetching external entities
   5984  * and error/warning reports.
   5985  *
   5986  * Returns the new parser context or NULL
   5987  */
   5988 htmlParserCtxtPtr
   5989 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
   5990                          const char *chunk, int size, const char *filename,
   5991 			 xmlCharEncoding enc) {
   5992     htmlParserCtxtPtr ctxt;
   5993     htmlParserInputPtr inputStream;
   5994     xmlParserInputBufferPtr buf;
   5995 
   5996     xmlInitParser();
   5997 
   5998     buf = xmlAllocParserInputBuffer(enc);
   5999     if (buf == NULL) return(NULL);
   6000 
   6001     ctxt = htmlNewParserCtxt();
   6002     if (ctxt == NULL) {
   6003 	xmlFreeParserInputBuffer(buf);
   6004 	return(NULL);
   6005     }
   6006     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
   6007 	ctxt->charset=XML_CHAR_ENCODING_UTF8;
   6008     if (sax != NULL) {
   6009 	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
   6010 	    xmlFree(ctxt->sax);
   6011 	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
   6012 	if (ctxt->sax == NULL) {
   6013 	    xmlFree(buf);
   6014 	    xmlFree(ctxt);
   6015 	    return(NULL);
   6016 	}
   6017 	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
   6018 	if (user_data != NULL)
   6019 	    ctxt->userData = user_data;
   6020     }
   6021     if (filename == NULL) {
   6022 	ctxt->directory = NULL;
   6023     } else {
   6024         ctxt->directory = xmlParserGetDirectory(filename);
   6025     }
   6026 
   6027     inputStream = htmlNewInputStream(ctxt);
   6028     if (inputStream == NULL) {
   6029 	xmlFreeParserCtxt(ctxt);
   6030 	xmlFree(buf);
   6031 	return(NULL);
   6032     }
   6033 
   6034     if (filename == NULL)
   6035 	inputStream->filename = NULL;
   6036     else
   6037 	inputStream->filename = (char *)
   6038 	    xmlCanonicPath((const xmlChar *) filename);
   6039     inputStream->buf = buf;
   6040     inputStream->base = inputStream->buf->buffer->content;
   6041     inputStream->cur = inputStream->buf->buffer->content;
   6042     inputStream->end =
   6043 	&inputStream->buf->buffer->content[inputStream->buf->buffer->use];
   6044 
   6045     inputPush(ctxt, inputStream);
   6046 
   6047     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
   6048         (ctxt->input->buf != NULL))  {
   6049 	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
   6050 	int cur = ctxt->input->cur - ctxt->input->base;
   6051 
   6052 	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
   6053 
   6054 	ctxt->input->base = ctxt->input->buf->buffer->content + base;
   6055 	ctxt->input->cur = ctxt->input->base + cur;
   6056 	ctxt->input->end =
   6057 	    &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
   6058 #ifdef DEBUG_PUSH
   6059 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
   6060 #endif
   6061     }
   6062     ctxt->progressive = 1;
   6063 
   6064     return(ctxt);
   6065 }
   6066 #endif /* LIBXML_PUSH_ENABLED */
   6067 
   6068 /**
   6069  * htmlSAXParseDoc:
   6070  * @cur:  a pointer to an array of xmlChar
   6071  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   6072  * @sax:  the SAX handler block
   6073  * @userData: if using SAX, this pointer will be provided on callbacks.
   6074  *
   6075  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
   6076  * to handle parse events. If sax is NULL, fallback to the default DOM
   6077  * behavior and return a tree.
   6078  *
   6079  * Returns the resulting document tree unless SAX is NULL or the document is
   6080  *     not well formed.
   6081  */
   6082 
   6083 htmlDocPtr
   6084 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
   6085     htmlDocPtr ret;
   6086     htmlParserCtxtPtr ctxt;
   6087 
   6088     xmlInitParser();
   6089 
   6090     if (cur == NULL) return(NULL);
   6091 
   6092 
   6093     ctxt = htmlCreateDocParserCtxt(cur, encoding);
   6094     if (ctxt == NULL) return(NULL);
   6095     if (sax != NULL) {
   6096         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
   6097         ctxt->sax = sax;
   6098         ctxt->userData = userData;
   6099     }
   6100 
   6101     htmlParseDocument(ctxt);
   6102     ret = ctxt->myDoc;
   6103     if (sax != NULL) {
   6104 	ctxt->sax = NULL;
   6105 	ctxt->userData = NULL;
   6106     }
   6107     htmlFreeParserCtxt(ctxt);
   6108 
   6109     return(ret);
   6110 }
   6111 
   6112 /**
   6113  * htmlParseDoc:
   6114  * @cur:  a pointer to an array of xmlChar
   6115  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   6116  *
   6117  * parse an HTML in-memory document and build a tree.
   6118  *
   6119  * Returns the resulting document tree
   6120  */
   6121 
   6122 htmlDocPtr
   6123 htmlParseDoc(xmlChar *cur, const char *encoding) {
   6124     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
   6125 }
   6126 
   6127 
   6128 /**
   6129  * htmlCreateFileParserCtxt:
   6130  * @filename:  the filename
   6131  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   6132  *
   6133  * Create a parser context for a file content.
   6134  * Automatic support for ZLIB/Compress compressed document is provided
   6135  * by default if found at compile-time.
   6136  *
   6137  * Returns the new parser context or NULL
   6138  */
   6139 htmlParserCtxtPtr
   6140 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
   6141 {
   6142     htmlParserCtxtPtr ctxt;
   6143     htmlParserInputPtr inputStream;
   6144     char *canonicFilename;
   6145     /* htmlCharEncoding enc; */
   6146     xmlChar *content, *content_line = (xmlChar *) "charset=";
   6147 
   6148     if (filename == NULL)
   6149         return(NULL);
   6150 
   6151     ctxt = htmlNewParserCtxt();
   6152     if (ctxt == NULL) {
   6153 	return(NULL);
   6154     }
   6155     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
   6156     if (canonicFilename == NULL) {
   6157 #ifdef LIBXML_SAX1_ENABLED
   6158 	if (xmlDefaultSAXHandler.error != NULL) {
   6159 	    xmlDefaultSAXHandler.error(NULL, "out of memory\n");
   6160 	}
   6161 #endif
   6162 	xmlFreeParserCtxt(ctxt);
   6163 	return(NULL);
   6164     }
   6165 
   6166     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
   6167     xmlFree(canonicFilename);
   6168     if (inputStream == NULL) {
   6169 	xmlFreeParserCtxt(ctxt);
   6170 	return(NULL);
   6171     }
   6172 
   6173     inputPush(ctxt, inputStream);
   6174 
   6175     /* set encoding */
   6176     if (encoding) {
   6177         content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
   6178 	if (content) {
   6179 	    strcpy ((char *)content, (char *)content_line);
   6180             strcat ((char *)content, (char *)encoding);
   6181             htmlCheckEncoding (ctxt, content);
   6182 	    xmlFree (content);
   6183 	}
   6184     }
   6185 
   6186     return(ctxt);
   6187 }
   6188 
   6189 /**
   6190  * htmlSAXParseFile:
   6191  * @filename:  the filename
   6192  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   6193  * @sax:  the SAX handler block
   6194  * @userData: if using SAX, this pointer will be provided on callbacks.
   6195  *
   6196  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
   6197  * compressed document is provided by default if found at compile-time.
   6198  * It use the given SAX function block to handle the parsing callback.
   6199  * If sax is NULL, fallback to the default DOM tree building routines.
   6200  *
   6201  * Returns the resulting document tree unless SAX is NULL or the document is
   6202  *     not well formed.
   6203  */
   6204 
   6205 htmlDocPtr
   6206 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
   6207                  void *userData) {
   6208     htmlDocPtr ret;
   6209     htmlParserCtxtPtr ctxt;
   6210     htmlSAXHandlerPtr oldsax = NULL;
   6211 
   6212     xmlInitParser();
   6213 
   6214     ctxt = htmlCreateFileParserCtxt(filename, encoding);
   6215     if (ctxt == NULL) return(NULL);
   6216     if (sax != NULL) {
   6217 	oldsax = ctxt->sax;
   6218         ctxt->sax = sax;
   6219         ctxt->userData = userData;
   6220     }
   6221 
   6222     htmlParseDocument(ctxt);
   6223 
   6224     ret = ctxt->myDoc;
   6225     if (sax != NULL) {
   6226         ctxt->sax = oldsax;
   6227         ctxt->userData = NULL;
   6228     }
   6229     htmlFreeParserCtxt(ctxt);
   6230 
   6231     return(ret);
   6232 }
   6233 
   6234 /**
   6235  * htmlParseFile:
   6236  * @filename:  the filename
   6237  * @encoding:  a free form C string describing the HTML document encoding, or NULL
   6238  *
   6239  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
   6240  * compressed document is provided by default if found at compile-time.
   6241  *
   6242  * Returns the resulting document tree
   6243  */
   6244 
   6245 htmlDocPtr
   6246 htmlParseFile(const char *filename, const char *encoding) {
   6247     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
   6248 }
   6249 
   6250 /**
   6251  * htmlHandleOmittedElem:
   6252  * @val:  int 0 or 1
   6253  *
   6254  * Set and return the previous value for handling HTML omitted tags.
   6255  *
   6256  * Returns the last value for 0 for no handling, 1 for auto insertion.
   6257  */
   6258 
   6259 int
   6260 htmlHandleOmittedElem(int val) {
   6261     int old = htmlOmittedDefaultValue;
   6262 
   6263     htmlOmittedDefaultValue = val;
   6264     return(old);
   6265 }
   6266 
   6267 /**
   6268  * htmlElementAllowedHere:
   6269  * @parent: HTML parent element
   6270  * @elt: HTML element
   6271  *
   6272  * Checks whether an HTML element may be a direct child of a parent element.
   6273  * Note - doesn't check for deprecated elements
   6274  *
   6275  * Returns 1 if allowed; 0 otherwise.
   6276  */
   6277 int
   6278 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
   6279   const char** p ;
   6280 
   6281   if ( ! elt || ! parent || ! parent->subelts )
   6282 	return 0 ;
   6283 
   6284   for ( p = parent->subelts; *p; ++p )
   6285     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
   6286       return 1 ;
   6287 
   6288   return 0 ;
   6289 }
   6290 /**
   6291  * htmlElementStatusHere:
   6292  * @parent: HTML parent element
   6293  * @elt: HTML element
   6294  *
   6295  * Checks whether an HTML element may be a direct child of a parent element.
   6296  * and if so whether it is valid or deprecated.
   6297  *
   6298  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
   6299  */
   6300 htmlStatus
   6301 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
   6302   if ( ! parent || ! elt )
   6303     return HTML_INVALID ;
   6304   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
   6305     return HTML_INVALID ;
   6306 
   6307   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
   6308 }
   6309 /**
   6310  * htmlAttrAllowed:
   6311  * @elt: HTML element
   6312  * @attr: HTML attribute
   6313  * @legacy: whether to allow deprecated attributes
   6314  *
   6315  * Checks whether an attribute is valid for an element
   6316  * Has full knowledge of Required and Deprecated attributes
   6317  *
   6318  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
   6319  */
   6320 htmlStatus
   6321 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
   6322   const char** p ;
   6323 
   6324   if ( !elt || ! attr )
   6325 	return HTML_INVALID ;
   6326 
   6327   if ( elt->attrs_req )
   6328     for ( p = elt->attrs_req; *p; ++p)
   6329       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
   6330         return HTML_REQUIRED ;
   6331 
   6332   if ( elt->attrs_opt )
   6333     for ( p = elt->attrs_opt; *p; ++p)
   6334       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
   6335         return HTML_VALID ;
   6336 
   6337   if ( legacy && elt->attrs_depr )
   6338     for ( p = elt->attrs_depr; *p; ++p)
   6339       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
   6340         return HTML_DEPRECATED ;
   6341 
   6342   return HTML_INVALID ;
   6343 }
   6344 /**
   6345  * htmlNodeStatus:
   6346  * @node: an htmlNodePtr in a tree
   6347  * @legacy: whether to allow deprecated elements (YES is faster here
   6348  *	for Element nodes)
   6349  *
   6350  * Checks whether the tree node is valid.  Experimental (the author
   6351  *     only uses the HTML enhancements in a SAX parser)
   6352  *
   6353  * Return: for Element nodes, a return from htmlElementAllowedHere (if
   6354  *	legacy allowed) or htmlElementStatusHere (otherwise).
   6355  *	for Attribute nodes, a return from htmlAttrAllowed
   6356  *	for other nodes, HTML_NA (no checks performed)
   6357  */
   6358 htmlStatus
   6359 htmlNodeStatus(const htmlNodePtr node, int legacy) {
   6360   if ( ! node )
   6361     return HTML_INVALID ;
   6362 
   6363   switch ( node->type ) {
   6364     case XML_ELEMENT_NODE:
   6365       return legacy
   6366 	? ( htmlElementAllowedHere (
   6367 		htmlTagLookup(node->parent->name) , node->name
   6368 		) ? HTML_VALID : HTML_INVALID )
   6369 	: htmlElementStatusHere(
   6370 		htmlTagLookup(node->parent->name) ,
   6371 		htmlTagLookup(node->name) )
   6372 	;
   6373     case XML_ATTRIBUTE_NODE:
   6374       return htmlAttrAllowed(
   6375 	htmlTagLookup(node->parent->name) , node->name, legacy) ;
   6376     default: return HTML_NA ;
   6377   }
   6378 }
   6379 /************************************************************************
   6380  *									*
   6381  *	New set (2.6.0) of simpler and more flexible APIs		*
   6382  *									*
   6383  ************************************************************************/
   6384 /**
   6385  * DICT_FREE:
   6386  * @str:  a string
   6387  *
   6388  * Free a string if it is not owned by the "dict" dictionnary in the
   6389  * current scope
   6390  */
   6391 #define DICT_FREE(str)						\
   6392 	if ((str) && ((!dict) ||				\
   6393 	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
   6394 	    xmlFree((char *)(str));
   6395 
   6396 /**
   6397  * htmlCtxtReset:
   6398  * @ctxt: an HTML parser context
   6399  *
   6400  * Reset a parser context
   6401  */
   6402 void
   6403 htmlCtxtReset(htmlParserCtxtPtr ctxt)
   6404 {
   6405     xmlParserInputPtr input;
   6406     xmlDictPtr dict;
   6407 
   6408     if (ctxt == NULL)
   6409         return;
   6410 
   6411     xmlInitParser();
   6412     dict = ctxt->dict;
   6413 
   6414     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
   6415         xmlFreeInputStream(input);
   6416     }
   6417     ctxt->inputNr = 0;
   6418     ctxt->input = NULL;
   6419 
   6420     ctxt->spaceNr = 0;
   6421     if (ctxt->spaceTab != NULL) {
   6422 	ctxt->spaceTab[0] = -1;
   6423 	ctxt->space = &ctxt->spaceTab[0];
   6424     } else {
   6425 	ctxt->space = NULL;
   6426     }
   6427 
   6428 
   6429     ctxt->nodeNr = 0;
   6430     ctxt->node = NULL;
   6431 
   6432     ctxt->nameNr = 0;
   6433     ctxt->name = NULL;
   6434 
   6435     DICT_FREE(ctxt->version);
   6436     ctxt->version = NULL;
   6437     DICT_FREE(ctxt->encoding);
   6438     ctxt->encoding = NULL;
   6439     DICT_FREE(ctxt->directory);
   6440     ctxt->directory = NULL;
   6441     DICT_FREE(ctxt->extSubURI);
   6442     ctxt->extSubURI = NULL;
   6443     DICT_FREE(ctxt->extSubSystem);
   6444     ctxt->extSubSystem = NULL;
   6445     if (ctxt->myDoc != NULL)
   6446         xmlFreeDoc(ctxt->myDoc);
   6447     ctxt->myDoc = NULL;
   6448 
   6449     ctxt->standalone = -1;
   6450     ctxt->hasExternalSubset = 0;
   6451     ctxt->hasPErefs = 0;
   6452     ctxt->html = 1;
   6453     ctxt->external = 0;
   6454     ctxt->instate = XML_PARSER_START;
   6455     ctxt->token = 0;
   6456 
   6457     ctxt->wellFormed = 1;
   6458     ctxt->nsWellFormed = 1;
   6459     ctxt->disableSAX = 0;
   6460     ctxt->valid = 1;
   6461     ctxt->vctxt.userData = ctxt;
   6462     ctxt->vctxt.error = xmlParserValidityError;
   6463     ctxt->vctxt.warning = xmlParserValidityWarning;
   6464     ctxt->record_info = 0;
   6465     ctxt->nbChars = 0;
   6466     ctxt->checkIndex = 0;
   6467     ctxt->inSubset = 0;
   6468     ctxt->errNo = XML_ERR_OK;
   6469     ctxt->depth = 0;
   6470     ctxt->charset = XML_CHAR_ENCODING_NONE;
   6471     ctxt->catalogs = NULL;
   6472     xmlInitNodeInfoSeq(&ctxt->node_seq);
   6473 
   6474     if (ctxt->attsDefault != NULL) {
   6475         xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
   6476         ctxt->attsDefault = NULL;
   6477     }
   6478     if (ctxt->attsSpecial != NULL) {
   6479         xmlHashFree(ctxt->attsSpecial, NULL);
   6480         ctxt->attsSpecial = NULL;
   6481     }
   6482 }
   6483 
   6484 /**
   6485  * htmlCtxtUseOptions:
   6486  * @ctxt: an HTML parser context
   6487  * @options:  a combination of htmlParserOption(s)
   6488  *
   6489  * Applies the options to the parser context
   6490  *
   6491  * Returns 0 in case of success, the set of unknown or unimplemented options
   6492  *         in case of error.
   6493  */
   6494 int
   6495 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
   6496 {
   6497     if (ctxt == NULL)
   6498         return(-1);
   6499 
   6500     if (options & HTML_PARSE_NOWARNING) {
   6501         ctxt->sax->warning = NULL;
   6502         ctxt->vctxt.warning = NULL;
   6503         options -= XML_PARSE_NOWARNING;
   6504 	ctxt->options |= XML_PARSE_NOWARNING;
   6505     }
   6506     if (options & HTML_PARSE_NOERROR) {
   6507         ctxt->sax->error = NULL;
   6508         ctxt->vctxt.error = NULL;
   6509         ctxt->sax->fatalError = NULL;
   6510         options -= XML_PARSE_NOERROR;
   6511 	ctxt->options |= XML_PARSE_NOERROR;
   6512     }
   6513     if (options & HTML_PARSE_PEDANTIC) {
   6514         ctxt->pedantic = 1;
   6515         options -= XML_PARSE_PEDANTIC;
   6516 	ctxt->options |= XML_PARSE_PEDANTIC;
   6517     } else
   6518         ctxt->pedantic = 0;
   6519     if (options & XML_PARSE_NOBLANKS) {
   6520         ctxt->keepBlanks = 0;
   6521         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
   6522         options -= XML_PARSE_NOBLANKS;
   6523 	ctxt->options |= XML_PARSE_NOBLANKS;
   6524     } else
   6525         ctxt->keepBlanks = 1;
   6526     if (options & HTML_PARSE_RECOVER) {
   6527         ctxt->recovery = 1;
   6528 	options -= HTML_PARSE_RECOVER;
   6529     } else
   6530         ctxt->recovery = 0;
   6531     if (options & HTML_PARSE_COMPACT) {
   6532 	ctxt->options |= HTML_PARSE_COMPACT;
   6533         options -= HTML_PARSE_COMPACT;
   6534     }
   6535     if (options & XML_PARSE_HUGE) {
   6536 	ctxt->options |= XML_PARSE_HUGE;
   6537         options -= XML_PARSE_HUGE;
   6538     }
   6539     if (options & HTML_PARSE_NODEFDTD) {
   6540 	ctxt->options |= HTML_PARSE_NODEFDTD;
   6541         options -= HTML_PARSE_NODEFDTD;
   6542     }
   6543     if (options & HTML_PARSE_IGNORE_ENC) {
   6544 	ctxt->options |= HTML_PARSE_IGNORE_ENC;
   6545         options -= HTML_PARSE_IGNORE_ENC;
   6546     }
   6547     ctxt->dictNames = 0;
   6548     return (options);
   6549 }
   6550 
   6551 /**
   6552  * htmlDoRead:
   6553  * @ctxt:  an HTML parser context
   6554  * @URL:  the base URL to use for the document
   6555  * @encoding:  the document encoding, or NULL
   6556  * @options:  a combination of htmlParserOption(s)
   6557  * @reuse:  keep the context for reuse
   6558  *
   6559  * Common front-end for the htmlRead functions
   6560  *
   6561  * Returns the resulting document tree or NULL
   6562  */
   6563 static htmlDocPtr
   6564 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
   6565           int options, int reuse)
   6566 {
   6567     htmlDocPtr ret;
   6568 
   6569     htmlCtxtUseOptions(ctxt, options);
   6570     ctxt->html = 1;
   6571     if (encoding != NULL) {
   6572         xmlCharEncodingHandlerPtr hdlr;
   6573 
   6574 	hdlr = xmlFindCharEncodingHandler(encoding);
   6575 	if (hdlr != NULL) {
   6576 	    xmlSwitchToEncoding(ctxt, hdlr);
   6577 	    if (ctxt->input->encoding != NULL)
   6578 	      xmlFree((xmlChar *) ctxt->input->encoding);
   6579             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
   6580         }
   6581     }
   6582     if ((URL != NULL) && (ctxt->input != NULL) &&
   6583         (ctxt->input->filename == NULL))
   6584         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
   6585     htmlParseDocument(ctxt);
   6586     ret = ctxt->myDoc;
   6587     ctxt->myDoc = NULL;
   6588     if (!reuse) {
   6589         if ((ctxt->dictNames) &&
   6590 	    (ret != NULL) &&
   6591 	    (ret->dict == ctxt->dict))
   6592 	    ctxt->dict = NULL;
   6593 	xmlFreeParserCtxt(ctxt);
   6594     }
   6595     return (ret);
   6596 }
   6597 
   6598 /**
   6599  * htmlReadDoc:
   6600  * @cur:  a pointer to a zero terminated string
   6601  * @URL:  the base URL to use for the document
   6602  * @encoding:  the document encoding, or NULL
   6603  * @options:  a combination of htmlParserOption(s)
   6604  *
   6605  * parse an XML in-memory document and build a tree.
   6606  *
   6607  * Returns the resulting document tree
   6608  */
   6609 htmlDocPtr
   6610 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
   6611 {
   6612     htmlParserCtxtPtr ctxt;
   6613 
   6614     if (cur == NULL)
   6615         return (NULL);
   6616 
   6617     xmlInitParser();
   6618     ctxt = htmlCreateDocParserCtxt(cur, NULL);
   6619     if (ctxt == NULL)
   6620         return (NULL);
   6621     return (htmlDoRead(ctxt, URL, encoding, options, 0));
   6622 }
   6623 
   6624 /**
   6625  * htmlReadFile:
   6626  * @filename:  a file or URL
   6627  * @encoding:  the document encoding, or NULL
   6628  * @options:  a combination of htmlParserOption(s)
   6629  *
   6630  * parse an XML file from the filesystem or the network.
   6631  *
   6632  * Returns the resulting document tree
   6633  */
   6634 htmlDocPtr
   6635 htmlReadFile(const char *filename, const char *encoding, int options)
   6636 {
   6637     htmlParserCtxtPtr ctxt;
   6638 
   6639     xmlInitParser();
   6640     ctxt = htmlCreateFileParserCtxt(filename, encoding);
   6641     if (ctxt == NULL)
   6642         return (NULL);
   6643     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
   6644 }
   6645 
   6646 /**
   6647  * htmlReadMemory:
   6648  * @buffer:  a pointer to a char array
   6649  * @size:  the size of the array
   6650  * @URL:  the base URL to use for the document
   6651  * @encoding:  the document encoding, or NULL
   6652  * @options:  a combination of htmlParserOption(s)
   6653  *
   6654  * parse an XML in-memory document and build a tree.
   6655  *
   6656  * Returns the resulting document tree
   6657  */
   6658 htmlDocPtr
   6659 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
   6660 {
   6661     htmlParserCtxtPtr ctxt;
   6662 
   6663     xmlInitParser();
   6664     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
   6665     if (ctxt == NULL)
   6666         return (NULL);
   6667     htmlDefaultSAXHandlerInit();
   6668     if (ctxt->sax != NULL)
   6669         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
   6670     return (htmlDoRead(ctxt, URL, encoding, options, 0));
   6671 }
   6672 
   6673 /**
   6674  * htmlReadFd:
   6675  * @fd:  an open file descriptor
   6676  * @URL:  the base URL to use for the document
   6677  * @encoding:  the document encoding, or NULL
   6678  * @options:  a combination of htmlParserOption(s)
   6679  *
   6680  * parse an XML from a file descriptor and build a tree.
   6681  *
   6682  * Returns the resulting document tree
   6683  */
   6684 htmlDocPtr
   6685 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
   6686 {
   6687     htmlParserCtxtPtr ctxt;
   6688     xmlParserInputBufferPtr input;
   6689     xmlParserInputPtr stream;
   6690 
   6691     if (fd < 0)
   6692         return (NULL);
   6693 
   6694     xmlInitParser();
   6695     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
   6696     if (input == NULL)
   6697         return (NULL);
   6698     ctxt = xmlNewParserCtxt();
   6699     if (ctxt == NULL) {
   6700         xmlFreeParserInputBuffer(input);
   6701         return (NULL);
   6702     }
   6703     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
   6704     if (stream == NULL) {
   6705         xmlFreeParserInputBuffer(input);
   6706 	xmlFreeParserCtxt(ctxt);
   6707         return (NULL);
   6708     }
   6709     inputPush(ctxt, stream);
   6710     return (htmlDoRead(ctxt, URL, encoding, options, 0));
   6711 }
   6712 
   6713 /**
   6714  * htmlReadIO:
   6715  * @ioread:  an I/O read function
   6716  * @ioclose:  an I/O close function
   6717  * @ioctx:  an I/O handler
   6718  * @URL:  the base URL to use for the document
   6719  * @encoding:  the document encoding, or NULL
   6720  * @options:  a combination of htmlParserOption(s)
   6721  *
   6722  * parse an HTML document from I/O functions and source and build a tree.
   6723  *
   6724  * Returns the resulting document tree
   6725  */
   6726 htmlDocPtr
   6727 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
   6728           void *ioctx, const char *URL, const char *encoding, int options)
   6729 {
   6730     htmlParserCtxtPtr ctxt;
   6731     xmlParserInputBufferPtr input;
   6732     xmlParserInputPtr stream;
   6733 
   6734     if (ioread == NULL)
   6735         return (NULL);
   6736     xmlInitParser();
   6737 
   6738     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
   6739                                          XML_CHAR_ENCODING_NONE);
   6740     if (input == NULL)
   6741         return (NULL);
   6742     ctxt = htmlNewParserCtxt();
   6743     if (ctxt == NULL) {
   6744         xmlFreeParserInputBuffer(input);
   6745         return (NULL);
   6746     }
   6747     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
   6748     if (stream == NULL) {
   6749         xmlFreeParserInputBuffer(input);
   6750 	xmlFreeParserCtxt(ctxt);
   6751         return (NULL);
   6752     }
   6753     inputPush(ctxt, stream);
   6754     return (htmlDoRead(ctxt, URL, encoding, options, 0));
   6755 }
   6756 
   6757 /**
   6758  * htmlCtxtReadDoc:
   6759  * @ctxt:  an HTML parser context
   6760  * @cur:  a pointer to a zero terminated string
   6761  * @URL:  the base URL to use for the document
   6762  * @encoding:  the document encoding, or NULL
   6763  * @options:  a combination of htmlParserOption(s)
   6764  *
   6765  * parse an XML in-memory document and build a tree.
   6766  * This reuses the existing @ctxt parser context
   6767  *
   6768  * Returns the resulting document tree
   6769  */
   6770 htmlDocPtr
   6771 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
   6772                const char *URL, const char *encoding, int options)
   6773 {
   6774     xmlParserInputPtr stream;
   6775 
   6776     if (cur == NULL)
   6777         return (NULL);
   6778     if (ctxt == NULL)
   6779         return (NULL);
   6780 
   6781     htmlCtxtReset(ctxt);
   6782 
   6783     stream = xmlNewStringInputStream(ctxt, cur);
   6784     if (stream == NULL) {
   6785         return (NULL);
   6786     }
   6787     inputPush(ctxt, stream);
   6788     return (htmlDoRead(ctxt, URL, encoding, options, 1));
   6789 }
   6790 
   6791 /**
   6792  * htmlCtxtReadFile:
   6793  * @ctxt:  an HTML parser context
   6794  * @filename:  a file or URL
   6795  * @encoding:  the document encoding, or NULL
   6796  * @options:  a combination of htmlParserOption(s)
   6797  *
   6798  * parse an XML file from the filesystem or the network.
   6799  * This reuses the existing @ctxt parser context
   6800  *
   6801  * Returns the resulting document tree
   6802  */
   6803 htmlDocPtr
   6804 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
   6805                 const char *encoding, int options)
   6806 {
   6807     xmlParserInputPtr stream;
   6808 
   6809     if (filename == NULL)
   6810         return (NULL);
   6811     if (ctxt == NULL)
   6812         return (NULL);
   6813 
   6814     htmlCtxtReset(ctxt);
   6815 
   6816     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
   6817     if (stream == NULL) {
   6818         return (NULL);
   6819     }
   6820     inputPush(ctxt, stream);
   6821     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
   6822 }
   6823 
   6824 /**
   6825  * htmlCtxtReadMemory:
   6826  * @ctxt:  an HTML parser context
   6827  * @buffer:  a pointer to a char array
   6828  * @size:  the size of the array
   6829  * @URL:  the base URL to use for the document
   6830  * @encoding:  the document encoding, or NULL
   6831  * @options:  a combination of htmlParserOption(s)
   6832  *
   6833  * parse an XML in-memory document and build a tree.
   6834  * This reuses the existing @ctxt parser context
   6835  *
   6836  * Returns the resulting document tree
   6837  */
   6838 htmlDocPtr
   6839 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
   6840                   const char *URL, const char *encoding, int options)
   6841 {
   6842     xmlParserInputBufferPtr input;
   6843     xmlParserInputPtr stream;
   6844 
   6845     if (ctxt == NULL)
   6846         return (NULL);
   6847     if (buffer == NULL)
   6848         return (NULL);
   6849 
   6850     htmlCtxtReset(ctxt);
   6851 
   6852     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
   6853     if (input == NULL) {
   6854 	return(NULL);
   6855     }
   6856 
   6857     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
   6858     if (stream == NULL) {
   6859 	xmlFreeParserInputBuffer(input);
   6860 	return(NULL);
   6861     }
   6862 
   6863     inputPush(ctxt, stream);
   6864     return (htmlDoRead(ctxt, URL, encoding, options, 1));
   6865 }
   6866 
   6867 /**
   6868  * htmlCtxtReadFd:
   6869  * @ctxt:  an HTML parser context
   6870  * @fd:  an open file descriptor
   6871  * @URL:  the base URL to use for the document
   6872  * @encoding:  the document encoding, or NULL
   6873  * @options:  a combination of htmlParserOption(s)
   6874  *
   6875  * parse an XML from a file descriptor and build a tree.
   6876  * This reuses the existing @ctxt parser context
   6877  *
   6878  * Returns the resulting document tree
   6879  */
   6880 htmlDocPtr
   6881 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
   6882               const char *URL, const char *encoding, int options)
   6883 {
   6884     xmlParserInputBufferPtr input;
   6885     xmlParserInputPtr stream;
   6886 
   6887     if (fd < 0)
   6888         return (NULL);
   6889     if (ctxt == NULL)
   6890         return (NULL);
   6891 
   6892     htmlCtxtReset(ctxt);
   6893 
   6894 
   6895     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
   6896     if (input == NULL)
   6897         return (NULL);
   6898     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
   6899     if (stream == NULL) {
   6900         xmlFreeParserInputBuffer(input);
   6901         return (NULL);
   6902     }
   6903     inputPush(ctxt, stream);
   6904     return (htmlDoRead(ctxt, URL, encoding, options, 1));
   6905 }
   6906 
   6907 /**
   6908  * htmlCtxtReadIO:
   6909  * @ctxt:  an HTML parser context
   6910  * @ioread:  an I/O read function
   6911  * @ioclose:  an I/O close function
   6912  * @ioctx:  an I/O handler
   6913  * @URL:  the base URL to use for the document
   6914  * @encoding:  the document encoding, or NULL
   6915  * @options:  a combination of htmlParserOption(s)
   6916  *
   6917  * parse an HTML document from I/O functions and source and build a tree.
   6918  * This reuses the existing @ctxt parser context
   6919  *
   6920  * Returns the resulting document tree
   6921  */
   6922 htmlDocPtr
   6923 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
   6924               xmlInputCloseCallback ioclose, void *ioctx,
   6925 	      const char *URL,
   6926               const char *encoding, int options)
   6927 {
   6928     xmlParserInputBufferPtr input;
   6929     xmlParserInputPtr stream;
   6930 
   6931     if (ioread == NULL)
   6932         return (NULL);
   6933     if (ctxt == NULL)
   6934         return (NULL);
   6935 
   6936     htmlCtxtReset(ctxt);
   6937 
   6938     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
   6939                                          XML_CHAR_ENCODING_NONE);
   6940     if (input == NULL)
   6941         return (NULL);
   6942     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
   6943     if (stream == NULL) {
   6944         xmlFreeParserInputBuffer(input);
   6945         return (NULL);
   6946     }
   6947     inputPush(ctxt, stream);
   6948     return (htmlDoRead(ctxt, URL, encoding, options, 1));
   6949 }
   6950 
   6951 #define bottom_HTMLparser
   6952 #include "elfgcchack.h"
   6953 #endif /* LIBXML_HTML_ENABLED */
   6954