Home | History | Annotate | Download | only in libxml2
      1 /*
      2  * HTMLparser.c : an HTML 4.0 non-verifying parser
      3  *
      4  * See Copyright for the status of this software.
      5  *
      6  * daniel (at) veillard.com
      7  */
      8 
      9 #define IN_LIBXML
     10 #include "libxml.h"
     11 #ifdef LIBXML_HTML_ENABLED
     12 
     13 #include <string.h>
     14 #ifdef HAVE_CTYPE_H
     15 #include <ctype.h>
     16 #endif
     17 #ifdef HAVE_STDLIB_H
     18 #include <stdlib.h>
     19 #endif
     20 #ifdef HAVE_SYS_STAT_H
     21 #include <sys/stat.h>
     22 #endif
     23 #ifdef HAVE_FCNTL_H
     24 #include <fcntl.h>
     25 #endif
     26 #ifdef HAVE_UNISTD_H
     27 #include <unistd.h>
     28 #endif
     29 #ifdef HAVE_ZLIB_H
     30 #include <zlib.h>
     31 #endif
     32 
     33 #include <libxml/xmlmemory.h>
     34 #include <libxml/tree.h>
     35 #include <libxml/parser.h>
     36 #include <libxml/parserInternals.h>
     37 #include <libxml/xmlerror.h>
     38 #include <libxml/HTMLparser.h>
     39 #include <libxml/HTMLtree.h>
     40 #include <libxml/entities.h>
     41 #include <libxml/encoding.h>
     42 #include <libxml/valid.h>
     43 #include <libxml/xmlIO.h>
     44 #include <libxml/globals.h>
     45 #include <libxml/uri.h>
     46 
/*
 * Size constants for the HTML parser. Their users are not visible in this
 * part of the file; presumably they bound name lengths and size the
 * parser's scratch buffers -- TODO confirm against the callers.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* Debug switches, both disabled here. */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/*
 * File-local flag, initialised to 1.
 * NOTE(review): its readers are outside this part of the file; the name
 * suggests it controls defaulting of omitted values -- confirm.
 */
static int htmlOmittedDefaultValue = 1;

/* Forward declarations; the definitions are not in this part of the file. */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
			     xmlChar end, xmlChar  end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);
     59 
     60 /************************************************************************
     61  *									*
     62  *		Some factorized error routines				*
     63  *									*
     64  ************************************************************************/
     65 
     66 /**
     67  * htmlErrMemory:
     68  * @ctxt:  an HTML parser context
     69  * @extra:  extra informations
     70  *
     71  * Handle a redefinition of attribute error
     72  */
     73 static void
     74 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
     75 {
     76     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
     77         (ctxt->instate == XML_PARSER_EOF))
     78 	return;
     79     if (ctxt != NULL) {
     80         ctxt->errNo = XML_ERR_NO_MEMORY;
     81         ctxt->instate = XML_PARSER_EOF;
     82         ctxt->disableSAX = 1;
     83     }
     84     if (extra)
     85         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
     86                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
     87                         NULL, NULL, 0, 0,
     88                         "Memory allocation failed : %s\n", extra);
     89     else
     90         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
     91                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
     92                         NULL, NULL, 0, 0, "Memory allocation failed\n");
     93 }
     94 
     95 /**
     96  * htmlParseErr:
     97  * @ctxt:  an HTML parser context
     98  * @error:  the error number
     99  * @msg:  the error message
    100  * @str1:  string infor
    101  * @str2:  string infor
    102  *
    103  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
    104  */
    105 static void
    106 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
    107              const char *msg, const xmlChar *str1, const xmlChar *str2)
    108 {
    109     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
    110         (ctxt->instate == XML_PARSER_EOF))
    111 	return;
    112     if (ctxt != NULL)
    113 	ctxt->errNo = error;
    114     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
    115                     XML_ERR_ERROR, NULL, 0,
    116 		    (const char *) str1, (const char *) str2,
    117 		    NULL, 0, 0,
    118 		    msg, str1, str2);
    119     if (ctxt != NULL)
    120 	ctxt->wellFormed = 0;
    121 }
    122 
    123 /**
    124  * htmlParseErrInt:
    125  * @ctxt:  an HTML parser context
    126  * @error:  the error number
    127  * @msg:  the error message
    128  * @val:  integer info
    129  *
    130  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
    131  */
    132 static void
    133 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
    134              const char *msg, int val)
    135 {
    136     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
    137         (ctxt->instate == XML_PARSER_EOF))
    138 	return;
    139     if (ctxt != NULL)
    140 	ctxt->errNo = error;
    141     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
    142                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
    143 		    NULL, val, 0, msg, val);
    144     if (ctxt != NULL)
    145 	ctxt->wellFormed = 0;
    146 }
    147 
    148 /************************************************************************
    149  *									*
    150  *	Parser stacks related functions and macros		*
    151  *									*
    152  ************************************************************************/
    153 
    154 /**
    155  * htmlnamePush:
    156  * @ctxt:  an HTML parser context
    157  * @value:  the element name
    158  *
    159  * Pushes a new element name on top of the name stack
    160  *
    161  * Returns 0 in case of error, the index in the stack otherwise
    162  */
    163 static int
    164 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
    165 {
    166     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
    167         ctxt->html = 3;
    168     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
    169         ctxt->html = 10;
    170     if (ctxt->nameNr >= ctxt->nameMax) {
    171         ctxt->nameMax *= 2;
    172         ctxt->nameTab = (const xmlChar * *)
    173                          xmlRealloc((xmlChar * *)ctxt->nameTab,
    174                                     ctxt->nameMax *
    175                                     sizeof(ctxt->nameTab[0]));
    176         if (ctxt->nameTab == NULL) {
    177             htmlErrMemory(ctxt, NULL);
    178             return (0);
    179         }
    180     }
    181     ctxt->nameTab[ctxt->nameNr] = value;
    182     ctxt->name = value;
    183     return (ctxt->nameNr++);
    184 }
    185 /**
    186  * htmlnamePop:
    187  * @ctxt: an HTML parser context
    188  *
    189  * Pops the top element name from the name stack
    190  *
    191  * Returns the name just removed
    192  */
    193 static const xmlChar *
    194 htmlnamePop(htmlParserCtxtPtr ctxt)
    195 {
    196     const xmlChar *ret;
    197 
    198     if (ctxt->nameNr <= 0)
    199         return (NULL);
    200     ctxt->nameNr--;
    201     if (ctxt->nameNr < 0)
    202         return (NULL);
    203     if (ctxt->nameNr > 0)
    204         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
    205     else
    206         ctxt->name = NULL;
    207     ret = ctxt->nameTab[ctxt->nameNr];
    208     ctxt->nameTab[ctxt->nameNr] = NULL;
    209     return (ret);
    210 }
    211 
    212 /**
    213  * htmlNodeInfoPush:
    214  * @ctxt:  an HTML parser context
    215  * @value:  the node info
    216  *
    217  * Pushes a new element name on top of the node info stack
    218  *
    219  * Returns 0 in case of error, the index in the stack otherwise
    220  */
    221 static int
    222 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
    223 {
    224     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
    225         if (ctxt->nodeInfoMax == 0)
    226                 ctxt->nodeInfoMax = 5;
    227         ctxt->nodeInfoMax *= 2;
    228         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
    229                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
    230                                     ctxt->nodeInfoMax *
    231                                     sizeof(ctxt->nodeInfoTab[0]));
    232         if (ctxt->nodeInfoTab == NULL) {
    233             htmlErrMemory(ctxt, NULL);
    234             return (0);
    235         }
    236     }
    237     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
    238     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
    239     return (ctxt->nodeInfoNr++);
    240 }
    241 
    242 /**
    243  * htmlNodeInfoPop:
    244  * @ctxt:  an HTML parser context
    245  *
    246  * Pops the top element name from the node info stack
    247  *
    248  * Returns 0 in case of error, the pointer to NodeInfo otherwise
    249  */
    250 static htmlParserNodeInfo *
    251 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
    252 {
    253     if (ctxt->nodeInfoNr <= 0)
    254         return (NULL);
    255     ctxt->nodeInfoNr--;
    256     if (ctxt->nodeInfoNr < 0)
    257         return (NULL);
    258     if (ctxt->nodeInfoNr > 0)
    259         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
    260     else
    261         ctxt->nodeInfo = NULL;
    262     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
    263 }
    264 
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named `ctxt` in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR the current pointer to the xmlChar to be parsed.
 *   CUR     the current xmlChar value converted to int. It should be used
 *           by the parser only to compare to ASCII values, otherwise it
 *           would break when running with UTF-8 encoding.
 *   NXT(n)  the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare on ASCII based substrings.
 *   UPPER   the current xmlChar converted to uppercase.
 *   UPP(n)  the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare on ASCII based substrings.
 *   SKIP(n) skip n xmlChar; must only be used to skip ASCII defined
 *           strings without newlines (it does not update the line count).
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT the current xmlChar value converted to int.
 *   NEXT    skip to the next character through xmlNextChar().
 *   NEXTL(l) skip the current character, which is l xmlChars long.
 *   COPY_BUF(l,b,i,v) store character v, l xmlChars long, at b[i] and
 *           advance i accordingly.
 */

/* Current input byte, uppercased. */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance the input pointer, the char counter and the column by val. */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

/* Input byte at offset val from the current position. */
#define NXT(val) ctxt->input->cur[(val)]

/* Input byte at offset val from the current position, uppercased. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* The current input pointer itself. */
#define CUR_PTR ctxt->input->cur

/*
 * Shrink the input buffer once more than 2 * INPUT_CHUNK bytes lie behind
 * the current position and fewer than 2 * INPUT_CHUNK bytes remain ahead.
 */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

/*
 * When ctxt->progressive is 0 and fewer than INPUT_CHUNK bytes remain
 * ahead, ask for more input.
 */
#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current input byte as an int. */
#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* -1 while a token is pending in ctxt->token, the current byte otherwise. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/*
 * Step over the current character: update line/column from the byte at
 * the current position, clear any pending token, advance the input
 * pointer by l bytes and count one character.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* Fetch the current character; its byte length is stored in l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * Append character v (l bytes long) to buffer b at index i.
 * NOTE(review): this expands to a bare if/else rather than a
 * do { } while (0) block, so it is unsafe as the body of an unbraced
 * if that has its own else; the arguments are also not parenthesized.
 */
#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v)
    344 
    345 /**
    346  * htmlFindEncoding:
    347  * @the HTML parser context
    348  *
    349  * Ty to find and encoding in the current data available in the input
    350  * buffer this is needed to try to switch to the proper encoding when
    351  * one face a character error.
    352  * That's an heuristic, since it's operating outside of parsing it could
    353  * try to use a meta which had been commented out, that's the reason it
    354  * should only be used in case of error, not as a default.
    355  *
    356  * Returns an encoding string or NULL if not found, the string need to
    357  *   be freed
    358  */
    359 static xmlChar *
    360 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
    361     const xmlChar *start, *cur, *end;
    362 
    363     if ((ctxt == NULL) || (ctxt->input == NULL) ||
    364         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
    365         (ctxt->input->buf->encoder != NULL))
    366         return(NULL);
    367     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
    368         return(NULL);
    369 
    370     start = ctxt->input->cur;
    371     end = ctxt->input->end;
    372     /* we also expect the input buffer to be zero terminated */
    373     if (*end != 0)
    374         return(NULL);
    375 
    376     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
    377     if (cur == NULL)
    378         return(NULL);
    379     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
    380     if (cur == NULL)
    381         return(NULL);
    382     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
    383     if (cur == NULL)
    384         return(NULL);
    385     cur += 8;
    386     start = cur;
    387     while (((*cur >= 'A') && (*cur <= 'Z')) ||
    388            ((*cur >= 'a') && (*cur <= 'z')) ||
    389            ((*cur >= '0') && (*cur <= '9')) ||
    390            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
    391            cur++;
    392     if (cur == start)
    393         return(NULL);
    394     return(xmlStrndup(start, cur - start));
    395 }
    396 
/**
 * htmlCurrentChar:
 * @ctxt:  the HTML parser context
 * @len:  pointer to the length of the char read
 *
 * The current char value, if using UTF-8 this may actually span multiple
 * bytes in the input buffer.
 * If the encoding is unspecified and a byte >= 0x80 is found, an encoding
 * is guessed with htmlFindEncoding(); when no guess is available the
 * ISO-Latin-1 converter is plugged in automatically.
 *
 * Note: when the parser is in the XML_PARSER_EOF state the function
 * returns 0 without writing to @len.
 *
 * Returns the current char value and its length
 */

static int
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
    /* Parser stopped: nothing to read, *len is left untouched. */
    if (ctxt->instate == XML_PARSER_EOF)
	return(0);

    /* A pending token takes precedence and consumes no input bytes. */
    if (ctxt->token != 0) {
	*len = 0;
	return(ctxt->token);
    }
    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
	/*
	 * We are supposed to handle UTF8, check it's valid
	 * From rfc2044: encoding of the Unicode values on UTF-8:
	 *
	 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
	 * 0000 0000-0000 007F   0xxxxxxx
	 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
	 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
	 *
	 * Check for the 0x110000 limit too
	 */
	const unsigned char *cur = ctxt->input->cur;
	unsigned char c;
	unsigned int val;

	c = *cur;
	if (c & 0x80) {
	    /*
	     * Multi-byte sequence. Whenever the next byte reads as 0 more
	     * input is requested and cur is reloaded from ctxt->input->cur.
	     */
	    if (cur[1] == 0) {
		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                cur = ctxt->input->cur;
            }
	    /* every trailing byte must look like 10xxxxxx */
	    if ((cur[1] & 0xc0) != 0x80)
		goto encoding_error;
	    if ((c & 0xe0) == 0xe0) {

		if (cur[2] == 0) {
		    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                    cur = ctxt->input->cur;
                }
		if ((cur[2] & 0xc0) != 0x80)
		    goto encoding_error;
		if ((c & 0xf0) == 0xf0) {
		    if (cur[3] == 0) {
			xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                        cur = ctxt->input->cur;
                    }
		    /* lead byte must be exactly 11110xxx */
		    if (((c & 0xf8) != 0xf0) ||
			((cur[3] & 0xc0) != 0x80))
			goto encoding_error;
		    /* 4-byte code */
		    *len = 4;
		    val = (cur[0] & 0x7) << 18;
		    val |= (cur[1] & 0x3f) << 12;
		    val |= (cur[2] & 0x3f) << 6;
		    val |= cur[3] & 0x3f;
		} else {
		  /* 3-byte code */
		    *len = 3;
		    val = (cur[0] & 0xf) << 12;
		    val |= (cur[1] & 0x3f) << 6;
		    val |= cur[2] & 0x3f;
		}
	    } else {
	      /* 2-byte code */
		*len = 2;
		val = (cur[0] & 0x1f) << 6;
		val |= cur[1] & 0x3f;
	    }
	    /* An out-of-range value is reported but still returned. */
	    if (!IS_CHAR(val)) {
	        htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				"Char 0x%X out of allowed range\n", val);
	    }
	    return(val);
	} else {
            /*
             * A zero byte before the end of the input is reported and
             * handed to the caller as a space.
             */
            if ((*ctxt->input->cur == 0) &&
                (ctxt->input->cur < ctxt->input->end)) {
                    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				"Char 0x%X out of allowed range\n", 0);
                *len = 1;
                return(' ');
            }
	    /* 1-byte code */
	    *len = 1;
	    return((int) *ctxt->input->cur);
	}
    }
    /*
     * Assume it's a fixed length encoding (1) with
     * a compatible encoding for the ASCII set, since
     * XML constructs only use < 128 chars
     */
    *len = 1;
    if ((int) *ctxt->input->cur < 0x80)
	return((int) *ctxt->input->cur);

    /*
     * Hmm, this is bad: a non-ASCII byte with no known encoding. Do an
     * automatic flow conversion, using the guessed encoding if there is
     * one and ISO-Latin-1 otherwise.
     */
    {
        xmlChar * guess;
        xmlCharEncodingHandlerPtr handler;

        guess = htmlFindEncoding(ctxt);
        if (guess == NULL) {
            xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
        } else {
            /* the input takes ownership of the guessed string */
            if (ctxt->input->encoding != NULL)
                xmlFree((xmlChar *) ctxt->input->encoding);
            ctxt->input->encoding = guess;
            handler = xmlFindCharEncodingHandler((const char *) guess);
            if (handler != NULL) {
                xmlSwitchToEncoding(ctxt, handler);
            } else {
                htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
                             "Unsupported encoding %s", guess, NULL);
            }
        }
        /* from here on the charset is recorded as UTF-8 */
        ctxt->charset = XML_CHAR_ENCODING_UTF8;
    }

    return(xmlCurrentChar(ctxt, len));

encoding_error:
    /*
     * If we detect a UTF8 error that probably means that the
     * input encoding didn't get properly advertised in the
     * declaration header. Report the error and switch the encoding
     * to ISO-Latin-1 (if you don't like this policy, just declare the
     * encoding !)
     */
    {
        char buffer[150];

	/* dump four bytes only when that many are available */
	if (ctxt->input->end - ctxt->input->cur >= 4) {
	    snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
			    ctxt->input->cur[0], ctxt->input->cur[1],
			    ctxt->input->cur[2], ctxt->input->cur[3]);
	} else {
	    snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
	}
	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
		     "Input is not proper UTF-8, indicate encoding !\n",
		     BAD_CAST buffer, NULL);
    }

    /* fall back: treat the offending byte as a single Latin-1 char */
    ctxt->charset = XML_CHAR_ENCODING_8859_1;
    *len = 1;
    return((int) *ctxt->input->cur);
}
    560 
    561 /**
    562  * htmlSkipBlankChars:
    563  * @ctxt:  the HTML parser context
    564  *
    565  * skip all blanks character found at that point in the input streams.
    566  *
    567  * Returns the number of space chars skipped
    568  */
    569 
    570 static int
    571 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
    572     int res = 0;
    573 
    574     while (IS_BLANK_CH(*(ctxt->input->cur))) {
    575 	if ((*ctxt->input->cur == 0) &&
    576 	    (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
    577 		xmlPopInput(ctxt);
    578 	} else {
    579 	    if (*(ctxt->input->cur) == '\n') {
    580 		ctxt->input->line++; ctxt->input->col = 1;
    581 	    } else ctxt->input->col++;
    582 	    ctxt->input->cur++;
    583 	    ctxt->nbChars++;
    584 	    if (*ctxt->input->cur == 0)
    585 		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
    586 	}
    587 	res++;
    588     }
    589     return(res);
    590 }
    591 
    592 
    593 
    594 /************************************************************************
    595  *									*
    596  *	The list of HTML elements and their properties		*
    597  *									*
    598  ************************************************************************/
    599 
    600 /*
    601  *  Start Tag: 1 means the start tag can be ommited
    602  *  End Tag:   1 means the end tag can be ommited
    603  *             2 means it's forbidden (empty elements)
    604  *             3 means the tag is stylistic and should be closed easily
    605  *  Depr:      this element is deprecated
    606  *  DTD:       1 means that this element is valid only in the Loose DTD
    607  *             2 means that this element is valid only in the Frameset DTD
    608  *
    609  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
    610 	, subElements , impliedsubelt , Attributes, userdata
    611  */
    612 
    613 /* Definitions and a couple of vars for HTML Elements */
    614 
    615 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
    616 #define NB_FONTSTYLE 8
    617 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
    618 #define NB_PHRASE 10
    619 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
    620 #define NB_SPECIAL 16
    621 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
    622 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
    623 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
    624 #define NB_BLOCK NB_HEADING + NB_LIST + 14
    625 #define FORMCTRL "input", "select", "textarea", "label", "button"
    626 #define NB_FORMCTRL 5
    627 #define PCDATA
    628 #define NB_PCDATA 0
    629 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
    630 #define NB_HEADING 6
    631 #define LIST "ul", "ol", "dir", "menu"
    632 #define NB_LIST 4
    633 #define MODIFIER
    634 #define NB_MODIFIER 0
    635 #define FLOW BLOCK,INLINE
    636 #define NB_FLOW NB_BLOCK + NB_INLINE
    637 #define EMPTY NULL
    638 
    639 
/* NULL-terminated lists of the element names in the FLOW and INLINE groups */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements (the list is empty) */
static const char* const html_pcdata[] = { NULL } ;
/* html_cdata is an alias for the same empty list */
#define html_cdata html_pcdata
    646 
    647 
    648 /* ... and for HTML Attributes */
    649 
    650 #define COREATTRS "id", "class", "style", "title"
    651 #define NB_COREATTRS 4
    652 #define I18N "lang", "dir"
    653 #define NB_I18N 2
    654 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
    655 #define NB_EVENTS 9
    656 #define ATTRS COREATTRS,I18N,EVENTS
    657 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
    658 #define CELLHALIGN "align", "char", "charoff"
    659 #define NB_CELLHALIGN 3
    660 #define CELLVALIGN "valign"
    661 #define NB_CELLVALIGN 1
    662 
/* NULL-terminated attribute name lists built from the group macros */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
    667 
    668 
    669 /* Other declarations that should go inline ... */
/*
 * NULL-terminated name lists (attribute names for the *_attr / *_attrs /
 * *_depr arrays, element names for the *_contents / *_elt arrays).
 * NOTE(review): the table consuming them is outside this part of the
 * file; the per-element meaning is inferred from the array names.
 */
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
	"tabindex", "onfocus", "onblur", NULL } ;
static const char* const target_attr[] = { "target", NULL } ;
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* const alt_attr[] = { "alt", NULL } ;
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* const href_attrs[] = { "href", NULL } ;
static const char* const clear_attrs[] = { "clear", NULL } ;
static const char* const inline_p[] = { INLINE, "p", NULL } ;

static const char* const flow_param[] = { FLOW, "param", NULL } ;
static const char* const applet_attrs[] = { COREATTRS , "codebase",
		"archive", "alt", "name", "height", "width", "align",
		"hspace", "vspace", NULL } ;
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const basefont_attrs[] =
	{ "id", "size", "color", "face", NULL } ;
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* const body_depr[] = { "background", "bgcolor", "text",
	"link", "vlink", "alink", NULL } ;
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const col_elt[] = { "col", NULL } ;
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
static const char* const compact_attr[] = { "compact", NULL } ;
static const char* const label_attr[] = { "label", NULL } ;
    705 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
/*
 * More NULL-terminated name lists (attribute names for the *_attr /
 * *_attrs / *_depr arrays, element names for the *_contents / *_content /
 * *_elt arrays).
 * NOTE(review): the table consuming them is outside this part of the
 * file; the per-element meaning is inferred from the array names.
 */
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
/* MODIFIER expands to nothing, so this is "body" followed by FLOW */
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* const tr_elt[] = { "tr", NULL } ;
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
    757 static const char* const tr_contents[] = { "th", "td", NULL } ;
    758 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
    759 static const char* const li_elt[] = { "li", NULL } ;
    760 static const char* const ul_depr[] = { "type", "compact", NULL} ;
    761 static const char* const dir_attr[] = { "dir", NULL} ;
    762 
    763 #define DECL (const char**)
    764 
        /*
         * DECL casts the name lists declared as `static const char* const x[]`
         * (li_elt, align_attr, ...) to `const char**` so they can be used in the
         * initializers below.
         *
         * html40ElementTable holds one htmlElemDesc initializer per element,
         * with the entries listed in alphabetical order of the element name.
         * htmlTagLookup() scans the table linearly, comparing its argument with
         * the `name` member through xmlStrcasecmp(), and htmlAutoCloseOnClose()
         * tests the `endTag` member against 3 before reporting a tag mismatch.
         *
         * NOTE(review): what each of the seven integer columns and the five
         * trailing pointer columns means is fixed by the htmlElemDesc
         * declaration in libxml/HTMLparser.h, which is not visible from here --
         * check that header before editing or adding an entry.
         */
    765 static const htmlElemDesc
    766 html40ElementTable[] = {
    767 { "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
    768 	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
    769 },
    770 { "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
    771 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    772 },
    773 { "acronym",	0, 0, 0, 0, 0, 0, 1, "",
    774 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    775 },
    776 { "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
    777 	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
    778 },
    779 { "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
    780 	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
    781 },
    782 { "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
    783 	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
    784 },
    785 { "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
    786 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    787 },
    788 { "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
    789 	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
    790 },
    791 { "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
    792 	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
    793 },
    794 { "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
    795 	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
    796 },
    797 { "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
    798 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    799 },
    800 { "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
    801 	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
    802 },
    803 { "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
    804 	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
    805 },
    806 { "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
    807 	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
    808 },
    809 { "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
    810 	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
    811 },
    812 { "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
    813 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    814 },
    815 { "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
    816 	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
    817 },
    818 { "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
    819 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    820 },
    821 { "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
    822 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    823 },
    824 { "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
    825 	EMPTY , NULL , DECL col_attrs , NULL, NULL
    826 },
    827 { "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
    828 	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
    829 },
    830 { "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
    831 	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
    832 },
    833 { "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
    834 	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
    835 },
    836 { "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
    837 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
    838 },
    839 { "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
    840 	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
    841 },
    842 { "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
    843 	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
    844 },
    845 { "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
    846 	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
    847 },
    848 { "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
    849 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    850 },
    851 { "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
    852 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    853 },
    854 { "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
    855 	EMPTY, NULL, DECL embed_attrs, NULL, NULL
    856 },
    857 { "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
    858 	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
    859 },
    860 { "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
    861 	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
    862 },
    863 { "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
    864 	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
    865 },
    866 { "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
    867 	EMPTY, NULL, NULL, DECL frame_attrs, NULL
    868 },
    869 { "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
    870 	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
    871 },
    872 { "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
    873 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    874 },
    875 { "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
    876 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    877 },
    878 { "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
    879 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    880 },
    881 { "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
    882 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    883 },
    884 { "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
    885 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    886 },
    887 { "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
    888 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    889 },
    890 { "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
    891 	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
    892 },
    893 { "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
    894 	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
    895 },
    896 { "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
    897 	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
    898 },
    899 { "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
    900 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    901 },
    902 { "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
    903 	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
    904 },
    905 { "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
    906 	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
    907 },
    908 { "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
    909 	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
    910 },
    911 { "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
    912 	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
    913 },
    914 { "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
    915 	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
    916 },
    917 { "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
    918 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    919 },
    920 { "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
    921 	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
    922 },
    923 { "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
    924 	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
    925 },
    926 { "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
    927 	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
    928 },
    929 { "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
    930 	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
    931 },
    932 { "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
    933 	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
    934 },
    935 { "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
    936 	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
    937 },
    938 { "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
    939 	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
    940 },
    941 { "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
    942 	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
    943 },
    944 { "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
    945 	DECL html_flow, "div", DECL html_attrs, NULL, NULL
    946 },
    947 { "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
    948 	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
    949 },
    950 { "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
    951 	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
    952 },
    953 { "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
    954 	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
    955 },
    956 { "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
    957 	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
    958 },
    959 { "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
    960 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
    961 },
    962 { "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
    963 	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
    964 },
    965 { "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
    966 	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
    967 },
    968 { "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
    969 	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
    970 },
    971 { "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
    972 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
    973 },
    974 { "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
    975 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    976 },
    977 { "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
    978 	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
    979 },
    980 { "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
    981 	DECL select_content, NULL, DECL select_attrs, NULL, NULL
    982 },
    983 { "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
    984 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    985 },
    986 { "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
    987 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    988 },
    989 { "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
    990 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
    991 },
    992 { "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
    993 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
    994 },
    995 { "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
    996 	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
    997 },
    998 { "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
    999 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
   1000 },
   1001 { "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
   1002 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
   1003 },
   1004 { "table",	0, 0, 0, 0, 0, 0, 0, "",
   1005 	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
   1006 },
   1007 { "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
   1008 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
   1009 },
   1010 { "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
   1011 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
   1012 },
   1013 { "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
   1014 	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
   1015 },
   1016 { "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
   1017 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
   1018 },
   1019 { "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
   1020 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
   1021 },
   1022 { "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
   1023 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
   1024 },
   1025 { "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
   1026 	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
   1027 },
   1028 { "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
   1029 	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
   1030 },
   1031 { "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
   1032 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
   1033 },
   1034 { "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
   1035 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
   1036 },
   1037 { "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
   1038 	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
   1039 },
   1040 { "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
   1041 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
   1042 }
   1043 };
   1044 
   1045 /*
   1046  * start tags that imply the end of current element
         *
         * The array is a flat sequence of NULL-terminated groups.  In every
         * group the first string is a start tag and the strings that follow
         * are the open elements htmlCheckAutoClose() reports as closed by that
         * start tag.  One more NULL after the last group ends the whole table;
         * htmlInitAutoClose() stops indexing when it reaches it.
         *
         * NOTE(review): FONTSTYLE in the "p" group is a macro defined outside
         * this part of the file, presumably a comma-separated run of string
         * literals -- confirm before editing that group.
   1047  */
   1048 static const char * const htmlStartClose[] = {
   1049 "form",		"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
   1050 		"dl", "ul", "ol", "menu", "dir", "address", "pre",
   1051 		"listing", "xmp", "head", NULL,
   1052 "head",		"p", NULL,
   1053 "title",	"p", NULL,
   1054 "body",		"head", "style", "link", "title", "p", NULL,
   1055 "frameset",	"head", "style", "link", "title", "p", NULL,
   1056 "li",		"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
   1057 		"pre", "listing", "xmp", "head", "li", NULL,
   1058 "hr",		"p", "head", NULL,
   1059 "h1",		"p", "head", NULL,
   1060 "h2",		"p", "head", NULL,
   1061 "h3",		"p", "head", NULL,
   1062 "h4",		"p", "head", NULL,
   1063 "h5",		"p", "head", NULL,
   1064 "h6",		"p", "head", NULL,
   1065 "dir",		"p", "head", NULL,
   1066 "address",	"p", "head", "ul", NULL,
   1067 "pre",		"p", "head", "ul", NULL,
   1068 "listing",	"p", "head", NULL,
   1069 "xmp",		"p", "head", NULL,
   1070 "blockquote",	"p", "head", NULL,
   1071 "dl",		"p", "dt", "menu", "dir", "address", "pre", "listing",
   1072 		"xmp", "head", NULL,
   1073 "dt",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
   1074                 "head", "dd", NULL,
   1075 "dd",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
   1076                 "head", "dt", NULL,
   1077 "ul",		"p", "head", "ol", "menu", "dir", "address", "pre",
   1078 		"listing", "xmp", NULL,
   1079 "ol",		"p", "head", "ul", NULL,
   1080 "menu",		"p", "head", "ul", NULL,
   1081 "p",		"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
   1082 "div",		"p", "head", NULL,
   1083 "noscript",	"p", "head", NULL,
   1084 "center",	"font", "b", "i", "p", "head", NULL,
   1085 "a",		"a", NULL,
   1086 "caption",	"p", NULL,
   1087 "colgroup",	"caption", "colgroup", "col", "p", NULL,
   1088 "col",		"caption", "col", "p", NULL,
   1089 "table",	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
   1090 		"listing", "xmp", "a", NULL,
   1091 "th",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
   1092 "td",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
   1093 "tr",		"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
   1094 "thead",	"caption", "col", "colgroup", NULL,
   1095 "tfoot",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
   1096 		"tbody", "p", NULL,
   1097 "tbody",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
   1098 		"tfoot", "tbody", "p", NULL,
   1099 "optgroup",	"option", NULL,
   1100 "option",	"option", NULL,
   1101 "fieldset",	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
   1102 		"pre", "listing", "xmp", "a", NULL,
   1103 NULL
   1104 };
   1105 
   1106 /*
   1107  * The list of HTML elements which are supposed not to have
   1108  * CDATA content and where a p element will be implied
   1109  *
   1110  * TODO: extend that list by reading the HTML SGML DTD on
   1111  *       implied paragraph
   1112  */
   1113 static const char *const htmlNoContentElements[] = {
   1114     "html",
   1115     "head",
   1116     NULL
   1117 };
   1118 
   1119 /*
   1120  * The list of HTML attributes which are of content %Script;
   1121  * NOTE: when adding ones, check htmlIsScriptAttribute() since
   1122  *       it assumes the name starts with 'on'
   1123  */
   1124 static const char *const htmlScriptAttributes[] = {
   1125     "onclick",
   1126     "ondblclick",
   1127     "onmousedown",
   1128     "onmouseup",
   1129     "onmouseover",
   1130     "onmousemove",
   1131     "onmouseout",
   1132     "onkeypress",
   1133     "onkeydown",
   1134     "onkeyup",
   1135     "onload",
   1136     "onunload",
   1137     "onfocus",
   1138     "onblur",
   1139     "onsubmit",
   1140     "onrest",
   1141     "onchange",
   1142     "onselect"
   1143 };
   1144 
   1145 /*
   1146  * This table is used by the htmlparser to know what to do with
   1147  * broken html pages. By assigning different priorities to different
   1148  * elements the parser can decide how to handle extra endtags.
   1149  * Endtags are only allowed to close elements with lower or equal
   1150  * priority.
   1151  */
   1152 
   1153 typedef struct {
   1154     const char *name;
   1155     int priority;
   1156 } elementPriority;
   1157 
   1158 static const elementPriority htmlEndPriority[] = {
   1159     {"div",   150},
   1160     {"td",    160},
   1161     {"th",    160},
   1162     {"tr",    170},
   1163     {"thead", 180},
   1164     {"tbody", 180},
   1165     {"tfoot", 180},
   1166     {"table", 190},
   1167     {"head",  200},
   1168     {"body",  200},
   1169     {"html",  220},
   1170     {NULL,    100} /* Default priority */
   1171 };
   1172 
   1173 static const char** htmlStartCloseIndex[100];
   1174 static int htmlStartCloseIndexinitialized = 0;
   1175 
   1176 /************************************************************************
   1177  *									*
   1178  *	functions to handle HTML specific data			*
   1179  *									*
   1180  ************************************************************************/
   1181 
   1182 /**
   1183  * htmlInitAutoClose:
   1184  *
   1185  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
   1186  * This is not reentrant. Call xmlInitParser() once before processing in
   1187  * case of use in multithreaded programs.
   1188  */
   1189 void
   1190 htmlInitAutoClose(void) {
   1191     int indx, i = 0;
   1192 
   1193     if (htmlStartCloseIndexinitialized) return;
   1194 
   1195     for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
   1196     indx = 0;
   1197     while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
   1198         htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
   1199 	while (htmlStartClose[i] != NULL) i++;
   1200 	i++;
   1201     }
   1202     htmlStartCloseIndexinitialized = 1;
   1203 }
   1204 
   1205 /**
   1206  * htmlTagLookup:
   1207  * @tag:  The tag name in lowercase
   1208  *
   1209  * Lookup the HTML tag in the ElementTable
   1210  *
   1211  * Returns the related htmlElemDescPtr or NULL if not found.
   1212  */
   1213 const htmlElemDesc *
   1214 htmlTagLookup(const xmlChar *tag) {
   1215     unsigned int i;
   1216 
   1217     for (i = 0; i < (sizeof(html40ElementTable) /
   1218                      sizeof(html40ElementTable[0]));i++) {
   1219         if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
   1220 	    return((htmlElemDescPtr) &html40ElementTable[i]);
   1221     }
   1222     return(NULL);
   1223 }
   1224 
   1225 /**
   1226  * htmlGetEndPriority:
   1227  * @name: The name of the element to look up the priority for.
   1228  *
   1229  * Return value: The "endtag" priority.
   1230  **/
   1231 static int
   1232 htmlGetEndPriority (const xmlChar *name) {
   1233     int i = 0;
   1234 
   1235     while ((htmlEndPriority[i].name != NULL) &&
   1236 	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
   1237 	i++;
   1238 
   1239     return(htmlEndPriority[i].priority);
   1240 }
   1241 
   1242 
   1243 /**
   1244  * htmlCheckAutoClose:
   1245  * @newtag:  The new tag name
   1246  * @oldtag:  The old tag name
   1247  *
   1248  * Checks whether the new tag is one of the registered valid tags for
   1249  * closing old.
   1250  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
   1251  *
   1252  * Returns 0 if no, 1 if yes.
   1253  */
   1254 static int
   1255 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
   1256 {
   1257     int i, indx;
   1258     const char **closed = NULL;
   1259 
   1260     if (htmlStartCloseIndexinitialized == 0)
   1261         htmlInitAutoClose();
   1262 
   1263     /* inefficient, but not a big deal */
   1264     for (indx = 0; indx < 100; indx++) {
   1265         closed = htmlStartCloseIndex[indx];
   1266         if (closed == NULL)
   1267             return (0);
   1268         if (xmlStrEqual(BAD_CAST * closed, newtag))
   1269             break;
   1270     }
   1271 
   1272     i = closed - htmlStartClose;
   1273     i++;
   1274     while (htmlStartClose[i] != NULL) {
   1275         if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
   1276             return (1);
   1277         }
   1278         i++;
   1279     }
   1280     return (0);
   1281 }
   1282 
   1283 /**
   1284  * htmlAutoCloseOnClose:
   1285  * @ctxt:  an HTML parser context
   1286  * @newtag:  The new tag name
   1287  * @force:  force the tag closure
   1288  *
   1289  * The HTML DTD allows an ending tag to implicitly close other tags.
   1290  */
   1291 static void
   1292 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
   1293 {
   1294     const htmlElemDesc *info;
   1295     int i, priority;
   1296 
   1297     priority = htmlGetEndPriority(newtag);
   1298 
   1299     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
   1300 
   1301         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
   1302             break;
   1303         /*
   1304          * A missplaced endtag can only close elements with lower
   1305          * or equal priority, so if we find an element with higher
   1306          * priority before we find an element with
   1307          * matching name, we just ignore this endtag
   1308          */
   1309         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
   1310             return;
   1311     }
   1312     if (i < 0)
   1313         return;
   1314 
   1315     while (!xmlStrEqual(newtag, ctxt->name)) {
   1316         info = htmlTagLookup(ctxt->name);
   1317         if ((info != NULL) && (info->endTag == 3)) {
   1318             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
   1319 	                 "Opening and ending tag mismatch: %s and %s\n",
   1320 			 newtag, ctxt->name);
   1321         }
   1322         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   1323             ctxt->sax->endElement(ctxt->userData, ctxt->name);
   1324 	htmlnamePop(ctxt);
   1325     }
   1326 }
   1327 
   1328 /**
   1329  * htmlAutoCloseOnEnd:
   1330  * @ctxt:  an HTML parser context
   1331  *
   1332  * Close all remaining tags at the end of the stream
   1333  */
   1334 static void
   1335 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
   1336 {
   1337     int i;
   1338 
   1339     if (ctxt->nameNr == 0)
   1340         return;
   1341     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
   1342         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   1343             ctxt->sax->endElement(ctxt->userData, ctxt->name);
   1344 	htmlnamePop(ctxt);
   1345     }
   1346 }
   1347 
   1348 /**
   1349  * htmlAutoClose:
   1350  * @ctxt:  an HTML parser context
   1351  * @newtag:  The new tag name or NULL
   1352  *
   1353  * The HTML DTD allows a tag to implicitly close other tags.
   1354  * The list is kept in htmlStartClose array. This function is
   1355  * called when a new tag has been detected and generates the
   1356  * appropriates closes if possible/needed.
   1357  * If newtag is NULL this mean we are at the end of the resource
   1358  * and we should check
   1359  */
   1360 static void
   1361 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
   1362 {
   1363     while ((newtag != NULL) && (ctxt->name != NULL) &&
   1364            (htmlCheckAutoClose(newtag, ctxt->name))) {
   1365         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   1366             ctxt->sax->endElement(ctxt->userData, ctxt->name);
   1367 	htmlnamePop(ctxt);
   1368     }
   1369     if (newtag == NULL) {
   1370         htmlAutoCloseOnEnd(ctxt);
   1371         return;
   1372     }
   1373     while ((newtag == NULL) && (ctxt->name != NULL) &&
   1374            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
   1375             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
   1376             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
   1377         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   1378             ctxt->sax->endElement(ctxt->userData, ctxt->name);
   1379 	htmlnamePop(ctxt);
   1380     }
   1381 }
   1382 
   1383 /**
   1384  * htmlAutoCloseTag:
   1385  * @doc:  the HTML document
   1386  * @name:  The tag name
   1387  * @elem:  the HTML element
   1388  *
   1389  * The HTML DTD allows a tag to implicitly close other tags.
   1390  * The list is kept in htmlStartClose array. This function checks
   1391  * if the element or one of it's children would autoclose the
   1392  * given tag.
   1393  *
   1394  * Returns 1 if autoclose, 0 otherwise
   1395  */
   1396 int
   1397 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
   1398     htmlNodePtr child;
   1399 
   1400     if (elem == NULL) return(1);
   1401     if (xmlStrEqual(name, elem->name)) return(0);
   1402     if (htmlCheckAutoClose(elem->name, name)) return(1);
   1403     child = elem->children;
   1404     while (child != NULL) {
   1405         if (htmlAutoCloseTag(doc, name, child)) return(1);
   1406 	child = child->next;
   1407     }
   1408     return(0);
   1409 }
   1410 
   1411 /**
   1412  * htmlIsAutoClosed:
   1413  * @doc:  the HTML document
   1414  * @elem:  the HTML element
   1415  *
   1416  * The HTML DTD allows a tag to implicitly close other tags.
   1417  * The list is kept in htmlStartClose array. This function checks
   1418  * if a tag is autoclosed by one of it's child
   1419  *
   1420  * Returns 1 if autoclosed, 0 otherwise
   1421  */
   1422 int
   1423 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
   1424     htmlNodePtr child;
   1425 
   1426     if (elem == NULL) return(1);
   1427     child = elem->children;
   1428     while (child != NULL) {
   1429 	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
   1430 	child = child->next;
   1431     }
   1432     return(0);
   1433 }
   1434 
   1435 /**
   1436  * htmlCheckImplied:
   1437  * @ctxt:  an HTML parser context
   1438  * @newtag:  The new tag name
   1439  *
   1440  * The HTML DTD allows a tag to exists only implicitly
   1441  * called when a new tag has been detected and generates the
   1442  * appropriates implicit tags if missing
   1443  */
   1444 static void
   1445 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
   1446     int i;
   1447 
   1448     if (ctxt->options & HTML_PARSE_NOIMPLIED)
   1449         return;
   1450     if (!htmlOmittedDefaultValue)
   1451 	return;
   1452     if (xmlStrEqual(newtag, BAD_CAST"html"))
   1453 	return;
   1454     if (ctxt->nameNr <= 0) {
   1455 	htmlnamePush(ctxt, BAD_CAST"html");
   1456 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1457 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
   1458     }
   1459     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
   1460         return;
   1461     if ((ctxt->nameNr <= 1) &&
   1462         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
   1463 	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
   1464 	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
   1465 	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
   1466 	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
   1467 	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
   1468         if (ctxt->html >= 3) {
   1469             /* we already saw or generated an <head> before */
   1470             return;
   1471         }
   1472         /*
   1473          * dropped OBJECT ... i you put it first BODY will be
   1474          * assumed !
   1475          */
   1476         htmlnamePush(ctxt, BAD_CAST"head");
   1477         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1478             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
   1479     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
   1480 	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
   1481 	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
   1482         if (ctxt->html >= 10) {
   1483             /* we already saw or generated a <body> before */
   1484             return;
   1485         }
   1486 	for (i = 0;i < ctxt->nameNr;i++) {
   1487 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
   1488 		return;
   1489 	    }
   1490 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
   1491 		return;
   1492 	    }
   1493 	}
   1494 
   1495 	htmlnamePush(ctxt, BAD_CAST"body");
   1496 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1497 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
   1498     }
   1499 }
   1500 
   1501 /**
   1502  * htmlCheckParagraph
   1503  * @ctxt:  an HTML parser context
   1504  *
   1505  * Check whether a p element need to be implied before inserting
   1506  * characters in the current element.
   1507  *
   1508  * Returns 1 if a paragraph has been inserted, 0 if not and -1
   1509  *         in case of error.
   1510  */
   1511 
   1512 static int
   1513 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
   1514     const xmlChar *tag;
   1515     int i;
   1516 
   1517     if (ctxt == NULL)
   1518 	return(-1);
   1519     tag = ctxt->name;
   1520     if (tag == NULL) {
   1521 	htmlAutoClose(ctxt, BAD_CAST"p");
   1522 	htmlCheckImplied(ctxt, BAD_CAST"p");
   1523 	htmlnamePush(ctxt, BAD_CAST"p");
   1524 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1525 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
   1526 	return(1);
   1527     }
   1528     if (!htmlOmittedDefaultValue)
   1529 	return(0);
   1530     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
   1531 	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
   1532 	    htmlAutoClose(ctxt, BAD_CAST"p");
   1533 	    htmlCheckImplied(ctxt, BAD_CAST"p");
   1534 	    htmlnamePush(ctxt, BAD_CAST"p");
   1535 	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
   1536 		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
   1537 	    return(1);
   1538 	}
   1539     }
   1540     return(0);
   1541 }
   1542 
   1543 /**
   1544  * htmlIsScriptAttribute:
   1545  * @name:  an attribute name
   1546  *
   1547  * Check if an attribute is of content type Script
   1548  *
   1549  * Returns 1 is the attribute is a script 0 otherwise
   1550  */
   1551 int
   1552 htmlIsScriptAttribute(const xmlChar *name) {
   1553     unsigned int i;
   1554 
   1555     if (name == NULL)
   1556       return(0);
   1557     /*
   1558      * all script attributes start with 'on'
   1559      */
   1560     if ((name[0] != 'o') || (name[1] != 'n'))
   1561       return(0);
   1562     for (i = 0;
   1563 	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
   1564 	 i++) {
   1565 	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
   1566 	    return(1);
   1567     }
   1568     return(0);
   1569 }
   1570 
   1571 /************************************************************************
   1572  *									*
   1573  *	The list of HTML predefined entities			*
   1574  *									*
   1575  ************************************************************************/
   1576 
   1577 
   1578 static const htmlEntityDesc  html40EntitiesTable[] = {
   1579 /*
   1580  * the 4 absolute ones, plus apostrophe.
   1581  */
   1582 { 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
   1583 { 38,	"amp",	"ampersand, U+0026 ISOnum" },
   1584 { 39,	"apos",	"single quote" },
   1585 { 60,	"lt",	"less-than sign, U+003C ISOnum" },
   1586 { 62,	"gt",	"greater-than sign, U+003E ISOnum" },
   1587 
   1588 /*
   1589  * A bunch still in the 128-255 range
   1590  * Replacing them depend really on the charset used.
   1591  */
   1592 { 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
   1593 { 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
   1594 { 162,	"cent",	"cent sign, U+00A2 ISOnum" },
   1595 { 163,	"pound","pound sign, U+00A3 ISOnum" },
   1596 { 164,	"curren","currency sign, U+00A4 ISOnum" },
   1597 { 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
   1598 { 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
   1599 { 167,	"sect",	"section sign, U+00A7 ISOnum" },
   1600 { 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
   1601 { 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
   1602 { 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
   1603 { 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
   1604 { 172,	"not",	"not sign, U+00AC ISOnum" },
   1605 { 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
   1606 { 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
   1607 { 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
   1608 { 176,	"deg",	"degree sign, U+00B0 ISOnum" },
   1609 { 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
   1610 { 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
   1611 { 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
   1612 { 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
   1613 { 181,	"micro","micro sign, U+00B5 ISOnum" },
   1614 { 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
   1615 { 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
   1616 { 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
   1617 { 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
   1618 { 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
   1619 { 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
   1620 { 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
   1621 { 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
   1622 { 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
   1623 { 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
   1624 { 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
   1625 { 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
   1626 { 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
   1627 { 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
   1628 { 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
   1629 { 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
   1630 { 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
   1631 { 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
   1632 { 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
   1633 { 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
   1634 { 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
   1635 { 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
   1636 { 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
   1637 { 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
   1638 { 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
   1639 { 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
   1640 { 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
   1641 { 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
   1642 { 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
   1643 { 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
   1644 { 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
   1645 { 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
   1646 { 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
   1647 { 215,	"times","multiplication sign, U+00D7 ISOnum" },
   1648 { 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
   1649 { 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
   1650 { 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
   1651 { 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
   1652 { 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
   1653 { 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
   1654 { 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
   1655 { 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
   1656 { 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
   1657 { 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
   1658 { 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
   1659 { 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
   1660 { 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
   1661 { 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
   1662 { 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
   1663 { 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
   1664 { 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
   1665 { 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
   1666 { 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
   1667 { 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
   1668 { 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
   1669 { 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
   1670 { 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
   1671 { 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
   1672 { 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
   1673 { 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
   1674 { 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
   1675 { 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
   1676 { 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
   1677 { 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
   1678 { 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
   1679 { 247,	"divide","division sign, U+00F7 ISOnum" },
   1680 { 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
   1681 { 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
   1682 { 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
   1683 { 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
   1684 { 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
   1685 { 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
   1686 { 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
   1687 { 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },
   1688 
   1689 { 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
   1690 { 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
   1691 { 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
   1692 { 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
   1693 { 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },
   1694 
   1695 /*
   1696  * Anything below should really be kept as entities references
   1697  */
   1698 { 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },
   1699 
   1700 { 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
   1701 { 732,	"tilde","small tilde, U+02DC ISOdia" },
   1702 
   1703 { 913,	"Alpha","greek capital letter alpha, U+0391" },
   1704 { 914,	"Beta",	"greek capital letter beta, U+0392" },
   1705 { 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
   1706 { 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
   1707 { 917,	"Epsilon","greek capital letter epsilon, U+0395" },
   1708 { 918,	"Zeta",	"greek capital letter zeta, U+0396" },
   1709 { 919,	"Eta",	"greek capital letter eta, U+0397" },
   1710 { 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
   1711 { 921,	"Iota",	"greek capital letter iota, U+0399" },
   1712 { 922,	"Kappa","greek capital letter kappa, U+039A" },
   1713 { 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
   1714 { 924,	"Mu",	"greek capital letter mu, U+039C" },
   1715 { 925,	"Nu",	"greek capital letter nu, U+039D" },
   1716 { 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
   1717 { 927,	"Omicron","greek capital letter omicron, U+039F" },
   1718 { 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
   1719 { 929,	"Rho",	"greek capital letter rho, U+03A1" },
   1720 { 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
   1721 { 932,	"Tau",	"greek capital letter tau, U+03A4" },
   1722 { 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
   1723 { 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
   1724 { 935,	"Chi",	"greek capital letter chi, U+03A7" },
   1725 { 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
   1726 { 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },
   1727 
   1728 { 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
   1729 { 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
   1730 { 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
   1731 { 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
   1732 { 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
   1733 { 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
   1734 { 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
   1735 { 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
   1736 { 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
   1737 { 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
   1738 { 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
   1739 { 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
   1740 { 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
   1741 { 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
   1742 { 959,	"omicron","greek small letter omicron, U+03BF NEW" },
   1743 { 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
   1744 { 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
   1745 { 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
   1746 { 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
   1747 { 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
   1748 { 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
   1749 { 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
   1750 { 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
   1751 { 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
   1752 { 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
   1753 { 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
   1754 { 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
   1755 { 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },
   1756 
   1757 { 8194,	"ensp",	"en space, U+2002 ISOpub" },
   1758 { 8195,	"emsp",	"em space, U+2003 ISOpub" },
   1759 { 8201,	"thinsp","thin space, U+2009 ISOpub" },
   1760 { 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
   1761 { 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
   1762 { 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
   1763 { 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
   1764 { 8211,	"ndash","en dash, U+2013 ISOpub" },
   1765 { 8212,	"mdash","em dash, U+2014 ISOpub" },
   1766 { 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
   1767 { 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
   1768 { 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
   1769 { 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
   1770 { 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
   1771 { 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
   1772 { 8224,	"dagger","dagger, U+2020 ISOpub" },
   1773 { 8225,	"Dagger","double dagger, U+2021 ISOpub" },
   1774 
   1775 { 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
   1776 { 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
   1777 
   1778 { 8240,	"permil","per mille sign, U+2030 ISOtech" },
   1779 
   1780 { 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
   1781 { 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },
   1782 
   1783 { 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
   1784 { 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
   1785 
   1786 { 8254,	"oline","overline = spacing overscore, U+203E NEW" },
   1787 { 8260,	"frasl","fraction slash, U+2044 NEW" },
   1788 
   1789 { 8364,	"euro",	"euro sign, U+20AC NEW" },
   1790 
   1791 { 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
   1792 { 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
   1793 { 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
   1794 { 8482,	"trade","trade mark sign, U+2122 ISOnum" },
   1795 { 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
   1796 { 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
   1797 { 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
   1798 { 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
   1799 { 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
   1800 { 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
   1801 { 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
   1802 { 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
   1803 { 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
   1804 { 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
   1805 { 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
   1806 { 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },
   1807 
   1808 { 8704,	"forall","for all, U+2200 ISOtech" },
   1809 { 8706,	"part",	"partial differential, U+2202 ISOtech" },
   1810 { 8707,	"exist","there exists, U+2203 ISOtech" },
   1811 { 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
   1812 { 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
   1813 { 8712,	"isin",	"element of, U+2208 ISOtech" },
   1814 { 8713,	"notin","not an element of, U+2209 ISOtech" },
   1815 { 8715,	"ni",	"contains as member, U+220B ISOtech" },
   1816 { 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
   1817 { 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
   1818 { 8722,	"minus","minus sign, U+2212 ISOtech" },
   1819 { 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
   1820 { 8730,	"radic","square root = radical sign, U+221A ISOtech" },
   1821 { 8733,	"prop",	"proportional to, U+221D ISOtech" },
   1822 { 8734,	"infin","infinity, U+221E ISOtech" },
   1823 { 8736,	"ang",	"angle, U+2220 ISOamso" },
   1824 { 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
   1825 { 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
   1826 { 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
   1827 { 8746,	"cup",	"union = cup, U+222A ISOtech" },
   1828 { 8747,	"int",	"integral, U+222B ISOtech" },
   1829 { 8756,	"there4","therefore, U+2234 ISOtech" },
   1830 { 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
   1831 { 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
   1832 { 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
   1833 { 8800,	"ne",	"not equal to, U+2260 ISOtech" },
   1834 { 8801,	"equiv","identical to, U+2261 ISOtech" },
   1835 { 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
   1836 { 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
   1837 { 8834,	"sub",	"subset of, U+2282 ISOtech" },
   1838 { 8835,	"sup",	"superset of, U+2283 ISOtech" },
   1839 { 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
   1840 { 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
   1841 { 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
   1842 { 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
   1843 { 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
   1844 { 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
   1845 { 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
   1846 { 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
   1847 { 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
   1848 { 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
   1849 { 8971,	"rfloor","right floor, U+230B ISOamsc" },
   1850 { 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
   1851 { 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
   1852 { 9674,	"loz",	"lozenge, U+25CA ISOpub" },
   1853 
   1854 { 9824,	"spades","black spade suit, U+2660 ISOpub" },
   1855 { 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
   1856 { 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
   1857 { 9830,	"diams","black diamond suit, U+2666 ISOpub" },
   1858 
   1859 };
   1860 
   1861 /************************************************************************
   1862  *									*
   1863  *		Commodity functions to handle entities			*
   1864  *									*
   1865  ************************************************************************/
   1866 
   1867 /*
   1868  * Macro used to grow the current buffer.
   1869  */
   1870 #define growBuffer(buffer) {						\
   1871     xmlChar *tmp;							\
   1872     buffer##_size *= 2;							\
   1873     tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
   1874     if (tmp == NULL) {						\
   1875 	htmlErrMemory(ctxt, "growing buffer\n");			\
   1876 	xmlFree(buffer);						\
   1877 	return(NULL);							\
   1878     }									\
   1879     buffer = tmp;							\
   1880 }
   1881 
   1882 /**
   1883  * htmlEntityLookup:
   1884  * @name: the entity name
   1885  *
   1886  * Lookup the given entity in EntitiesTable
   1887  *
   1888  * TODO: the linear scan is really ugly, an hash table is really needed.
   1889  *
   1890  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
   1891  */
   1892 const htmlEntityDesc *
   1893 htmlEntityLookup(const xmlChar *name) {
   1894     unsigned int i;
   1895 
   1896     for (i = 0;i < (sizeof(html40EntitiesTable)/
   1897                     sizeof(html40EntitiesTable[0]));i++) {
   1898         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
   1899             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
   1900 	}
   1901     }
   1902     return(NULL);
   1903 }
   1904 
   1905 /**
   1906  * htmlEntityValueLookup:
   1907  * @value: the entity's unicode value
   1908  *
   1909  * Lookup the given entity in EntitiesTable
   1910  *
   1911  * TODO: the linear scan is really ugly, an hash table is really needed.
   1912  *
   1913  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
   1914  */
   1915 const htmlEntityDesc *
   1916 htmlEntityValueLookup(unsigned int value) {
   1917     unsigned int i;
   1918 
   1919     for (i = 0;i < (sizeof(html40EntitiesTable)/
   1920                     sizeof(html40EntitiesTable[0]));i++) {
   1921         if (html40EntitiesTable[i].value >= value) {
   1922 	    if (html40EntitiesTable[i].value > value)
   1923 		break;
   1924             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
   1925 	}
   1926     }
   1927     return(NULL);
   1928 }
   1929 
   1930 /**
   1931  * UTF8ToHtml:
   1932  * @out:  a pointer to an array of bytes to store the result
   1933  * @outlen:  the length of @out
   1934  * @in:  a pointer to an array of UTF-8 chars
   1935  * @inlen:  the length of @in
   1936  *
   1937  * Take a block of UTF-8 chars in and try to convert it to an ASCII
   1938  * plus HTML entities block of chars out.
   1939  *
   1940  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
   1941  * The value of @inlen after return is the number of octets consumed
   1942  *     as the return value is positive, else unpredictable.
   1943  * The value of @outlen after return is the number of octets consumed.
   1944  */
   1945 int
   1946 UTF8ToHtml(unsigned char* out, int *outlen,
   1947               const unsigned char* in, int *inlen) {
   1948     const unsigned char* processed = in;
   1949     const unsigned char* outend;
   1950     const unsigned char* outstart = out;
   1951     const unsigned char* instart = in;
   1952     const unsigned char* inend;
   1953     unsigned int c, d;
   1954     int trailing;
   1955 
   1956     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
   1957     if (in == NULL) {
   1958         /*
   1959 	 * initialization nothing to do
   1960 	 */
   1961 	*outlen = 0;
   1962 	*inlen = 0;
   1963 	return(0);
   1964     }
   1965     inend = in + (*inlen);
   1966     outend = out + (*outlen);
   1967     while (in < inend) {
   1968 	d = *in++;
   1969 	if      (d < 0x80)  { c= d; trailing= 0; }
   1970 	else if (d < 0xC0) {
   1971 	    /* trailing byte in leading position */
   1972 	    *outlen = out - outstart;
   1973 	    *inlen = processed - instart;
   1974 	    return(-2);
   1975         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
   1976         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
   1977         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
   1978 	else {
   1979 	    /* no chance for this in Ascii */
   1980 	    *outlen = out - outstart;
   1981 	    *inlen = processed - instart;
   1982 	    return(-2);
   1983 	}
   1984 
   1985 	if (inend - in < trailing) {
   1986 	    break;
   1987 	}
   1988 
   1989 	for ( ; trailing; trailing--) {
   1990 	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
   1991 		break;
   1992 	    c <<= 6;
   1993 	    c |= d & 0x3F;
   1994 	}
   1995 
   1996 	/* assertion: c is a single UTF-4 value */
   1997 	if (c < 0x80) {
   1998 	    if (out + 1 >= outend)
   1999 		break;
   2000 	    *out++ = c;
   2001 	} else {
   2002 	    int len;
   2003 	    const htmlEntityDesc * ent;
   2004 	    const char *cp;
   2005 	    char nbuf[16];
   2006 
   2007 	    /*
   2008 	     * Try to lookup a predefined HTML entity for it
   2009 	     */
   2010 
   2011 	    ent = htmlEntityValueLookup(c);
   2012 	    if (ent == NULL) {
   2013 	      snprintf(nbuf, sizeof(nbuf), "#%u", c);
   2014 	      cp = nbuf;
   2015 	    }
   2016 	    else
   2017 	      cp = ent->name;
   2018 	    len = strlen(cp);
   2019 	    if (out + 2 + len >= outend)
   2020 		break;
   2021 	    *out++ = '&';
   2022 	    memcpy(out, cp, len);
   2023 	    out += len;
   2024 	    *out++ = ';';
   2025 	}
   2026 	processed = in;
   2027     }
   2028     *outlen = out - outstart;
   2029     *inlen = processed - instart;
   2030     return(0);
   2031 }
   2032 
   2033 /**
   2034  * htmlEncodeEntities:
   2035  * @out:  a pointer to an array of bytes to store the result
   2036  * @outlen:  the length of @out
   2037  * @in:  a pointer to an array of UTF-8 chars
   2038  * @inlen:  the length of @in
   2039  * @quoteChar: the quote character to escape (' or ") or zero.
   2040  *
   2041  * Take a block of UTF-8 chars in and try to convert it to an ASCII
   2042  * plus HTML entities block of chars out.
   2043  *
   2044  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
   2045  * The value of @inlen after return is the number of octets consumed
   2046  *     as the return value is positive, else unpredictable.
   2047  * The value of @outlen after return is the number of octets consumed.
   2048  */
   2049 int
   2050 htmlEncodeEntities(unsigned char* out, int *outlen,
   2051 		   const unsigned char* in, int *inlen, int quoteChar) {
   2052     const unsigned char* processed = in;
   2053     const unsigned char* outend;
   2054     const unsigned char* outstart = out;
   2055     const unsigned char* instart = in;
   2056     const unsigned char* inend;
   2057     unsigned int c, d;
   2058     int trailing;
   2059 
   2060     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
   2061         return(-1);
   2062     outend = out + (*outlen);
   2063     inend = in + (*inlen);
   2064     while (in < inend) {
   2065 	d = *in++;
   2066 	if      (d < 0x80)  { c= d; trailing= 0; }
   2067 	else if (d < 0xC0) {
   2068 	    /* trailing byte in leading position */
   2069 	    *outlen = out - outstart;
   2070 	    *inlen = processed - instart;
   2071 	    return(-2);
   2072         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
   2073         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
   2074         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
   2075 	else {
   2076 	    /* no chance for this in Ascii */
   2077 	    *outlen = out - outstart;
   2078 	    *inlen = processed - instart;
   2079 	    return(-2);
   2080 	}
   2081 
   2082 	if (inend - in < trailing)
   2083 	    break;
   2084 
   2085 	while (trailing--) {
   2086 	    if (((d= *in++) & 0xC0) != 0x80) {
   2087 		*outlen = out - outstart;
   2088 		*inlen = processed - instart;
   2089 		return(-2);
   2090 	    }
   2091 	    c <<= 6;
   2092 	    c |= d & 0x3F;
   2093 	}
   2094 
   2095 	/* assertion: c is a single UTF-4 value */
   2096 	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
   2097 	    (c != '&') && (c != '<') && (c != '>')) {
   2098 	    if (out >= outend)
   2099 		break;
   2100 	    *out++ = c;
   2101 	} else {
   2102 	    const htmlEntityDesc * ent;
   2103 	    const char *cp;
   2104 	    char nbuf[16];
   2105 	    int len;
   2106 
   2107 	    /*
   2108 	     * Try to lookup a predefined HTML entity for it
   2109 	     */
   2110 	    ent = htmlEntityValueLookup(c);
   2111 	    if (ent == NULL) {
   2112 		snprintf(nbuf, sizeof(nbuf), "#%u", c);
   2113 		cp = nbuf;
   2114 	    }
   2115 	    else
   2116 		cp = ent->name;
   2117 	    len = strlen(cp);
   2118 	    if (out + 2 + len > outend)
   2119 		break;
   2120 	    *out++ = '&';
   2121 	    memcpy(out, cp, len);
   2122 	    out += len;
   2123 	    *out++ = ';';
   2124 	}
   2125 	processed = in;
   2126     }
   2127     *outlen = out - outstart;
   2128     *inlen = processed - instart;
   2129     return(0);
   2130 }
   2131 
   2132 /************************************************************************
   2133  *									*
   2134  *		Commodity functions to handle streams			*
   2135  *									*
   2136  ************************************************************************/
   2137 
   2138 /**
   2139  * htmlNewInputStream:
   2140  * @ctxt:  an HTML parser context
   2141  *
   2142  * Create a new input stream structure
   2143  * Returns the new input stream or NULL
   2144  */
   2145 static htmlParserInputPtr
   2146 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
   2147     htmlParserInputPtr input;
   2148 
   2149     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
   2150     if (input == NULL) {
   2151         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
   2152 	return(NULL);
   2153     }
   2154     memset(input, 0, sizeof(htmlParserInput));
   2155     input->filename = NULL;
   2156     input->directory = NULL;
   2157     input->base = NULL;
   2158     input->cur = NULL;
   2159     input->buf = NULL;
   2160     input->line = 1;
   2161     input->col = 1;
   2162     input->buf = NULL;
   2163     input->free = NULL;
   2164     input->version = NULL;
   2165     input->consumed = 0;
   2166     input->length = 0;
   2167     return(input);
   2168 }
   2169 
   2170 
   2171 /************************************************************************
   2172  *									*
   2173  *		Commodity functions, cleanup needed ?			*
   2174  *									*
   2175  ************************************************************************/
   2176 /*
   2177  * all tags allowing pc data from the html 4.01 loose dtd
   2178  * NOTE: it might be more apropriate to integrate this information
   2179  * into the html40ElementTable array but I don't want to risk any
   2180  * binary incomptibility
   2181  */
   2182 static const char *allowPCData[] = {
   2183     "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
   2184     "blockquote", "body", "button", "caption", "center", "cite", "code",
   2185     "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
   2186     "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
   2187     "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
   2188     "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
   2189 };
   2190 
   2191 /**
   2192  * areBlanks:
   2193  * @ctxt:  an HTML parser context
   2194  * @str:  a xmlChar *
   2195  * @len:  the size of @str
   2196  *
   2197  * Is this a sequence of blank chars that one can ignore ?
   2198  *
   2199  * Returns 1 if ignorable 0 otherwise.
   2200  */
   2201 
   2202 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
   2203     unsigned int i;
   2204     int j;
   2205     xmlNodePtr lastChild;
   2206     xmlDtdPtr dtd;
   2207 
   2208     for (j = 0;j < len;j++)
   2209         if (!(IS_BLANK_CH(str[j]))) return(0);
   2210 
   2211     if (CUR == 0) return(1);
   2212     if (CUR != '<') return(0);
   2213     if (ctxt->name == NULL)
   2214 	return(1);
   2215     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
   2216 	return(1);
   2217     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
   2218 	return(1);
   2219 
   2220     /* Only strip CDATA children of the body tag for strict HTML DTDs */
   2221     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
   2222         dtd = xmlGetIntSubset(ctxt->myDoc);
   2223         if (dtd != NULL && dtd->ExternalID != NULL) {
   2224             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
   2225                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
   2226                 return(1);
   2227         }
   2228     }
   2229 
   2230     if (ctxt->node == NULL) return(0);
   2231     lastChild = xmlGetLastChild(ctxt->node);
   2232     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
   2233 	lastChild = lastChild->prev;
   2234     if (lastChild == NULL) {
   2235         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
   2236             (ctxt->node->content != NULL)) return(0);
   2237 	/* keep ws in constructs like ...<b> </b>...
   2238 	   for all tags "b" allowing PCDATA */
   2239 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
   2240 	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
   2241 		return(0);
   2242 	    }
   2243 	}
   2244     } else if (xmlNodeIsText(lastChild)) {
   2245         return(0);
   2246     } else {
   2247 	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
   2248 	   for all tags "p" allowing PCDATA */
   2249 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
   2250 	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
   2251 		return(0);
   2252 	    }
   2253 	}
   2254     }
   2255     return(1);
   2256 }
   2257 
   2258 /**
   2259  * htmlNewDocNoDtD:
   2260  * @URI:  URI for the dtd, or NULL
   2261  * @ExternalID:  the external ID of the DTD, or NULL
   2262  *
   2263  * Creates a new HTML document without a DTD node if @URI and @ExternalID
   2264  * are NULL
   2265  *
   2266  * Returns a new document, do not initialize the DTD if not provided
   2267  */
   2268 htmlDocPtr
   2269 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
   2270     xmlDocPtr cur;
   2271 
   2272     /*
   2273      * Allocate a new document and fill the fields.
   2274      */
   2275     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
   2276     if (cur == NULL) {
   2277 	htmlErrMemory(NULL, "HTML document creation failed\n");
   2278 	return(NULL);
   2279     }
   2280     memset(cur, 0, sizeof(xmlDoc));
   2281 
   2282     cur->type = XML_HTML_DOCUMENT_NODE;
   2283     cur->version = NULL;
   2284     cur->intSubset = NULL;
   2285     cur->doc = cur;
   2286     cur->name = NULL;
   2287     cur->children = NULL;
   2288     cur->extSubset = NULL;
   2289     cur->oldNs = NULL;
   2290     cur->encoding = NULL;
   2291     cur->standalone = 1;
   2292     cur->compression = 0;
   2293     cur->ids = NULL;
   2294     cur->refs = NULL;
   2295     cur->_private = NULL;
   2296     cur->charset = XML_CHAR_ENCODING_UTF8;
   2297     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
   2298     if ((ExternalID != NULL) ||
   2299 	(URI != NULL))
   2300 	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
   2301     return(cur);
   2302 }
   2303 
   2304 /**
   2305  * htmlNewDoc:
   2306  * @URI:  URI for the dtd, or NULL
   2307  * @ExternalID:  the external ID of the DTD, or NULL
   2308  *
   2309  * Creates a new HTML document
   2310  *
   2311  * Returns a new document
   2312  */
   2313 htmlDocPtr
   2314 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
   2315     if ((URI == NULL) && (ExternalID == NULL))
   2316 	return(htmlNewDocNoDtD(
   2317 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
   2318 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
   2319 
   2320     return(htmlNewDocNoDtD(URI, ExternalID));
   2321 }
   2322 
   2323 
   2324 /************************************************************************
   2325  *									*
   2326  *			The parser itself				*
   2327  *	Relates to http://www.w3.org/TR/html40				*
   2328  *									*
   2329  ************************************************************************/
   2330 
   2331 /************************************************************************
   2332  *									*
   2333  *			The parser itself				*
   2334  *									*
   2335  ************************************************************************/
   2336 
   2337 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
   2338 
   2339 /**
   2340  * htmlParseHTMLName:
   2341  * @ctxt:  an HTML parser context
   2342  *
   2343  * parse an HTML tag or attribute name, note that we convert it to lowercase
   2344  * since HTML names are not case-sensitive.
   2345  *
   2346  * Returns the Tag Name parsed or NULL
   2347  */
   2348 
   2349 static const xmlChar *
   2350 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
   2351     int i = 0;
   2352     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
   2353 
   2354     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
   2355         (CUR != ':') && (CUR != '.')) return(NULL);
   2356 
   2357     while ((i < HTML_PARSER_BUFFER_SIZE) &&
   2358            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
   2359 	   (CUR == ':') || (CUR == '-') || (CUR == '_') ||
   2360            (CUR == '.'))) {
   2361 	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
   2362         else loc[i] = CUR;
   2363 	i++;
   2364 
   2365 	NEXT;
   2366     }
   2367 
   2368     return(xmlDictLookup(ctxt->dict, loc, i));
   2369 }
   2370 
   2371 
   2372 /**
   2373  * htmlParseHTMLName_nonInvasive:
   2374  * @ctxt:  an HTML parser context
   2375  *
   2376  * parse an HTML tag or attribute name, note that we convert it to lowercase
   2377  * since HTML names are not case-sensitive, this doesn't consume the data
   2378  * from the stream, it's a look-ahead
   2379  *
   2380  * Returns the Tag Name parsed or NULL
   2381  */
   2382 
   2383 static const xmlChar *
   2384 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
   2385     int i = 0;
   2386     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
   2387 
   2388     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
   2389         (NXT(1) != ':')) return(NULL);
   2390 
   2391     while ((i < HTML_PARSER_BUFFER_SIZE) &&
   2392            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
   2393 	   (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
   2394 	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
   2395         else loc[i] = NXT(1+i);
   2396 	i++;
   2397     }
   2398 
   2399     return(xmlDictLookup(ctxt->dict, loc, i));
   2400 }
   2401 
   2402 
   2403 /**
   2404  * htmlParseName:
   2405  * @ctxt:  an HTML parser context
   2406  *
   2407  * parse an HTML name, this routine is case sensitive.
   2408  *
   2409  * Returns the Name parsed or NULL
   2410  */
   2411 
   2412 static const xmlChar *
   2413 htmlParseName(htmlParserCtxtPtr ctxt) {
   2414     const xmlChar *in;
   2415     const xmlChar *ret;
   2416     int count = 0;
   2417 
   2418     GROW;
   2419 
   2420     /*
   2421      * Accelerator for simple ASCII names
   2422      */
   2423     in = ctxt->input->cur;
   2424     if (((*in >= 0x61) && (*in <= 0x7A)) ||
   2425 	((*in >= 0x41) && (*in <= 0x5A)) ||
   2426 	(*in == '_') || (*in == ':')) {
   2427 	in++;
   2428 	while (((*in >= 0x61) && (*in <= 0x7A)) ||
   2429 	       ((*in >= 0x41) && (*in <= 0x5A)) ||
   2430 	       ((*in >= 0x30) && (*in <= 0x39)) ||
   2431 	       (*in == '_') || (*in == '-') ||
   2432 	       (*in == ':') || (*in == '.'))
   2433 	    in++;
   2434 	if ((*in > 0) && (*in < 0x80)) {
   2435 	    count = in - ctxt->input->cur;
   2436 	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
   2437 	    ctxt->input->cur = in;
   2438 	    ctxt->nbChars += count;
   2439 	    ctxt->input->col += count;
   2440 	    return(ret);
   2441 	}
   2442     }
   2443     return(htmlParseNameComplex(ctxt));
   2444 }
   2445 
   2446 static const xmlChar *
   2447 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
   2448     int len = 0, l;
   2449     int c;
   2450     int count = 0;
   2451 
   2452     /*
   2453      * Handler for more complex cases
   2454      */
   2455     GROW;
   2456     c = CUR_CHAR(l);
   2457     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
   2458 	(!IS_LETTER(c) && (c != '_') &&
   2459          (c != ':'))) {
   2460 	return(NULL);
   2461     }
   2462 
   2463     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
   2464 	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
   2465             (c == '.') || (c == '-') ||
   2466 	    (c == '_') || (c == ':') ||
   2467 	    (IS_COMBINING(c)) ||
   2468 	    (IS_EXTENDER(c)))) {
   2469 	if (count++ > 100) {
   2470 	    count = 0;
   2471 	    GROW;
   2472 	}
   2473 	len += l;
   2474 	NEXTL(l);
   2475 	c = CUR_CHAR(l);
   2476     }
   2477     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
   2478 }
   2479 
   2480 
   2481 /**
   2482  * htmlParseHTMLAttribute:
   2483  * @ctxt:  an HTML parser context
   2484  * @stop:  a char stop value
   2485  *
   2486  * parse an HTML attribute value till the stop (quote), if
   2487  * stop is 0 then it stops at the first space
   2488  *
   2489  * Returns the attribute parsed or NULL
   2490  */
   2491 
   2492 static xmlChar *
   2493 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
   2494     xmlChar *buffer = NULL;
   2495     int buffer_size = 0;
   2496     xmlChar *out = NULL;
   2497     const xmlChar *name = NULL;
   2498     const xmlChar *cur = NULL;
   2499     const htmlEntityDesc * ent;
   2500 
   2501     /*
   2502      * allocate a translation buffer.
   2503      */
   2504     buffer_size = HTML_PARSER_BUFFER_SIZE;
   2505     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
   2506     if (buffer == NULL) {
   2507 	htmlErrMemory(ctxt, "buffer allocation failed\n");
   2508 	return(NULL);
   2509     }
   2510     out = buffer;
   2511 
   2512     /*
   2513      * Ok loop until we reach one of the ending chars
   2514      */
   2515     while ((CUR != 0) && (CUR != stop)) {
   2516 	if ((stop == 0) && (CUR == '>')) break;
   2517 	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
   2518         if (CUR == '&') {
   2519 	    if (NXT(1) == '#') {
   2520 		unsigned int c;
   2521 		int bits;
   2522 
   2523 		c = htmlParseCharRef(ctxt);
   2524 		if      (c <    0x80)
   2525 		        { *out++  = c;                bits= -6; }
   2526 		else if (c <   0x800)
   2527 		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   2528 		else if (c < 0x10000)
   2529 		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   2530 		else
   2531 		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   2532 
   2533 		for ( ; bits >= 0; bits-= 6) {
   2534 		    *out++  = ((c >> bits) & 0x3F) | 0x80;
   2535 		}
   2536 
   2537 		if (out - buffer > buffer_size - 100) {
   2538 			int indx = out - buffer;
   2539 
   2540 			growBuffer(buffer);
   2541 			out = &buffer[indx];
   2542 		}
   2543 	    } else {
   2544 		ent = htmlParseEntityRef(ctxt, &name);
   2545 		if (name == NULL) {
   2546 		    *out++ = '&';
   2547 		    if (out - buffer > buffer_size - 100) {
   2548 			int indx = out - buffer;
   2549 
   2550 			growBuffer(buffer);
   2551 			out = &buffer[indx];
   2552 		    }
   2553 		} else if (ent == NULL) {
   2554 		    *out++ = '&';
   2555 		    cur = name;
   2556 		    while (*cur != 0) {
   2557 			if (out - buffer > buffer_size - 100) {
   2558 			    int indx = out - buffer;
   2559 
   2560 			    growBuffer(buffer);
   2561 			    out = &buffer[indx];
   2562 			}
   2563 			*out++ = *cur++;
   2564 		    }
   2565 		} else {
   2566 		    unsigned int c;
   2567 		    int bits;
   2568 
   2569 		    if (out - buffer > buffer_size - 100) {
   2570 			int indx = out - buffer;
   2571 
   2572 			growBuffer(buffer);
   2573 			out = &buffer[indx];
   2574 		    }
   2575 		    c = ent->value;
   2576 		    if      (c <    0x80)
   2577 			{ *out++  = c;                bits= -6; }
   2578 		    else if (c <   0x800)
   2579 			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   2580 		    else if (c < 0x10000)
   2581 			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   2582 		    else
   2583 			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   2584 
   2585 		    for ( ; bits >= 0; bits-= 6) {
   2586 			*out++  = ((c >> bits) & 0x3F) | 0x80;
   2587 		    }
   2588 		}
   2589 	    }
   2590 	} else {
   2591 	    unsigned int c;
   2592 	    int bits, l;
   2593 
   2594 	    if (out - buffer > buffer_size - 100) {
   2595 		int indx = out - buffer;
   2596 
   2597 		growBuffer(buffer);
   2598 		out = &buffer[indx];
   2599 	    }
   2600 	    c = CUR_CHAR(l);
   2601 	    if      (c <    0x80)
   2602 		    { *out++  = c;                bits= -6; }
   2603 	    else if (c <   0x800)
   2604 		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   2605 	    else if (c < 0x10000)
   2606 		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   2607 	    else
   2608 		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   2609 
   2610 	    for ( ; bits >= 0; bits-= 6) {
   2611 		*out++  = ((c >> bits) & 0x3F) | 0x80;
   2612 	    }
   2613 	    NEXT;
   2614 	}
   2615     }
   2616     *out = 0;
   2617     return(buffer);
   2618 }
   2619 
   2620 /**
   2621  * htmlParseEntityRef:
   2622  * @ctxt:  an HTML parser context
   2623  * @str:  location to store the entity name
   2624  *
   2625  * parse an HTML ENTITY references
   2626  *
   2627  * [68] EntityRef ::= '&' Name ';'
   2628  *
   2629  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
   2630  *         if non-NULL *str will have to be freed by the caller.
   2631  */
   2632 const htmlEntityDesc *
   2633 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
   2634     const xmlChar *name;
   2635     const htmlEntityDesc * ent = NULL;
   2636 
   2637     if (str != NULL) *str = NULL;
   2638     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
   2639 
   2640     if (CUR == '&') {
   2641         NEXT;
   2642         name = htmlParseName(ctxt);
   2643 	if (name == NULL) {
   2644 	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   2645 	                 "htmlParseEntityRef: no name\n", NULL, NULL);
   2646 	} else {
   2647 	    GROW;
   2648 	    if (CUR == ';') {
   2649 	        if (str != NULL)
   2650 		    *str = name;
   2651 
   2652 		/*
   2653 		 * Lookup the entity in the table.
   2654 		 */
   2655 		ent = htmlEntityLookup(name);
   2656 		if (ent != NULL) /* OK that's ugly !!! */
   2657 		    NEXT;
   2658 	    } else {
   2659 		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
   2660 		             "htmlParseEntityRef: expecting ';'\n",
   2661 			     NULL, NULL);
   2662 	        if (str != NULL)
   2663 		    *str = name;
   2664 	    }
   2665 	}
   2666     }
   2667     return(ent);
   2668 }
   2669 
   2670 /**
   2671  * htmlParseAttValue:
   2672  * @ctxt:  an HTML parser context
   2673  *
   2674  * parse a value for an attribute
   2675  * Note: the parser won't do substitution of entities here, this
   2676  * will be handled later in xmlStringGetNodeList, unless it was
   2677  * asked for ctxt->replaceEntities != 0
   2678  *
   2679  * Returns the AttValue parsed or NULL.
   2680  */
   2681 
   2682 static xmlChar *
   2683 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
   2684     xmlChar *ret = NULL;
   2685 
   2686     if (CUR == '"') {
   2687         NEXT;
   2688 	ret = htmlParseHTMLAttribute(ctxt, '"');
   2689         if (CUR != '"') {
   2690 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
   2691 	                 "AttValue: \" expected\n", NULL, NULL);
   2692 	} else
   2693 	    NEXT;
   2694     } else if (CUR == '\'') {
   2695         NEXT;
   2696 	ret = htmlParseHTMLAttribute(ctxt, '\'');
   2697         if (CUR != '\'') {
   2698 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
   2699 	                 "AttValue: ' expected\n", NULL, NULL);
   2700 	} else
   2701 	    NEXT;
   2702     } else {
   2703         /*
   2704 	 * That's an HTMLism, the attribute value may not be quoted
   2705 	 */
   2706 	ret = htmlParseHTMLAttribute(ctxt, 0);
   2707 	if (ret == NULL) {
   2708 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
   2709 	                 "AttValue: no value found\n", NULL, NULL);
   2710 	}
   2711     }
   2712     return(ret);
   2713 }
   2714 
   2715 /**
   2716  * htmlParseSystemLiteral:
   2717  * @ctxt:  an HTML parser context
   2718  *
   2719  * parse an HTML Literal
   2720  *
   2721  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
   2722  *
   2723  * Returns the SystemLiteral parsed or NULL
   2724  */
   2725 
   2726 static xmlChar *
   2727 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
   2728     const xmlChar *q;
   2729     xmlChar *ret = NULL;
   2730 
   2731     if (CUR == '"') {
   2732         NEXT;
   2733 	q = CUR_PTR;
   2734 	while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
   2735 	    NEXT;
   2736 	if (!IS_CHAR_CH(CUR)) {
   2737 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
   2738 			 "Unfinished SystemLiteral\n", NULL, NULL);
   2739 	} else {
   2740 	    ret = xmlStrndup(q, CUR_PTR - q);
   2741 	    NEXT;
   2742         }
   2743     } else if (CUR == '\'') {
   2744         NEXT;
   2745 	q = CUR_PTR;
   2746 	while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
   2747 	    NEXT;
   2748 	if (!IS_CHAR_CH(CUR)) {
   2749 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
   2750 			 "Unfinished SystemLiteral\n", NULL, NULL);
   2751 	} else {
   2752 	    ret = xmlStrndup(q, CUR_PTR - q);
   2753 	    NEXT;
   2754         }
   2755     } else {
   2756 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
   2757 	             " or ' expected\n", NULL, NULL);
   2758     }
   2759 
   2760     return(ret);
   2761 }
   2762 
   2763 /**
   2764  * htmlParsePubidLiteral:
   2765  * @ctxt:  an HTML parser context
   2766  *
   2767  * parse an HTML public literal
   2768  *
   2769  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
   2770  *
   2771  * Returns the PubidLiteral parsed or NULL.
   2772  */
   2773 
   2774 static xmlChar *
   2775 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
   2776     const xmlChar *q;
   2777     xmlChar *ret = NULL;
   2778     /*
   2779      * Name ::= (Letter | '_') (NameChar)*
   2780      */
   2781     if (CUR == '"') {
   2782         NEXT;
   2783 	q = CUR_PTR;
   2784 	while (IS_PUBIDCHAR_CH(CUR)) NEXT;
   2785 	if (CUR != '"') {
   2786 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
   2787 	                 "Unfinished PubidLiteral\n", NULL, NULL);
   2788 	} else {
   2789 	    ret = xmlStrndup(q, CUR_PTR - q);
   2790 	    NEXT;
   2791 	}
   2792     } else if (CUR == '\'') {
   2793         NEXT;
   2794 	q = CUR_PTR;
   2795 	while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
   2796 	    NEXT;
   2797 	if (CUR != '\'') {
   2798 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
   2799 	                 "Unfinished PubidLiteral\n", NULL, NULL);
   2800 	} else {
   2801 	    ret = xmlStrndup(q, CUR_PTR - q);
   2802 	    NEXT;
   2803 	}
   2804     } else {
   2805 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
   2806 	             "PubidLiteral \" or ' expected\n", NULL, NULL);
   2807     }
   2808 
   2809     return(ret);
   2810 }
   2811 
   2812 /**
   2813  * htmlParseScript:
   2814  * @ctxt:  an HTML parser context
   2815  *
   2816  * parse the content of an HTML SCRIPT or STYLE element
   2817  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
   2818  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
   2819  * http://www.w3.org/TR/html4/types.html#type-script
   2820  * http://www.w3.org/TR/html4/types.html#h-6.15
   2821  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
   2822  *
   2823  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
   2824  * element and the value of intrinsic event attributes. User agents must
   2825  * not evaluate script data as HTML markup but instead must pass it on as
   2826  * data to a script engine.
   2827  * NOTES:
   2828  * - The content is passed like CDATA
   2829  * - the attributes for style and scripting "onXXX" are also described
   2830  *   as CDATA but SGML allows entities references in attributes so their
   2831  *   processing is identical as other attributes
   2832  */
   2833 static void
   2834 htmlParseScript(htmlParserCtxtPtr ctxt) {
   2835     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
   2836     int nbchar = 0;
   2837     int cur,l;
   2838 
   2839     SHRINK;
   2840     cur = CUR_CHAR(l);
   2841     while (IS_CHAR_CH(cur)) {
   2842 	if ((cur == '<') && (NXT(1) == '/')) {
   2843             /*
   2844              * One should break here, the specification is clear:
   2845              * Authors should therefore escape "</" within the content.
   2846              * Escape mechanisms are specific to each scripting or
   2847              * style sheet language.
   2848              *
   2849              * In recovery mode, only break if end tag match the
   2850              * current tag, effectively ignoring all tags inside the
   2851              * script/style block and treating the entire block as
   2852              * CDATA.
   2853              */
   2854             if (ctxt->recovery) {
   2855                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
   2856 				   xmlStrlen(ctxt->name)) == 0)
   2857                 {
   2858                     break; /* while */
   2859                 } else {
   2860 		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
   2861 				 "Element %s embeds close tag\n",
   2862 		                 ctxt->name, NULL);
   2863 		}
   2864             } else {
   2865                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
   2866                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
   2867                 {
   2868                     break; /* while */
   2869                 }
   2870             }
   2871 	}
   2872 	COPY_BUF(l,buf,nbchar,cur);
   2873 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
   2874 	    if (ctxt->sax->cdataBlock!= NULL) {
   2875 		/*
   2876 		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
   2877 		 */
   2878 		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
   2879 	    } else if (ctxt->sax->characters != NULL) {
   2880 		ctxt->sax->characters(ctxt->userData, buf, nbchar);
   2881 	    }
   2882 	    nbchar = 0;
   2883 	}
   2884 	GROW;
   2885 	NEXTL(l);
   2886 	cur = CUR_CHAR(l);
   2887     }
   2888 
   2889     if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
   2890         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
   2891                     "Invalid char in CDATA 0x%X\n", cur);
   2892         if (ctxt->input->cur < ctxt->input->end) {
   2893             NEXT;
   2894         }
   2895     }
   2896 
   2897     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
   2898 	if (ctxt->sax->cdataBlock!= NULL) {
   2899 	    /*
   2900 	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
   2901 	     */
   2902 	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
   2903 	} else if (ctxt->sax->characters != NULL) {
   2904 	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
   2905 	}
   2906     }
   2907 }
   2908 
   2909 
   2910 /**
   2911  * htmlParseCharData:
   2912  * @ctxt:  an HTML parser context
   2913  *
   2914  * parse a CharData section.
   2915  * if we are within a CDATA section ']]>' marks an end of section.
   2916  *
   2917  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
   2918  */
   2919 
   2920 static void
   2921 htmlParseCharData(htmlParserCtxtPtr ctxt) {
   2922     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
   2923     int nbchar = 0;
   2924     int cur, l;
   2925     int chunk = 0;
   2926 
   2927     SHRINK;
   2928     cur = CUR_CHAR(l);
   2929     while (((cur != '<') || (ctxt->token == '<')) &&
   2930            ((cur != '&') || (ctxt->token == '&')) &&
   2931 	   (cur != 0)) {
   2932 	if (!(IS_CHAR(cur))) {
   2933 	    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
   2934 	                "Invalid char in CDATA 0x%X\n", cur);
   2935 	} else {
   2936 	    COPY_BUF(l,buf,nbchar,cur);
   2937 	}
   2938 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
   2939 	    /*
   2940 	     * Ok the segment is to be consumed as chars.
   2941 	     */
   2942 	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
   2943 		if (areBlanks(ctxt, buf, nbchar)) {
   2944 		    if (ctxt->sax->ignorableWhitespace != NULL)
   2945 			ctxt->sax->ignorableWhitespace(ctxt->userData,
   2946 			                               buf, nbchar);
   2947 		} else {
   2948 		    htmlCheckParagraph(ctxt);
   2949 		    if (ctxt->sax->characters != NULL)
   2950 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
   2951 		}
   2952 	    }
   2953 	    nbchar = 0;
   2954 	}
   2955 	NEXTL(l);
   2956         chunk++;
   2957         if (chunk > HTML_PARSER_BUFFER_SIZE) {
   2958             chunk = 0;
   2959             SHRINK;
   2960             GROW;
   2961         }
   2962 	cur = CUR_CHAR(l);
   2963 	if (cur == 0) {
   2964 	    SHRINK;
   2965 	    GROW;
   2966 	    cur = CUR_CHAR(l);
   2967 	}
   2968     }
   2969     if (nbchar != 0) {
   2970         buf[nbchar] = 0;
   2971 
   2972 	/*
   2973 	 * Ok the segment is to be consumed as chars.
   2974 	 */
   2975 	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
   2976 	    if (areBlanks(ctxt, buf, nbchar)) {
   2977 		if (ctxt->sax->ignorableWhitespace != NULL)
   2978 		    ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
   2979 	    } else {
   2980 		htmlCheckParagraph(ctxt);
   2981 		if (ctxt->sax->characters != NULL)
   2982 		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
   2983 	    }
   2984 	}
   2985     } else {
   2986 	/*
   2987 	 * Loop detection
   2988 	 */
   2989 	if (cur == 0)
   2990 	    ctxt->instate = XML_PARSER_EOF;
   2991     }
   2992 }
   2993 
   2994 /**
   2995  * htmlParseExternalID:
   2996  * @ctxt:  an HTML parser context
   2997  * @publicID:  a xmlChar** receiving PubidLiteral
   2998  *
   2999  * Parse an External ID or a Public ID
   3000  *
   3001  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
   3002  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
   3003  *
   3004  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
   3005  *
   3006  * Returns the function returns SystemLiteral and in the second
   3007  *                case publicID receives PubidLiteral, is strict is off
   3008  *                it is possible to return NULL and have publicID set.
   3009  */
   3010 
   3011 static xmlChar *
   3012 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
   3013     xmlChar *URI = NULL;
   3014 
   3015     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
   3016          (UPP(2) == 'S') && (UPP(3) == 'T') &&
   3017 	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
   3018         SKIP(6);
   3019 	if (!IS_BLANK_CH(CUR)) {
   3020 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
   3021 	                 "Space required after 'SYSTEM'\n", NULL, NULL);
   3022 	}
   3023         SKIP_BLANKS;
   3024 	URI = htmlParseSystemLiteral(ctxt);
   3025 	if (URI == NULL) {
   3026 	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
   3027 	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
   3028         }
   3029     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
   3030 	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
   3031 	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
   3032         SKIP(6);
   3033 	if (!IS_BLANK_CH(CUR)) {
   3034 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
   3035 	                 "Space required after 'PUBLIC'\n", NULL, NULL);
   3036 	}
   3037         SKIP_BLANKS;
   3038 	*publicID = htmlParsePubidLiteral(ctxt);
   3039 	if (*publicID == NULL) {
   3040 	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
   3041 	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
   3042 			 NULL, NULL);
   3043 	}
   3044         SKIP_BLANKS;
   3045         if ((CUR == '"') || (CUR == '\'')) {
   3046 	    URI = htmlParseSystemLiteral(ctxt);
   3047 	}
   3048     }
   3049     return(URI);
   3050 }
   3051 
/**
 * htmlParsePI:
 * @ctxt:  an HTML parser context
 *
 * parse an XML Processing Instruction.
 *
 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
 */
   3060 static void
   3061 htmlParsePI(htmlParserCtxtPtr ctxt) {
   3062     xmlChar *buf = NULL;
   3063     int len = 0;
   3064     int size = HTML_PARSER_BUFFER_SIZE;
   3065     int cur, l;
   3066     const xmlChar *target;
   3067     xmlParserInputState state;
   3068     int count = 0;
   3069 
   3070     if ((RAW == '<') && (NXT(1) == '?')) {
   3071 	state = ctxt->instate;
   3072         ctxt->instate = XML_PARSER_PI;
   3073 	/*
   3074 	 * this is a Processing Instruction.
   3075 	 */
   3076 	SKIP(2);
   3077 	SHRINK;
   3078 
   3079 	/*
   3080 	 * Parse the target name and check for special support like
   3081 	 * namespace.
   3082 	 */
   3083         target = htmlParseName(ctxt);
   3084 	if (target != NULL) {
   3085 	    if (RAW == '>') {
   3086 		SKIP(1);
   3087 
   3088 		/*
   3089 		 * SAX: PI detected.
   3090 		 */
   3091 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
   3092 		    (ctxt->sax->processingInstruction != NULL))
   3093 		    ctxt->sax->processingInstruction(ctxt->userData,
   3094 		                                     target, NULL);
   3095 		ctxt->instate = state;
   3096 		return;
   3097 	    }
   3098 	    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
   3099 	    if (buf == NULL) {
   3100 		htmlErrMemory(ctxt, NULL);
   3101 		ctxt->instate = state;
   3102 		return;
   3103 	    }
   3104 	    cur = CUR;
   3105 	    if (!IS_BLANK(cur)) {
   3106 		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
   3107 			  "ParsePI: PI %s space expected\n", target, NULL);
   3108 	    }
   3109             SKIP_BLANKS;
   3110 	    cur = CUR_CHAR(l);
   3111 	    while (IS_CHAR(cur) && (cur != '>')) {
   3112 		if (len + 5 >= size) {
   3113 		    xmlChar *tmp;
   3114 
   3115 		    size *= 2;
   3116 		    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
   3117 		    if (tmp == NULL) {
   3118 			htmlErrMemory(ctxt, NULL);
   3119 			xmlFree(buf);
   3120 			ctxt->instate = state;
   3121 			return;
   3122 		    }
   3123 		    buf = tmp;
   3124 		}
   3125 		count++;
   3126 		if (count > 50) {
   3127 		    GROW;
   3128 		    count = 0;
   3129 		}
   3130 		COPY_BUF(l,buf,len,cur);
   3131 		NEXTL(l);
   3132 		cur = CUR_CHAR(l);
   3133 		if (cur == 0) {
   3134 		    SHRINK;
   3135 		    GROW;
   3136 		    cur = CUR_CHAR(l);
   3137 		}
   3138 	    }
   3139 	    buf[len] = 0;
   3140 	    if (cur != '>') {
   3141 		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
   3142 		      "ParsePI: PI %s never end ...\n", target, NULL);
   3143 	    } else {
   3144 		SKIP(1);
   3145 
   3146 		/*
   3147 		 * SAX: PI detected.
   3148 		 */
   3149 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
   3150 		    (ctxt->sax->processingInstruction != NULL))
   3151 		    ctxt->sax->processingInstruction(ctxt->userData,
   3152 		                                     target, buf);
   3153 	    }
   3154 	    xmlFree(buf);
   3155 	} else {
   3156 	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
   3157                          "PI is not started correctly", NULL, NULL);
   3158 	}
   3159 	ctxt->instate = state;
   3160     }
   3161 }
   3162 
   3163 /**
   3164  * htmlParseComment:
   3165  * @ctxt:  an HTML parser context
   3166  *
   3167  * Parse an XML (SGML) comment <!-- .... -->
   3168  *
   3169  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
   3170  */
   3171 static void
   3172 htmlParseComment(htmlParserCtxtPtr ctxt) {
   3173     xmlChar *buf = NULL;
   3174     int len;
   3175     int size = HTML_PARSER_BUFFER_SIZE;
   3176     int q, ql;
   3177     int r, rl;
   3178     int cur, l;
   3179     xmlParserInputState state;
   3180 
   3181     /*
   3182      * Check that there is a comment right here.
   3183      */
   3184     if ((RAW != '<') || (NXT(1) != '!') ||
   3185         (NXT(2) != '-') || (NXT(3) != '-')) return;
   3186 
   3187     state = ctxt->instate;
   3188     ctxt->instate = XML_PARSER_COMMENT;
   3189     SHRINK;
   3190     SKIP(4);
   3191     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
   3192     if (buf == NULL) {
   3193         htmlErrMemory(ctxt, "buffer allocation failed\n");
   3194 	ctxt->instate = state;
   3195 	return;
   3196     }
   3197     q = CUR_CHAR(ql);
   3198     NEXTL(ql);
   3199     r = CUR_CHAR(rl);
   3200     NEXTL(rl);
   3201     cur = CUR_CHAR(l);
   3202     len = 0;
   3203     while (IS_CHAR(cur) &&
   3204            ((cur != '>') ||
   3205 	    (r != '-') || (q != '-'))) {
   3206 	if (len + 5 >= size) {
   3207 	    xmlChar *tmp;
   3208 
   3209 	    size *= 2;
   3210 	    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
   3211 	    if (tmp == NULL) {
   3212 	        xmlFree(buf);
   3213 	        htmlErrMemory(ctxt, "growing buffer failed\n");
   3214 		ctxt->instate = state;
   3215 		return;
   3216 	    }
   3217 	    buf = tmp;
   3218 	}
   3219 	COPY_BUF(ql,buf,len,q);
   3220 	q = r;
   3221 	ql = rl;
   3222 	r = cur;
   3223 	rl = l;
   3224 	NEXTL(l);
   3225 	cur = CUR_CHAR(l);
   3226 	if (cur == 0) {
   3227 	    SHRINK;
   3228 	    GROW;
   3229 	    cur = CUR_CHAR(l);
   3230 	}
   3231     }
   3232     buf[len] = 0;
   3233     if (!IS_CHAR(cur)) {
   3234 	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
   3235 	             "Comment not terminated \n<!--%.50s\n", buf, NULL);
   3236 	xmlFree(buf);
   3237     } else {
   3238         NEXT;
   3239 	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
   3240 	    (!ctxt->disableSAX))
   3241 	    ctxt->sax->comment(ctxt->userData, buf);
   3242 	xmlFree(buf);
   3243     }
   3244     ctxt->instate = state;
   3245 }
   3246 
   3247 /**
   3248  * htmlParseCharRef:
   3249  * @ctxt:  an HTML parser context
   3250  *
   3251  * parse Reference declarations
   3252  *
   3253  * [66] CharRef ::= '&#' [0-9]+ ';' |
   3254  *                  '&#x' [0-9a-fA-F]+ ';'
   3255  *
   3256  * Returns the value parsed (as an int)
   3257  */
   3258 int
   3259 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
   3260     int val = 0;
   3261 
   3262     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   3263 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   3264 		     "htmlParseCharRef: context error\n",
   3265 		     NULL, NULL);
   3266         return(0);
   3267     }
   3268     if ((CUR == '&') && (NXT(1) == '#') &&
   3269         ((NXT(2) == 'x') || NXT(2) == 'X')) {
   3270 	SKIP(3);
   3271 	while (CUR != ';') {
   3272 	    if ((CUR >= '0') && (CUR <= '9'))
   3273 	        val = val * 16 + (CUR - '0');
   3274 	    else if ((CUR >= 'a') && (CUR <= 'f'))
   3275 	        val = val * 16 + (CUR - 'a') + 10;
   3276 	    else if ((CUR >= 'A') && (CUR <= 'F'))
   3277 	        val = val * 16 + (CUR - 'A') + 10;
   3278 	    else {
   3279 	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
   3280 		             "htmlParseCharRef: missing semicolon\n",
   3281 			     NULL, NULL);
   3282 		break;
   3283 	    }
   3284 	    NEXT;
   3285 	}
   3286 	if (CUR == ';')
   3287 	    NEXT;
   3288     } else if  ((CUR == '&') && (NXT(1) == '#')) {
   3289 	SKIP(2);
   3290 	while (CUR != ';') {
   3291 	    if ((CUR >= '0') && (CUR <= '9'))
   3292 	        val = val * 10 + (CUR - '0');
   3293 	    else {
   3294 	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
   3295 		             "htmlParseCharRef: missing semicolon\n",
   3296 			     NULL, NULL);
   3297 		break;
   3298 	    }
   3299 	    NEXT;
   3300 	}
   3301 	if (CUR == ';')
   3302 	    NEXT;
   3303     } else {
   3304 	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
   3305 	             "htmlParseCharRef: invalid value\n", NULL, NULL);
   3306     }
   3307     /*
   3308      * Check the value IS_CHAR ...
   3309      */
   3310     if (IS_CHAR(val)) {
   3311         return(val);
   3312     } else {
   3313 	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
   3314 			"htmlParseCharRef: invalid xmlChar value %d\n",
   3315 			val);
   3316     }
   3317     return(0);
   3318 }
   3319 
   3320 
   3321 /**
   3322  * htmlParseDocTypeDecl:
   3323  * @ctxt:  an HTML parser context
   3324  *
   3325  * parse a DOCTYPE declaration
   3326  *
   3327  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
   3328  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
   3329  */
   3330 
   3331 static void
   3332 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
   3333     const xmlChar *name;
   3334     xmlChar *ExternalID = NULL;
   3335     xmlChar *URI = NULL;
   3336 
   3337     /*
   3338      * We know that '<!DOCTYPE' has been detected.
   3339      */
   3340     SKIP(9);
   3341 
   3342     SKIP_BLANKS;
   3343 
   3344     /*
   3345      * Parse the DOCTYPE name.
   3346      */
   3347     name = htmlParseName(ctxt);
   3348     if (name == NULL) {
   3349 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   3350 	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
   3351 		     NULL, NULL);
   3352     }
   3353     /*
   3354      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
   3355      */
   3356 
   3357     SKIP_BLANKS;
   3358 
   3359     /*
   3360      * Check for SystemID and ExternalID
   3361      */
   3362     URI = htmlParseExternalID(ctxt, &ExternalID);
   3363     SKIP_BLANKS;
   3364 
   3365     /*
   3366      * We should be at the end of the DOCTYPE declaration.
   3367      */
   3368     if (CUR != '>') {
   3369 	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
   3370 	             "DOCTYPE improperly terminated\n", NULL, NULL);
   3371         /* We shouldn't try to resynchronize ... */
   3372     }
   3373     NEXT;
   3374 
   3375     /*
   3376      * Create or update the document accordingly to the DOCTYPE
   3377      */
   3378     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
   3379 	(!ctxt->disableSAX))
   3380 	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
   3381 
   3382     /*
   3383      * Cleanup, since we don't use all those identifiers
   3384      */
   3385     if (URI != NULL) xmlFree(URI);
   3386     if (ExternalID != NULL) xmlFree(ExternalID);
   3387 }
   3388 
   3389 /**
   3390  * htmlParseAttribute:
   3391  * @ctxt:  an HTML parser context
   3392  * @value:  a xmlChar ** used to store the value of the attribute
   3393  *
   3394  * parse an attribute
   3395  *
   3396  * [41] Attribute ::= Name Eq AttValue
   3397  *
   3398  * [25] Eq ::= S? '=' S?
   3399  *
   3400  * With namespace:
   3401  *
   3402  * [NS 11] Attribute ::= QName Eq AttValue
   3403  *
   3404  * Also the case QName == xmlns:??? is handled independently as a namespace
   3405  * definition.
   3406  *
   3407  * Returns the attribute name, and the value in *value.
   3408  */
   3409 
   3410 static const xmlChar *
   3411 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
   3412     const xmlChar *name;
   3413     xmlChar *val = NULL;
   3414 
   3415     *value = NULL;
   3416     name = htmlParseHTMLName(ctxt);
   3417     if (name == NULL) {
   3418 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   3419 	             "error parsing attribute name\n", NULL, NULL);
   3420         return(NULL);
   3421     }
   3422 
   3423     /*
   3424      * read the value
   3425      */
   3426     SKIP_BLANKS;
   3427     if (CUR == '=') {
   3428         NEXT;
   3429 	SKIP_BLANKS;
   3430 	val = htmlParseAttValue(ctxt);
   3431     }
   3432 
   3433     *value = val;
   3434     return(name);
   3435 }
   3436 
   3437 /**
   3438  * htmlCheckEncoding:
   3439  * @ctxt:  an HTML parser context
   3440  * @attvalue: the attribute value
   3441  *
   3442  * Checks an http-equiv attribute from a Meta tag to detect
   3443  * the encoding
   3444  * If a new encoding is detected the parser is switched to decode
   3445  * it and pass UTF8
   3446  */
   3447 static void
   3448 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
   3449     const xmlChar *encoding;
   3450 
   3451     if ((ctxt == NULL) || (attvalue == NULL) ||
   3452         (ctxt->options & HTML_PARSE_IGNORE_ENC))
   3453 	return;
   3454 
   3455     /* do not change encoding */
   3456     if (ctxt->input->encoding != NULL)
   3457         return;
   3458 
   3459     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
   3460     if (encoding != NULL) {
   3461 	encoding += 8;
   3462     } else {
   3463 	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
   3464 	if (encoding != NULL)
   3465 	    encoding += 9;
   3466     }
   3467     if (encoding != NULL) {
   3468 	xmlCharEncoding enc;
   3469 	xmlCharEncodingHandlerPtr handler;
   3470 
   3471 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
   3472 
   3473 	if (ctxt->input->encoding != NULL)
   3474 	    xmlFree((xmlChar *) ctxt->input->encoding);
   3475 	ctxt->input->encoding = xmlStrdup(encoding);
   3476 
   3477 	enc = xmlParseCharEncoding((const char *) encoding);
   3478 	/*
   3479 	 * registered set of known encodings
   3480 	 */
   3481 	if (enc != XML_CHAR_ENCODING_ERROR) {
   3482 	    if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
   3483 	         (enc == XML_CHAR_ENCODING_UTF16BE) ||
   3484 		 (enc == XML_CHAR_ENCODING_UCS4LE) ||
   3485 		 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
   3486 		(ctxt->input->buf != NULL) &&
   3487 		(ctxt->input->buf->encoder == NULL)) {
   3488 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
   3489 		             "htmlCheckEncoding: wrong encoding meta\n",
   3490 			     NULL, NULL);
   3491 	    } else {
   3492 		xmlSwitchEncoding(ctxt, enc);
   3493 	    }
   3494 	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
   3495 	} else {
   3496 	    /*
   3497 	     * fallback for unknown encodings
   3498 	     */
   3499 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
   3500 	    if (handler != NULL) {
   3501 		xmlSwitchToEncoding(ctxt, handler);
   3502 		ctxt->charset = XML_CHAR_ENCODING_UTF8;
   3503 	    } else {
   3504 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
   3505 		             "htmlCheckEncoding: unknown encoding %s\n",
   3506 			     encoding, NULL);
   3507 	    }
   3508 	}
   3509 
   3510 	if ((ctxt->input->buf != NULL) &&
   3511 	    (ctxt->input->buf->encoder != NULL) &&
   3512 	    (ctxt->input->buf->raw != NULL) &&
   3513 	    (ctxt->input->buf->buffer != NULL)) {
   3514 	    int nbchars;
   3515 	    int processed;
   3516 
   3517 	    /*
   3518 	     * convert as much as possible to the parser reading buffer.
   3519 	     */
   3520 	    processed = ctxt->input->cur - ctxt->input->base;
   3521 	    xmlBufferShrink(ctxt->input->buf->buffer, processed);
   3522 	    nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
   3523 		                       ctxt->input->buf->buffer,
   3524 				       ctxt->input->buf->raw);
   3525 	    if (nbchars < 0) {
   3526 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
   3527 		             "htmlCheckEncoding: encoder error\n",
   3528 			     NULL, NULL);
   3529 	    }
   3530 	    ctxt->input->base =
   3531 	    ctxt->input->cur = ctxt->input->buf->buffer->content;
   3532             ctxt->input->end =
   3533                           &ctxt->input->base[ctxt->input->buf->buffer->use];
   3534 	}
   3535     }
   3536 }
   3537 
   3538 /**
   3539  * htmlCheckMeta:
   3540  * @ctxt:  an HTML parser context
   3541  * @atts:  the attributes values
   3542  *
   3543  * Checks an attributes from a Meta tag
   3544  */
   3545 static void
   3546 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
   3547     int i;
   3548     const xmlChar *att, *value;
   3549     int http = 0;
   3550     const xmlChar *content = NULL;
   3551 
   3552     if ((ctxt == NULL) || (atts == NULL))
   3553 	return;
   3554 
   3555     i = 0;
   3556     att = atts[i++];
   3557     while (att != NULL) {
   3558 	value = atts[i++];
   3559 	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
   3560 	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
   3561 	    http = 1;
   3562 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
   3563 	    content = value;
   3564 	att = atts[i++];
   3565     }
   3566     if ((http) && (content != NULL))
   3567 	htmlCheckEncoding(ctxt, content);
   3568 
   3569 }
   3570 
   3571 /**
   3572  * htmlParseStartTag:
   3573  * @ctxt:  an HTML parser context
   3574  *
   3575  * parse a start of tag either for rule element or
   3576  * EmptyElement. In both case we don't parse the tag closing chars.
   3577  *
   3578  * [40] STag ::= '<' Name (S Attribute)* S? '>'
   3579  *
   3580  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
   3581  *
   3582  * With namespace:
   3583  *
   3584  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
   3585  *
   3586  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
   3587  *
   3588  * Returns 0 in case of success, -1 in case of error and 1 if discarded
   3589  */
   3590 
   3591 static int
   3592 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
   3593     const xmlChar *name;
   3594     const xmlChar *attname;
   3595     xmlChar *attvalue;
   3596     const xmlChar **atts;
   3597     int nbatts = 0;
   3598     int maxatts;
   3599     int meta = 0;
   3600     int i;
   3601     int discardtag = 0;
   3602 
   3603     if (ctxt->instate == XML_PARSER_EOF)
   3604         return(-1);
   3605     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   3606 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   3607 		     "htmlParseStartTag: context error\n", NULL, NULL);
   3608 	return -1;
   3609     }
   3610     if (CUR != '<') return -1;
   3611     NEXT;
   3612 
   3613     atts = ctxt->atts;
   3614     maxatts = ctxt->maxatts;
   3615 
   3616     GROW;
   3617     name = htmlParseHTMLName(ctxt);
   3618     if (name == NULL) {
   3619 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   3620 	             "htmlParseStartTag: invalid element name\n",
   3621 		     NULL, NULL);
   3622 	/* Dump the bogus tag like browsers do */
   3623 	while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
   3624                (ctxt->instate != XML_PARSER_EOF))
   3625 	    NEXT;
   3626         return -1;
   3627     }
   3628     if (xmlStrEqual(name, BAD_CAST"meta"))
   3629 	meta = 1;
   3630 
   3631     /*
   3632      * Check for auto-closure of HTML elements.
   3633      */
   3634     htmlAutoClose(ctxt, name);
   3635 
   3636     /*
   3637      * Check for implied HTML elements.
   3638      */
   3639     htmlCheckImplied(ctxt, name);
   3640 
   3641     /*
   3642      * Avoid html at any level > 0, head at any level != 1
   3643      * or any attempt to recurse body
   3644      */
   3645     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
   3646 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   3647 	             "htmlParseStartTag: misplaced <html> tag\n",
   3648 		     name, NULL);
   3649 	discardtag = 1;
   3650 	ctxt->depth++;
   3651     }
   3652     if ((ctxt->nameNr != 1) &&
   3653 	(xmlStrEqual(name, BAD_CAST"head"))) {
   3654 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   3655 	             "htmlParseStartTag: misplaced <head> tag\n",
   3656 		     name, NULL);
   3657 	discardtag = 1;
   3658 	ctxt->depth++;
   3659     }
   3660     if (xmlStrEqual(name, BAD_CAST"body")) {
   3661 	int indx;
   3662 	for (indx = 0;indx < ctxt->nameNr;indx++) {
   3663 	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
   3664 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   3665 		             "htmlParseStartTag: misplaced <body> tag\n",
   3666 			     name, NULL);
   3667 		discardtag = 1;
   3668 		ctxt->depth++;
   3669 	    }
   3670 	}
   3671     }
   3672 
   3673     /*
   3674      * Now parse the attributes, it ends up with the ending
   3675      *
   3676      * (S Attribute)* S?
   3677      */
   3678     SKIP_BLANKS;
   3679     while ((IS_CHAR_CH(CUR)) &&
   3680            (CUR != '>') &&
   3681 	   ((CUR != '/') || (NXT(1) != '>'))) {
   3682 	long cons = ctxt->nbChars;
   3683 
   3684 	GROW;
   3685 	attname = htmlParseAttribute(ctxt, &attvalue);
   3686         if (attname != NULL) {
   3687 
   3688 	    /*
   3689 	     * Well formedness requires at most one declaration of an attribute
   3690 	     */
   3691 	    for (i = 0; i < nbatts;i += 2) {
   3692 	        if (xmlStrEqual(atts[i], attname)) {
   3693 		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
   3694 		                 "Attribute %s redefined\n", attname, NULL);
   3695 		    if (attvalue != NULL)
   3696 			xmlFree(attvalue);
   3697 		    goto failed;
   3698 		}
   3699 	    }
   3700 
   3701 	    /*
   3702 	     * Add the pair to atts
   3703 	     */
   3704 	    if (atts == NULL) {
   3705 	        maxatts = 22; /* allow for 10 attrs by default */
   3706 	        atts = (const xmlChar **)
   3707 		       xmlMalloc(maxatts * sizeof(xmlChar *));
   3708 		if (atts == NULL) {
   3709 		    htmlErrMemory(ctxt, NULL);
   3710 		    if (attvalue != NULL)
   3711 			xmlFree(attvalue);
   3712 		    goto failed;
   3713 		}
   3714 		ctxt->atts = atts;
   3715 		ctxt->maxatts = maxatts;
   3716 	    } else if (nbatts + 4 > maxatts) {
   3717 	        const xmlChar **n;
   3718 
   3719 	        maxatts *= 2;
   3720 	        n = (const xmlChar **) xmlRealloc((void *) atts,
   3721 					     maxatts * sizeof(const xmlChar *));
   3722 		if (n == NULL) {
   3723 		    htmlErrMemory(ctxt, NULL);
   3724 		    if (attvalue != NULL)
   3725 			xmlFree(attvalue);
   3726 		    goto failed;
   3727 		}
   3728 		atts = n;
   3729 		ctxt->atts = atts;
   3730 		ctxt->maxatts = maxatts;
   3731 	    }
   3732 	    atts[nbatts++] = attname;
   3733 	    atts[nbatts++] = attvalue;
   3734 	    atts[nbatts] = NULL;
   3735 	    atts[nbatts + 1] = NULL;
   3736 	}
   3737 	else {
   3738 	    if (attvalue != NULL)
   3739 	        xmlFree(attvalue);
   3740 	    /* Dump the bogus attribute string up to the next blank or
   3741 	     * the end of the tag. */
   3742 	    while ((IS_CHAR_CH(CUR)) &&
   3743 	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
   3744 		   ((CUR != '/') || (NXT(1) != '>')))
   3745 		NEXT;
   3746 	}
   3747 
   3748 failed:
   3749 	SKIP_BLANKS;
   3750         if (cons == ctxt->nbChars) {
   3751 	    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   3752 	                 "htmlParseStartTag: problem parsing attributes\n",
   3753 			 NULL, NULL);
   3754 	    break;
   3755 	}
   3756     }
   3757 
   3758     /*
   3759      * Handle specific association to the META tag
   3760      */
   3761     if (meta && (nbatts != 0))
   3762 	htmlCheckMeta(ctxt, atts);
   3763 
   3764     /*
   3765      * SAX: Start of Element !
   3766      */
   3767     if (!discardtag) {
   3768 	htmlnamePush(ctxt, name);
   3769 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
   3770 	    if (nbatts != 0)
   3771 		ctxt->sax->startElement(ctxt->userData, name, atts);
   3772 	    else
   3773 		ctxt->sax->startElement(ctxt->userData, name, NULL);
   3774 	}
   3775     }
   3776 
   3777     if (atts != NULL) {
   3778         for (i = 1;i < nbatts;i += 2) {
   3779 	    if (atts[i] != NULL)
   3780 		xmlFree((xmlChar *) atts[i]);
   3781 	}
   3782     }
   3783 
   3784     return(discardtag);
   3785 }
   3786 
   3787 /**
   3788  * htmlParseEndTag:
   3789  * @ctxt:  an HTML parser context
   3790  *
   3791  * parse an end of tag
   3792  *
   3793  * [42] ETag ::= '</' Name S? '>'
   3794  *
   3795  * With namespace
   3796  *
   3797  * [NS 9] ETag ::= '</' QName S? '>'
   3798  *
   3799  * Returns 1 if the current level should be closed.
   3800  */
   3801 
   3802 static int
   3803 htmlParseEndTag(htmlParserCtxtPtr ctxt)
   3804 {
   3805     const xmlChar *name;
   3806     const xmlChar *oldname;
   3807     int i, ret;
   3808 
   3809     if ((CUR != '<') || (NXT(1) != '/')) {
   3810         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
   3811 	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
   3812         return (0);
   3813     }
   3814     SKIP(2);
   3815 
   3816     name = htmlParseHTMLName(ctxt);
   3817     if (name == NULL)
   3818         return (0);
   3819     /*
   3820      * We should definitely be at the ending "S? '>'" part
   3821      */
   3822     SKIP_BLANKS;
   3823     if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
   3824         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
   3825 	             "End tag : expected '>'\n", NULL, NULL);
   3826 	if (ctxt->recovery) {
   3827 	    /*
   3828 	     * We're not at the ending > !!
   3829 	     * Error, unless in recover mode where we search forwards
   3830 	     * until we find a >
   3831 	     */
   3832 	    while (CUR != '\0' && CUR != '>') NEXT;
   3833 	    NEXT;
   3834 	}
   3835     } else
   3836         NEXT;
   3837 
   3838     /*
   3839      * if we ignored misplaced tags in htmlParseStartTag don't pop them
   3840      * out now.
   3841      */
   3842     if ((ctxt->depth > 0) &&
   3843         (xmlStrEqual(name, BAD_CAST "html") ||
   3844          xmlStrEqual(name, BAD_CAST "body") ||
   3845 	 xmlStrEqual(name, BAD_CAST "head"))) {
   3846 	ctxt->depth--;
   3847 	return (0);
   3848     }
   3849 
   3850     /*
   3851      * If the name read is not one of the element in the parsing stack
   3852      * then return, it's just an error.
   3853      */
   3854     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
   3855         if (xmlStrEqual(name, ctxt->nameTab[i]))
   3856             break;
   3857     }
   3858     if (i < 0) {
   3859         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
   3860 	             "Unexpected end tag : %s\n", name, NULL);
   3861         return (0);
   3862     }
   3863 
   3864 
   3865     /*
   3866      * Check for auto-closure of HTML elements.
   3867      */
   3868 
   3869     htmlAutoCloseOnClose(ctxt, name);
   3870 
   3871     /*
   3872      * Well formedness constraints, opening and closing must match.
   3873      * With the exception that the autoclose may have popped stuff out
   3874      * of the stack.
   3875      */
   3876     if (!xmlStrEqual(name, ctxt->name)) {
   3877         if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
   3878             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
   3879 	                 "Opening and ending tag mismatch: %s and %s\n",
   3880 			 name, ctxt->name);
   3881         }
   3882     }
   3883 
   3884     /*
   3885      * SAX: End of Tag
   3886      */
   3887     oldname = ctxt->name;
   3888     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
   3889         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   3890             ctxt->sax->endElement(ctxt->userData, name);
   3891         htmlnamePop(ctxt);
   3892         ret = 1;
   3893     } else {
   3894         ret = 0;
   3895     }
   3896 
   3897     return (ret);
   3898 }
   3899 
   3900 
   3901 /**
   3902  * htmlParseReference:
   3903  * @ctxt:  an HTML parser context
   3904  *
   3905  * parse and handle entity references in content,
   3906  * this will end-up in a call to character() since this is either a
   3907  * CharRef, or a predefined entity.
   3908  */
   3909 static void
   3910 htmlParseReference(htmlParserCtxtPtr ctxt) {
   3911     const htmlEntityDesc * ent;
   3912     xmlChar out[6];
   3913     const xmlChar *name;
   3914     if (CUR != '&') return;
   3915 
   3916     if (NXT(1) == '#') {
   3917 	unsigned int c;
   3918 	int bits, i = 0;
   3919 
   3920 	c = htmlParseCharRef(ctxt);
   3921 	if (c == 0)
   3922 	    return;
   3923 
   3924         if      (c <    0x80) { out[i++]= c;                bits= -6; }
   3925         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   3926         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   3927         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   3928 
   3929         for ( ; bits >= 0; bits-= 6) {
   3930             out[i++]= ((c >> bits) & 0x3F) | 0x80;
   3931         }
   3932 	out[i] = 0;
   3933 
   3934 	htmlCheckParagraph(ctxt);
   3935 	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
   3936 	    ctxt->sax->characters(ctxt->userData, out, i);
   3937     } else {
   3938 	ent = htmlParseEntityRef(ctxt, &name);
   3939 	if (name == NULL) {
   3940 	    htmlCheckParagraph(ctxt);
   3941 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
   3942 	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
   3943 	    return;
   3944 	}
   3945 	if ((ent == NULL) || !(ent->value > 0)) {
   3946 	    htmlCheckParagraph(ctxt);
   3947 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
   3948 		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
   3949 		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
   3950 		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
   3951 	    }
   3952 	} else {
   3953 	    unsigned int c;
   3954 	    int bits, i = 0;
   3955 
   3956 	    c = ent->value;
   3957 	    if      (c <    0x80)
   3958 	            { out[i++]= c;                bits= -6; }
   3959 	    else if (c <   0x800)
   3960 	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
   3961 	    else if (c < 0x10000)
   3962 	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
   3963 	    else
   3964 	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
   3965 
   3966 	    for ( ; bits >= 0; bits-= 6) {
   3967 		out[i++]= ((c >> bits) & 0x3F) | 0x80;
   3968 	    }
   3969 	    out[i] = 0;
   3970 
   3971 	    htmlCheckParagraph(ctxt);
   3972 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
   3973 		ctxt->sax->characters(ctxt->userData, out, i);
   3974 	}
   3975     }
   3976 }
   3977 
   3978 /**
   3979  * htmlParseContent:
   3980  * @ctxt:  an HTML parser context
   3981  *
   3982  * Parse a content: comment, sub-element, reference or text.
   3983  * Kept for compatibility with old code
   3984  */
   3985 
   3986 static void
   3987 htmlParseContent(htmlParserCtxtPtr ctxt) {
   3988     xmlChar *currentNode;
   3989     int depth;
   3990     const xmlChar *name;
   3991 
   3992     currentNode = xmlStrdup(ctxt->name);
   3993     depth = ctxt->nameNr;
   3994     while (1) {
   3995 	long cons = ctxt->nbChars;
   3996 
   3997         GROW;
   3998 
   3999         if (ctxt->instate == XML_PARSER_EOF)
   4000             break;
   4001 
   4002 	/*
   4003 	 * Our tag or one of it's parent or children is ending.
   4004 	 */
   4005         if ((CUR == '<') && (NXT(1) == '/')) {
   4006 	    if (htmlParseEndTag(ctxt) &&
   4007 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
   4008 		if (currentNode != NULL)
   4009 		    xmlFree(currentNode);
   4010 		return;
   4011 	    }
   4012 	    continue; /* while */
   4013         }
   4014 
   4015 	else if ((CUR == '<') &&
   4016 	         ((IS_ASCII_LETTER(NXT(1))) ||
   4017 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
   4018 	    name = htmlParseHTMLName_nonInvasive(ctxt);
   4019 	    if (name == NULL) {
   4020 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
   4021 			 "htmlParseStartTag: invalid element name\n",
   4022 			 NULL, NULL);
   4023 	        /* Dump the bogus tag like browsers do */
   4024         while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
   4025 	            NEXT;
   4026 
   4027 	        if (currentNode != NULL)
   4028 	            xmlFree(currentNode);
   4029 	        return;
   4030 	    }
   4031 
   4032 	    if (ctxt->name != NULL) {
   4033 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
   4034 	            htmlAutoClose(ctxt, name);
   4035 	            continue;
   4036 	        }
   4037 	    }
   4038 	}
   4039 
   4040 	/*
   4041 	 * Has this node been popped out during parsing of
   4042 	 * the next element
   4043 	 */
   4044         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
   4045 	    (!xmlStrEqual(currentNode, ctxt->name)))
   4046 	     {
   4047 	    if (currentNode != NULL) xmlFree(currentNode);
   4048 	    return;
   4049 	}
   4050 
   4051 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
   4052 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
   4053 	    /*
   4054 	     * Handle SCRIPT/STYLE separately
   4055 	     */
   4056 	    htmlParseScript(ctxt);
   4057 	} else {
   4058 	    /*
   4059 	     * Sometimes DOCTYPE arrives in the middle of the document
   4060 	     */
   4061 	    if ((CUR == '<') && (NXT(1) == '!') &&
   4062 		(UPP(2) == 'D') && (UPP(3) == 'O') &&
   4063 		(UPP(4) == 'C') && (UPP(5) == 'T') &&
   4064 		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
   4065 		(UPP(8) == 'E')) {
   4066 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
   4067 		             "Misplaced DOCTYPE declaration\n",
   4068 			     BAD_CAST "DOCTYPE" , NULL);
   4069 		htmlParseDocTypeDecl(ctxt);
   4070 	    }
   4071 
   4072 	    /*
   4073 	     * First case :  a comment
   4074 	     */
   4075 	    if ((CUR == '<') && (NXT(1) == '!') &&
   4076 		(NXT(2) == '-') && (NXT(3) == '-')) {
   4077 		htmlParseComment(ctxt);
   4078 	    }
   4079 
   4080 	    /*
   4081 	     * Second case : a Processing Instruction.
   4082 	     */
   4083 	    else if ((CUR == '<') && (NXT(1) == '?')) {
   4084 		htmlParsePI(ctxt);
   4085 	    }
   4086 
   4087 	    /*
   4088 	     * Third case :  a sub-element.
   4089 	     */
   4090 	    else if (CUR == '<') {
   4091 		htmlParseElement(ctxt);
   4092 	    }
   4093 
   4094 	    /*
   4095 	     * Fourth case : a reference. If if has not been resolved,
   4096 	     *    parsing returns it's Name, create the node
   4097 	     */
   4098 	    else if (CUR == '&') {
   4099 		htmlParseReference(ctxt);
   4100 	    }
   4101 
   4102 	    /*
   4103 	     * Fifth case : end of the resource
   4104 	     */
   4105 	    else if (CUR == 0) {
   4106 		htmlAutoCloseOnEnd(ctxt);
   4107 		break;
   4108 	    }
   4109 
   4110 	    /*
   4111 	     * Last case, text. Note that References are handled directly.
   4112 	     */
   4113 	    else {
   4114 		htmlParseCharData(ctxt);
   4115 	    }
   4116 
   4117 	    if (cons == ctxt->nbChars) {
   4118 		if (ctxt->node != NULL) {
   4119 		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   4120 		                 "detected an error in element content\n",
   4121 				 NULL, NULL);
   4122 		}
   4123 		break;
   4124 	    }
   4125 	}
   4126         GROW;
   4127     }
   4128     if (currentNode != NULL) xmlFree(currentNode);
   4129 }
   4130 
   4131 /**
   4132  * htmlParseElement:
   4133  * @ctxt:  an HTML parser context
   4134  *
   4135  * parse an HTML element, this is highly recursive
   4136  * this is kept for compatibility with previous code versions
   4137  *
   4138  * [39] element ::= EmptyElemTag | STag content ETag
   4139  *
   4140  * [41] Attribute ::= Name Eq AttValue
   4141  */
   4142 
   4143 void
   4144 htmlParseElement(htmlParserCtxtPtr ctxt) {
   4145     const xmlChar *name;
   4146     xmlChar *currentNode = NULL;
   4147     const htmlElemDesc * info;
   4148     htmlParserNodeInfo node_info;
   4149     int failed;
   4150     int depth;
   4151     const xmlChar *oldptr;
   4152 
   4153     if ((ctxt == NULL) || (ctxt->input == NULL)) {
   4154 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
   4155 		     "htmlParseElement: context error\n", NULL, NULL);
   4156 	return;
   4157     }
   4158 
   4159     if (ctxt->instate == XML_PARSER_EOF)
   4160         return;
   4161 
   4162     /* Capture start position */
   4163     if (ctxt->record_info) {
   4164         node_info.begin_pos = ctxt->input->consumed +
   4165                           (CUR_PTR - ctxt->input->base);
   4166 	node_info.begin_line = ctxt->input->line;
   4167     }
   4168 
   4169     failed = htmlParseStartTag(ctxt);
   4170     name = ctxt->name;
   4171     if ((failed == -1) || (name == NULL)) {
   4172 	if (CUR == '>')
   4173 	    NEXT;
   4174         return;
   4175     }
   4176 
   4177     /*
   4178      * Lookup the info for that element.
   4179      */
   4180     info = htmlTagLookup(name);
   4181     if (info == NULL) {
   4182 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
   4183 	             "Tag %s invalid\n", name, NULL);
   4184     }
   4185 
   4186     /*
   4187      * Check for an Empty Element labeled the XML/SGML way
   4188      */
   4189     if ((CUR == '/') && (NXT(1) == '>')) {
   4190         SKIP(2);
   4191 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   4192 	    ctxt->sax->endElement(ctxt->userData, name);
   4193 	htmlnamePop(ctxt);
   4194 	return;
   4195     }
   4196 
   4197     if (CUR == '>') {
   4198         NEXT;
   4199     } else {
   4200 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
   4201 	             "Couldn't find end of Start Tag %s\n", name, NULL);
   4202 
   4203 	/*
   4204 	 * end of parsing of this node.
   4205 	 */
   4206 	if (xmlStrEqual(name, ctxt->name)) {
   4207 	    nodePop(ctxt);
   4208 	    htmlnamePop(ctxt);
   4209 	}
   4210 
   4211 	/*
   4212 	 * Capture end position and add node
   4213 	 */
   4214 	if (ctxt->record_info) {
   4215 	   node_info.end_pos = ctxt->input->consumed +
   4216 			      (CUR_PTR - ctxt->input->base);
   4217 	   node_info.end_line = ctxt->input->line;
   4218 	   node_info.node = ctxt->node;
   4219 	   xmlParserAddNodeInfo(ctxt, &node_info);
   4220 	}
   4221 	return;
   4222     }
   4223 
   4224     /*
   4225      * Check for an Empty Element from DTD definition
   4226      */
   4227     if ((info != NULL) && (info->empty)) {
   4228 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
   4229 	    ctxt->sax->endElement(ctxt->userData, name);
   4230 	htmlnamePop(ctxt);
   4231 	return;
   4232     }
   4233 
   4234     /*
   4235      * Parse the content of the element:
   4236      */
   4237     currentNode = xmlStrdup(ctxt->name);
   4238     depth = ctxt->nameNr;
   4239     while (IS_CHAR_CH(CUR)) {
   4240 	oldptr = ctxt->input->cur;
   4241 	htmlParseContent(ctxt);
   4242 	if (oldptr==ctxt->input->cur) break;
   4243 	if (ctxt->nameNr < depth) break;
   4244     }
   4245 
   4246     /*
   4247      * Capture end position and add node
   4248      */
   4249     if ( currentNode != NULL && ctxt->record_info ) {
   4250        node_info.end_pos = ctxt->input->consumed +
   4251                           (CUR_PTR - ctxt->input->base);
   4252        node_info.end_line = ctxt->input->line;
   4253        node_info.node = ctxt->node;
   4254        xmlParserAddNodeInfo(ctxt, &node_info);
   4255     }
   4256     if (!IS_CHAR_CH(CUR)) {
   4257 	htmlAutoCloseOnEnd(ctxt);
   4258     }
   4259 
   4260     if (currentNode != NULL)
   4261 	xmlFree(currentNode);
   4262 }
   4263 
   4264 static void
   4265