1 /* 2 * HTMLparser.c : an HTML 4.0 non-verifying parser 3 * 4 * See Copyright for the status of this software. 5 * 6 * daniel (at) veillard.com 7 */ 8 9 #define IN_LIBXML 10 #include "libxml.h" 11 #ifdef LIBXML_HTML_ENABLED 12 13 #include <string.h> 14 #ifdef HAVE_CTYPE_H 15 #include <ctype.h> 16 #endif 17 #ifdef HAVE_STDLIB_H 18 #include <stdlib.h> 19 #endif 20 #ifdef HAVE_SYS_STAT_H 21 #include <sys/stat.h> 22 #endif 23 #ifdef HAVE_FCNTL_H 24 #include <fcntl.h> 25 #endif 26 #ifdef HAVE_UNISTD_H 27 #include <unistd.h> 28 #endif 29 #ifdef HAVE_ZLIB_H 30 #include <zlib.h> 31 #endif 32 33 #include <libxml/xmlmemory.h> 34 #include <libxml/tree.h> 35 #include <libxml/parser.h> 36 #include <libxml/parserInternals.h> 37 #include <libxml/xmlerror.h> 38 #include <libxml/HTMLparser.h> 39 #include <libxml/HTMLtree.h> 40 #include <libxml/entities.h> 41 #include <libxml/encoding.h> 42 #include <libxml/valid.h> 43 #include <libxml/xmlIO.h> 44 #include <libxml/globals.h> 45 #include <libxml/uri.h> 46 47 #define HTML_MAX_NAMELEN 1000 48 #define HTML_PARSER_BIG_BUFFER_SIZE 1000 49 #define HTML_PARSER_BUFFER_SIZE 100 50 51 /* #define DEBUG */ 52 /* #define DEBUG_PUSH */ 53 54 static int htmlOmittedDefaultValue = 1; 55 56 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, 57 xmlChar end, xmlChar end2, xmlChar end3); 58 static void htmlParseComment(htmlParserCtxtPtr ctxt); 59 60 /************************************************************************ 61 * * 62 * Some factorized error routines * 63 * * 64 ************************************************************************/ 65 66 /** 67 * htmlErrMemory: 68 * @ctxt: an HTML parser context 69 * @extra: extra informations 70 * 71 * Handle a redefinition of attribute error 72 */ 73 static void 74 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra) 75 { 76 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 77 (ctxt->instate == XML_PARSER_EOF)) 78 return; 79 if (ctxt != NULL) { 80 ctxt->errNo = XML_ERR_NO_MEMORY; 81 
ctxt->instate = XML_PARSER_EOF; 82 ctxt->disableSAX = 1; 83 } 84 if (extra) 85 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 86 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 87 NULL, NULL, 0, 0, 88 "Memory allocation failed : %s\n", extra); 89 else 90 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 91 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL, 92 NULL, NULL, 0, 0, "Memory allocation failed\n"); 93 } 94 95 /** 96 * htmlParseErr: 97 * @ctxt: an HTML parser context 98 * @error: the error number 99 * @msg: the error message 100 * @str1: string infor 101 * @str2: string infor 102 * 103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints 104 */ 105 static void 106 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, 107 const char *msg, const xmlChar *str1, const xmlChar *str2) 108 { 109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 110 (ctxt->instate == XML_PARSER_EOF)) 111 return; 112 if (ctxt != NULL) 113 ctxt->errNo = error; 114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 115 XML_ERR_ERROR, NULL, 0, 116 (const char *) str1, (const char *) str2, 117 NULL, 0, 0, 118 msg, str1, str2); 119 if (ctxt != NULL) 120 ctxt->wellFormed = 0; 121 } 122 123 /** 124 * htmlParseErrInt: 125 * @ctxt: an HTML parser context 126 * @error: the error number 127 * @msg: the error message 128 * @val: integer info 129 * 130 * Handle a fatal parser error, i.e. 
violating Well-Formedness constraints 131 */ 132 static void 133 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, 134 const char *msg, int val) 135 { 136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 137 (ctxt->instate == XML_PARSER_EOF)) 138 return; 139 if (ctxt != NULL) 140 ctxt->errNo = error; 141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 142 XML_ERR_ERROR, NULL, 0, NULL, NULL, 143 NULL, val, 0, msg, val); 144 if (ctxt != NULL) 145 ctxt->wellFormed = 0; 146 } 147 148 /************************************************************************ 149 * * 150 * Parser stacks related functions and macros * 151 * * 152 ************************************************************************/ 153 154 /** 155 * htmlnamePush: 156 * @ctxt: an HTML parser context 157 * @value: the element name 158 * 159 * Pushes a new element name on top of the name stack 160 * 161 * Returns 0 in case of error, the index in the stack otherwise 162 */ 163 static int 164 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value) 165 { 166 if (ctxt->nameNr >= ctxt->nameMax) { 167 ctxt->nameMax *= 2; 168 ctxt->nameTab = (const xmlChar * *) 169 xmlRealloc((xmlChar * *)ctxt->nameTab, 170 ctxt->nameMax * 171 sizeof(ctxt->nameTab[0])); 172 if (ctxt->nameTab == NULL) { 173 htmlErrMemory(ctxt, NULL); 174 return (0); 175 } 176 } 177 ctxt->nameTab[ctxt->nameNr] = value; 178 ctxt->name = value; 179 return (ctxt->nameNr++); 180 } 181 /** 182 * htmlnamePop: 183 * @ctxt: an HTML parser context 184 * 185 * Pops the top element name from the name stack 186 * 187 * Returns the name just removed 188 */ 189 static const xmlChar * 190 htmlnamePop(htmlParserCtxtPtr ctxt) 191 { 192 const xmlChar *ret; 193 194 if (ctxt->nameNr <= 0) 195 return (NULL); 196 ctxt->nameNr--; 197 if (ctxt->nameNr < 0) 198 return (NULL); 199 if (ctxt->nameNr > 0) 200 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1]; 201 else 202 ctxt->name = NULL; 203 ret = ctxt->nameTab[ctxt->nameNr]; 204 
ctxt->nameTab[ctxt->nameNr] = NULL; 205 return (ret); 206 } 207 208 /* 209 * Macros for accessing the content. Those should be used only by the parser, 210 * and not exported. 211 * 212 * Dirty macros, i.e. one need to make assumption on the context to use them 213 * 214 * CUR_PTR return the current pointer to the xmlChar to be parsed. 215 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled 216 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled 217 * in UNICODE mode. This should be used internally by the parser 218 * only to compare to ASCII values otherwise it would break when 219 * running with UTF-8 encoding. 220 * NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only 221 * to compare on ASCII based substring. 222 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR 223 * it should be used only to compare on ASCII based substring. 224 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined 225 * strings without newlines within the parser. 226 * 227 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding 228 * 229 * CURRENT Returns the current char value, with the full decoding of 230 * UTF-8 if we are using this mode. It returns an int. 231 * NEXT Skip to the next character, this does the proper decoding 232 * in UTF-8 mode. It also pop-up unfinished entities on the fly. 233 * NEXTL(l) Skip the current unicode character of l xmlChars long. 
234 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly 235 */ 236 237 #define UPPER (toupper(*ctxt->input->cur)) 238 239 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val) 240 241 #define NXT(val) ctxt->input->cur[(val)] 242 243 #define UPP(val) (toupper(ctxt->input->cur[(val)])) 244 245 #define CUR_PTR ctxt->input->cur 246 247 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \ 248 (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \ 249 xmlParserInputShrink(ctxt->input) 250 251 #define GROW if ((ctxt->progressive == 0) && \ 252 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \ 253 xmlParserInputGrow(ctxt->input, INPUT_CHUNK) 254 255 #define CURRENT ((int) (*ctxt->input->cur)) 256 257 #define SKIP_BLANKS htmlSkipBlankChars(ctxt) 258 259 /* Inported from XML */ 260 261 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */ 262 #define CUR ((int) (*ctxt->input->cur)) 263 #define NEXT xmlNextChar(ctxt) 264 265 #define RAW (ctxt->token ? 
-1 : (*ctxt->input->cur)) 266 #define NXT(val) ctxt->input->cur[(val)] 267 #define CUR_PTR ctxt->input->cur 268 269 270 #define NEXTL(l) do { \ 271 if (*(ctxt->input->cur) == '\n') { \ 272 ctxt->input->line++; ctxt->input->col = 1; \ 273 } else ctxt->input->col++; \ 274 ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \ 275 } while (0) 276 277 /************ 278 \ 279 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \ 280 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt); 281 ************/ 282 283 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l) 284 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l) 285 286 #define COPY_BUF(l,b,i,v) \ 287 if (l == 1) b[i++] = (xmlChar) v; \ 288 else i += xmlCopyChar(l,&b[i],v) 289 290 /** 291 * htmlCurrentChar: 292 * @ctxt: the HTML parser context 293 * @len: pointer to the length of the char read 294 * 295 * The current char value, if using UTF-8 this may actually span multiple 296 * bytes in the input buffer. Implement the end of line normalization: 297 * 2.11 End-of-Line Handling 298 * If the encoding is unspecified, in the case we find an ISO-Latin-1 299 * char, then the encoding converter is plugged in automatically. 300 * 301 * Returns the current char value and its length 302 */ 303 304 static int 305 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { 306 if (ctxt->instate == XML_PARSER_EOF) 307 return(0); 308 309 if (ctxt->token != 0) { 310 *len = 0; 311 return(ctxt->token); 312 } 313 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { 314 /* 315 * We are supposed to handle UTF8, check it's valid 316 * From rfc2044: encoding of the Unicode values on UTF-8: 317 * 318 * UCS-4 range (hex.) 
UTF-8 octet sequence (binary) 319 * 0000 0000-0000 007F 0xxxxxxx 320 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx 321 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 322 * 323 * Check for the 0x110000 limit too 324 */ 325 const unsigned char *cur = ctxt->input->cur; 326 unsigned char c; 327 unsigned int val; 328 329 c = *cur; 330 if (c & 0x80) { 331 if (cur[1] == 0) 332 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 333 if ((cur[1] & 0xc0) != 0x80) 334 goto encoding_error; 335 if ((c & 0xe0) == 0xe0) { 336 337 if (cur[2] == 0) 338 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 339 if ((cur[2] & 0xc0) != 0x80) 340 goto encoding_error; 341 if ((c & 0xf0) == 0xf0) { 342 if (cur[3] == 0) 343 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 344 if (((c & 0xf8) != 0xf0) || 345 ((cur[3] & 0xc0) != 0x80)) 346 goto encoding_error; 347 /* 4-byte code */ 348 *len = 4; 349 val = (cur[0] & 0x7) << 18; 350 val |= (cur[1] & 0x3f) << 12; 351 val |= (cur[2] & 0x3f) << 6; 352 val |= cur[3] & 0x3f; 353 } else { 354 /* 3-byte code */ 355 *len = 3; 356 val = (cur[0] & 0xf) << 12; 357 val |= (cur[1] & 0x3f) << 6; 358 val |= cur[2] & 0x3f; 359 } 360 } else { 361 /* 2-byte code */ 362 *len = 2; 363 val = (cur[0] & 0x1f) << 6; 364 val |= cur[1] & 0x3f; 365 } 366 if (!IS_CHAR(val)) { 367 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 368 "Char 0x%X out of allowed range\n", val); 369 } 370 return(val); 371 } else { 372 /* 1-byte code */ 373 *len = 1; 374 return((int) *ctxt->input->cur); 375 } 376 } 377 /* 378 * Assume it's a fixed length encoding (1) with 379 * a compatible encoding for the ASCII set, since 380 * XML constructs only use < 128 chars 381 */ 382 *len = 1; 383 if ((int) *ctxt->input->cur < 0x80) 384 return((int) *ctxt->input->cur); 385 386 /* 387 * Humm this is bad, do an automatic flow conversion 388 */ 389 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); 390 ctxt->charset = XML_CHAR_ENCODING_UTF8; 391 return(xmlCurrentChar(ctxt, len)); 392 393 encoding_error: 394 /* 395 * If we detect 
an UTF8 error that probably mean that the 396 * input encoding didn't get properly advertized in the 397 * declaration header. Report the error and switch the encoding 398 * to ISO-Latin-1 (if you don't like this policy, just declare the 399 * encoding !) 400 */ 401 { 402 char buffer[150]; 403 404 if (ctxt->input->end - ctxt->input->cur >= 4) { 405 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", 406 ctxt->input->cur[0], ctxt->input->cur[1], 407 ctxt->input->cur[2], ctxt->input->cur[3]); 408 } else { 409 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]); 410 } 411 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 412 "Input is not proper UTF-8, indicate encoding !\n", 413 BAD_CAST buffer, NULL); 414 } 415 416 ctxt->charset = XML_CHAR_ENCODING_8859_1; 417 *len = 1; 418 return((int) *ctxt->input->cur); 419 } 420 421 /** 422 * htmlSkipBlankChars: 423 * @ctxt: the HTML parser context 424 * 425 * skip all blanks character found at that point in the input streams. 426 * 427 * Returns the number of space chars skipped 428 */ 429 430 static int 431 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) { 432 int res = 0; 433 434 while (IS_BLANK_CH(*(ctxt->input->cur))) { 435 if ((*ctxt->input->cur == 0) && 436 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) { 437 xmlPopInput(ctxt); 438 } else { 439 if (*(ctxt->input->cur) == '\n') { 440 ctxt->input->line++; ctxt->input->col = 1; 441 } else ctxt->input->col++; 442 ctxt->input->cur++; 443 ctxt->nbChars++; 444 if (*ctxt->input->cur == 0) 445 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 446 } 447 res++; 448 } 449 return(res); 450 } 451 452 453 454 /************************************************************************ 455 * * 456 * The list of HTML elements and their properties * 457 * * 458 ************************************************************************/ 459 460 /* 461 * Start Tag: 1 means the start tag can be ommited 462 * End Tag: 1 means the end tag can be ommited 463 * 2 means it's forbidden (empty 
elements) 464 * 3 means the tag is stylistic and should be closed easily 465 * Depr: this element is deprecated 466 * DTD: 1 means that this element is valid only in the Loose DTD 467 * 2 means that this element is valid only in the Frameset DTD 468 * 469 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description 470 , subElements , impliedsubelt , Attributes, userdata 471 */ 472 473 /* Definitions and a couple of vars for HTML Elements */ 474 475 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small" 476 #define NB_FONTSTYLE 8 477 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym" 478 #define NB_PHRASE 10 479 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe" 480 #define NB_SPECIAL 16 481 #define INLINE PCDATA FONTSTYLE PHRASE SPECIAL FORMCTRL 482 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL 483 #define BLOCK HEADING, LIST "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address" 484 #define NB_BLOCK NB_HEADING + NB_LIST + 14 485 #define FORMCTRL "input", "select", "textarea", "label", "button" 486 #define NB_FORMCTRL 5 487 #define PCDATA 488 #define NB_PCDATA 0 489 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6" 490 #define NB_HEADING 6 491 #define LIST "ul", "ol", "dir", "menu" 492 #define NB_LIST 4 493 #define MODIFIER 494 #define NB_MODIFIER 0 495 #define FLOW BLOCK,INLINE 496 #define NB_FLOW NB_BLOCK + NB_INLINE 497 #define EMPTY NULL 498 499 500 static const char* const html_flow[] = { FLOW, NULL } ; 501 static const char* const html_inline[] = { INLINE, NULL } ; 502 503 /* placeholders: elts with content but no subelements */ 504 static const char* const html_pcdata[] = { NULL } ; 505 #define html_cdata html_pcdata 506 507 508 /* ... 
and for HTML Attributes */ 509 510 #define COREATTRS "id", "class", "style", "title" 511 #define NB_COREATTRS 4 512 #define I18N "lang", "dir" 513 #define NB_I18N 2 514 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup" 515 #define NB_EVENTS 9 516 #define ATTRS COREATTRS,I18N,EVENTS 517 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS 518 #define CELLHALIGN "align", "char", "charoff" 519 #define NB_CELLHALIGN 3 520 #define CELLVALIGN "valign" 521 #define NB_CELLVALIGN 1 522 523 static const char* const html_attrs[] = { ATTRS, NULL } ; 524 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ; 525 static const char* const core_attrs[] = { COREATTRS, NULL } ; 526 static const char* const i18n_attrs[] = { I18N, NULL } ; 527 528 529 /* Other declarations that should go inline ... */ 530 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name", 531 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords", 532 "tabindex", "onfocus", "onblur", NULL } ; 533 static const char* const target_attr[] = { "target", NULL } ; 534 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ; 535 static const char* const alt_attr[] = { "alt", NULL } ; 536 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ; 537 static const char* const href_attrs[] = { "href", NULL } ; 538 static const char* const clear_attrs[] = { "clear", NULL } ; 539 static const char* const inline_p[] = { INLINE, "p", NULL } ; 540 541 static const char* const flow_param[] = { FLOW, "param", NULL } ; 542 static const char* const applet_attrs[] = { COREATTRS , "codebase", 543 "archive", "alt", "name", "height", "width", "align", 544 "hspace", "vspace", NULL } ; 545 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref", 546 "tabindex", "accesskey", "onfocus", "onblur", NULL } ; 547 static const char* const basefont_attrs[] = 548 { "id", 
"size", "color", "face", NULL } ; 549 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ; 550 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ; 551 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ; 552 static const char* const body_depr[] = { "background", "bgcolor", "text", 553 "link", "vlink", "alink", NULL } ; 554 static const char* const button_attrs[] = { ATTRS, "name", "value", "type", 555 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ; 556 557 558 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ; 559 static const char* const col_elt[] = { "col", NULL } ; 560 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ; 561 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ; 562 static const char* const dl_contents[] = { "dt", "dd", NULL } ; 563 static const char* const compact_attr[] = { "compact", NULL } ; 564 static const char* const label_attr[] = { "label", NULL } ; 565 static const char* const fieldset_contents[] = { FLOW, "legend" } ; 566 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ; 567 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ; 568 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ; 569 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ; 570 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ; 571 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ; 572 static const char* const head_attrs[] = { I18N, 
"profile", NULL } ; 573 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ; 574 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ; 575 static const char* const version_attr[] = { "version", NULL } ; 576 static const char* const html_content[] = { "head", "body", "frameset", NULL } ; 577 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ; 578 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ; 579 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ; 580 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ; 581 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ; 582 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ; 583 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ; 584 static const char* const align_attr[] = { "align", NULL } ; 585 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ; 586 static const char* const map_contents[] = { BLOCK, "area", NULL } ; 587 static const char* const name_attr[] = { "name", NULL } ; 588 static const char* const action_attr[] = { "action", NULL } ; 589 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ; 590 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", 
"scheme", NULL } ; 591 static const char* const content_attr[] = { "content", NULL } ; 592 static const char* const type_attr[] = { "type", NULL } ; 593 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ; 594 static const char* const object_contents[] = { FLOW, "param", NULL } ; 595 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ; 596 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ; 597 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ; 598 static const char* const option_elt[] = { "option", NULL } ; 599 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ; 600 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ; 601 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ; 602 static const char* const width_attr[] = { "width", NULL } ; 603 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ; 604 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ; 605 static const char* const language_attr[] = { "language", NULL } ; 606 static const char* const select_content[] = { "optgroup", "option", NULL } ; 607 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ; 608 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ; 609 static const char* const table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ; 610 static const char* const table_depr[] = { "align", "bgcolor", NULL } ; 611 static const char* const table_contents[] 
= { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ; 612 static const char* const tr_elt[] = { "tr", NULL } ; 613 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ; 614 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ; 615 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ; 616 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ; 617 static const char* const tr_contents[] = { "th", "td", NULL } ; 618 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ; 619 static const char* const li_elt[] = { "li", NULL } ; 620 static const char* const ul_depr[] = { "type", "compact", NULL} ; 621 static const char* const dir_attr[] = { "dir", NULL} ; 622 623 #define DECL (const char**) 624 625 static const htmlElemDesc 626 html40ElementTable[] = { 627 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ", 628 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL 629 }, 630 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form", 631 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 632 }, 633 { "acronym", 0, 0, 0, 0, 0, 0, 1, "", 634 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 635 }, 636 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ", 637 DECL inline_p , NULL , DECL html_attrs, NULL, NULL 638 }, 639 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ", 640 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL 641 }, 642 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ", 643 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr 644 }, 645 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style", 646 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 647 }, 648 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ", 649 EMPTY , NULL , NULL , DECL target_attr, 
DECL href_attrs 650 }, 651 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " , 652 EMPTY , NULL , NULL, DECL basefont_attrs, NULL 653 }, 654 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ", 655 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr 656 }, 657 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style", 658 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 659 }, 660 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ", 661 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL 662 }, 663 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ", 664 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL 665 }, 666 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ", 667 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL 668 }, 669 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ", 670 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL 671 }, 672 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ", 673 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 674 }, 675 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ", 676 DECL html_flow , NULL , NULL, DECL html_attrs, NULL 677 }, 678 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation", 679 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 680 }, 681 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment", 682 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 683 }, 684 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ", 685 EMPTY , NULL , DECL col_attrs , NULL, NULL 686 }, 687 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ", 688 DECL col_elt , "col" , DECL col_attrs , NULL, NULL 689 }, 690 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ", 691 DECL html_flow , NULL , DECL html_attrs, NULL, NULL 692 }, 693 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ", 694 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL 695 }, 696 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition", 697 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 698 }, 699 { "dir", 0, 
0, 0, 0, 1, 1, 0, "directory list", 700 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL 701 }, 702 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container", 703 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL 704 }, 705 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ", 706 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL 707 }, 708 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ", 709 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 710 }, 711 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis", 712 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 713 }, 714 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ", 715 EMPTY, NULL, DECL embed_attrs, NULL, NULL 716 }, 717 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ", 718 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL 719 }, 720 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ", 721 DECL html_inline, NULL, NULL, DECL font_attrs, NULL 722 }, 723 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ", 724 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr 725 }, 726 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " , 727 EMPTY, NULL, NULL, DECL frame_attrs, NULL 728 }, 729 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" , 730 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL 731 }, 732 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ", 733 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 734 }, 735 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ", 736 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 737 }, 738 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ", 739 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 740 }, 741 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ", 742 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 743 }, 744 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ", 745 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 746 }, 747 { "h6", 0, 0, 0, 0, 0, 0, 0, 
"heading ", 748 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 749 }, 750 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ", 751 DECL head_contents, NULL, DECL head_attrs, NULL, NULL 752 }, 753 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " , 754 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL 755 }, 756 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ", 757 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL 758 }, 759 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style", 760 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 761 }, 762 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ", 763 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL 764 }, 765 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ", 766 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs 767 }, 768 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ", 769 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL 770 }, 771 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text", 772 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL 773 }, 774 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ", 775 EMPTY, NULL, NULL, DECL prompt_attrs, NULL 776 }, 777 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user", 778 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 779 }, 780 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ", 781 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL 782 }, 783 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ", 784 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL 785 }, 786 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ", 787 DECL html_flow, NULL, DECL html_attrs, NULL, NULL 788 }, 789 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ", 790 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL 791 }, 792 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ", 793 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr 794 }, 795 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ", 796 DECL 
blockli_elt , NULL, NULL, DECL compact_attrs, NULL 797 }, 798 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ", 799 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr 800 }, 801 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ", 802 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL 803 }, 804 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ", 805 DECL html_flow, "div", DECL html_attrs, NULL, NULL 806 }, 807 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ", 808 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL 809 }, 810 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ", 811 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL 812 }, 813 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ", 814 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr 815 }, 816 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " , 817 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL 818 }, 819 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ", 820 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 821 }, 822 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ", 823 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr 824 }, 825 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ", 826 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL 827 }, 828 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ", 829 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL 830 }, 831 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style", 832 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 833 }, 834 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.", 835 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 836 }, 837 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ", 838 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr 839 }, 840 { "select", 0, 0, 0, 0, 0, 0, 1, 
"option selector ", 841 DECL select_content, NULL, DECL select_attrs, NULL, NULL 842 }, 843 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style", 844 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 845 }, 846 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ", 847 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 848 }, 849 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text", 850 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 851 }, 852 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis", 853 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 854 }, 855 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ", 856 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr 857 }, 858 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript", 859 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 860 }, 861 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ", 862 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 863 }, 864 { "table", 0, 0, 0, 0, 0, 0, 0, "", 865 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL 866 }, 867 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ", 868 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 869 }, 870 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell", 871 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 872 }, 873 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ", 874 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr 875 }, 876 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ", 877 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 878 }, 879 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell", 880 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 881 }, 882 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ", 883 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 884 }, 885 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ", 886 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL 887 }, 888 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ", 889 DECL tr_contents , "td" , DECL 
talign_attrs, DECL bgcolor_attr, NULL 890 }, 891 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style", 892 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 893 }, 894 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style", 895 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 896 }, 897 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ", 898 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL 899 }, 900 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument", 901 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 902 } 903 }; 904 905 /* 906 * start tags that imply the end of current element 907 */ 908 static const char * const htmlStartClose[] = { 909 "form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", 910 "dl", "ul", "ol", "menu", "dir", "address", "pre", 911 "listing", "xmp", "head", NULL, 912 "head", "p", NULL, 913 "title", "p", NULL, 914 "body", "head", "style", "link", "title", "p", NULL, 915 "frameset", "head", "style", "link", "title", "p", NULL, 916 "li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", 917 "pre", "listing", "xmp", "head", "li", NULL, 918 "hr", "p", "head", NULL, 919 "h1", "p", "head", NULL, 920 "h2", "p", "head", NULL, 921 "h3", "p", "head", NULL, 922 "h4", "p", "head", NULL, 923 "h5", "p", "head", NULL, 924 "h6", "p", "head", NULL, 925 "dir", "p", "head", NULL, 926 "address", "p", "head", "ul", NULL, 927 "pre", "p", "head", "ul", NULL, 928 "listing", "p", "head", NULL, 929 "xmp", "p", "head", NULL, 930 "blockquote", "p", "head", NULL, 931 "dl", "p", "dt", "menu", "dir", "address", "pre", "listing", 932 "xmp", "head", NULL, 933 "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp", 934 "head", "dd", NULL, 935 "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp", 936 "head", "dt", NULL, 937 "ul", "p", "head", "ol", "menu", "dir", "address", "pre", 938 "listing", "xmp", NULL, 939 "ol", "p", "head", "ul", NULL, 940 "menu", "p", "head", "ul", NULL, 941 "p", "p", "head", 
"h1", "h2", "h3", "h4", "h5", "h6", NULL, 942 "div", "p", "head", NULL, 943 "noscript", "p", "head", NULL, 944 "center", "font", "b", "i", "p", "head", NULL, 945 "a", "a", NULL, 946 "caption", "p", NULL, 947 "colgroup", "caption", "colgroup", "col", "p", NULL, 948 "col", "caption", "col", "p", NULL, 949 "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", 950 "listing", "xmp", "a", NULL, 951 "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 952 "td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 953 "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL, 954 "thead", "caption", "col", "colgroup", NULL, 955 "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead", 956 "tbody", "p", NULL, 957 "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead", 958 "tfoot", "tbody", "p", NULL, 959 "optgroup", "option", NULL, 960 "option", "option", NULL, 961 "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", 962 "pre", "listing", "xmp", "a", NULL, 963 NULL 964 }; 965 966 /* 967 * The list of HTML elements which are supposed not to have 968 * CDATA content and where a p element will be implied 969 * 970 * TODO: extend that list by reading the HTML SGML DTD on 971 * implied paragraph 972 */ 973 static const char *const htmlNoContentElements[] = { 974 "html", 975 "head", 976 NULL 977 }; 978 979 /* 980 * The list of HTML attributes which are of content %Script; 981 * NOTE: when adding ones, check htmlIsScriptAttribute() since 982 * it assumes the name starts with 'on' 983 */ 984 static const char *const htmlScriptAttributes[] = { 985 "onclick", 986 "ondblclick", 987 "onmousedown", 988 "onmouseup", 989 "onmouseover", 990 "onmousemove", 991 "onmouseout", 992 "onkeypress", 993 "onkeydown", 994 "onkeyup", 995 "onload", 996 "onunload", 997 "onfocus", 998 "onblur", 999 "onsubmit", 1000 "onrest", 1001 "onchange", 1002 "onselect" 1003 }; 1004 1005 /* 1006 * This table is used by the htmlparser to know what 
to do with 1007 * broken html pages. By assigning different priorities to different 1008 * elements the parser can decide how to handle extra endtags. 1009 * Endtags are only allowed to close elements with lower or equal 1010 * priority. 1011 */ 1012 1013 typedef struct { 1014 const char *name; 1015 int priority; 1016 } elementPriority; 1017 1018 static const elementPriority htmlEndPriority[] = { 1019 {"div", 150}, 1020 {"td", 160}, 1021 {"th", 160}, 1022 {"tr", 170}, 1023 {"thead", 180}, 1024 {"tbody", 180}, 1025 {"tfoot", 180}, 1026 {"table", 190}, 1027 {"head", 200}, 1028 {"body", 200}, 1029 {"html", 220}, 1030 {NULL, 100} /* Default priority */ 1031 }; 1032 1033 static const char** htmlStartCloseIndex[100]; 1034 static int htmlStartCloseIndexinitialized = 0; 1035 1036 /************************************************************************ 1037 * * 1038 * functions to handle HTML specific data * 1039 * * 1040 ************************************************************************/ 1041 1042 /** 1043 * htmlInitAutoClose: 1044 * 1045 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1046 * This is not reentrant. Call xmlInitParser() once before processing in 1047 * case of use in multithreaded programs. 1048 */ 1049 void 1050 htmlInitAutoClose(void) { 1051 int indx, i = 0; 1052 1053 if (htmlStartCloseIndexinitialized) return; 1054 1055 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL; 1056 indx = 0; 1057 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) { 1058 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i]; 1059 while (htmlStartClose[i] != NULL) i++; 1060 i++; 1061 } 1062 htmlStartCloseIndexinitialized = 1; 1063 } 1064 1065 /** 1066 * htmlTagLookup: 1067 * @tag: The tag name in lowercase 1068 * 1069 * Lookup the HTML tag in the ElementTable 1070 * 1071 * Returns the related htmlElemDescPtr or NULL if not found. 
1072 */ 1073 const htmlElemDesc * 1074 htmlTagLookup(const xmlChar *tag) { 1075 unsigned int i; 1076 1077 for (i = 0; i < (sizeof(html40ElementTable) / 1078 sizeof(html40ElementTable[0]));i++) { 1079 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name)) 1080 return((htmlElemDescPtr) &html40ElementTable[i]); 1081 } 1082 return(NULL); 1083 } 1084 1085 /** 1086 * htmlGetEndPriority: 1087 * @name: The name of the element to look up the priority for. 1088 * 1089 * Return value: The "endtag" priority. 1090 **/ 1091 static int 1092 htmlGetEndPriority (const xmlChar *name) { 1093 int i = 0; 1094 1095 while ((htmlEndPriority[i].name != NULL) && 1096 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name))) 1097 i++; 1098 1099 return(htmlEndPriority[i].priority); 1100 } 1101 1102 1103 /** 1104 * htmlCheckAutoClose: 1105 * @newtag: The new tag name 1106 * @oldtag: The old tag name 1107 * 1108 * Checks whether the new tag is one of the registered valid tags for 1109 * closing old. 1110 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1111 * 1112 * Returns 0 if no, 1 if yes. 
1113 */ 1114 static int 1115 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag) 1116 { 1117 int i, indx; 1118 const char **closed = NULL; 1119 1120 if (htmlStartCloseIndexinitialized == 0) 1121 htmlInitAutoClose(); 1122 1123 /* inefficient, but not a big deal */ 1124 for (indx = 0; indx < 100; indx++) { 1125 closed = htmlStartCloseIndex[indx]; 1126 if (closed == NULL) 1127 return (0); 1128 if (xmlStrEqual(BAD_CAST * closed, newtag)) 1129 break; 1130 } 1131 1132 i = closed - htmlStartClose; 1133 i++; 1134 while (htmlStartClose[i] != NULL) { 1135 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) { 1136 return (1); 1137 } 1138 i++; 1139 } 1140 return (0); 1141 } 1142 1143 /** 1144 * htmlAutoCloseOnClose: 1145 * @ctxt: an HTML parser context 1146 * @newtag: The new tag name 1147 * @force: force the tag closure 1148 * 1149 * The HTML DTD allows an ending tag to implicitly close other tags. 1150 */ 1151 static void 1152 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1153 { 1154 const htmlElemDesc *info; 1155 int i, priority; 1156 1157 priority = htmlGetEndPriority(newtag); 1158 1159 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1160 1161 if (xmlStrEqual(newtag, ctxt->nameTab[i])) 1162 break; 1163 /* 1164 * A missplaced endtag can only close elements with lower 1165 * or equal priority, so if we find an element with higher 1166 * priority before we find an element with 1167 * matching name, we just ignore this endtag 1168 */ 1169 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority) 1170 return; 1171 } 1172 if (i < 0) 1173 return; 1174 1175 while (!xmlStrEqual(newtag, ctxt->name)) { 1176 info = htmlTagLookup(ctxt->name); 1177 if ((info != NULL) && (info->endTag == 3)) { 1178 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 1179 "Opening and ending tag mismatch: %s and %s\n", 1180 newtag, ctxt->name); 1181 } 1182 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1183 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1184 
htmlnamePop(ctxt); 1185 } 1186 } 1187 1188 /** 1189 * htmlAutoCloseOnEnd: 1190 * @ctxt: an HTML parser context 1191 * 1192 * Close all remaining tags at the end of the stream 1193 */ 1194 static void 1195 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) 1196 { 1197 int i; 1198 1199 if (ctxt->nameNr == 0) 1200 return; 1201 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1202 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1203 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1204 htmlnamePop(ctxt); 1205 } 1206 } 1207 1208 /** 1209 * htmlAutoClose: 1210 * @ctxt: an HTML parser context 1211 * @newtag: The new tag name or NULL 1212 * 1213 * The HTML DTD allows a tag to implicitly close other tags. 1214 * The list is kept in htmlStartClose array. This function is 1215 * called when a new tag has been detected and generates the 1216 * appropriates closes if possible/needed. 1217 * If newtag is NULL this mean we are at the end of the resource 1218 * and we should check 1219 */ 1220 static void 1221 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1222 { 1223 while ((newtag != NULL) && (ctxt->name != NULL) && 1224 (htmlCheckAutoClose(newtag, ctxt->name))) { 1225 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1226 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1227 htmlnamePop(ctxt); 1228 } 1229 if (newtag == NULL) { 1230 htmlAutoCloseOnEnd(ctxt); 1231 return; 1232 } 1233 while ((newtag == NULL) && (ctxt->name != NULL) && 1234 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) || 1235 (xmlStrEqual(ctxt->name, BAD_CAST "body")) || 1236 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) { 1237 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1238 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1239 htmlnamePop(ctxt); 1240 } 1241 } 1242 1243 /** 1244 * htmlAutoCloseTag: 1245 * @doc: the HTML document 1246 * @name: The tag name 1247 * @elem: the HTML element 1248 * 1249 * The HTML DTD allows a tag to implicitly close other tags. 
1250 * The list is kept in htmlStartClose array. This function checks 1251 * if the element or one of it's children would autoclose the 1252 * given tag. 1253 * 1254 * Returns 1 if autoclose, 0 otherwise 1255 */ 1256 int 1257 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) { 1258 htmlNodePtr child; 1259 1260 if (elem == NULL) return(1); 1261 if (xmlStrEqual(name, elem->name)) return(0); 1262 if (htmlCheckAutoClose(elem->name, name)) return(1); 1263 child = elem->children; 1264 while (child != NULL) { 1265 if (htmlAutoCloseTag(doc, name, child)) return(1); 1266 child = child->next; 1267 } 1268 return(0); 1269 } 1270 1271 /** 1272 * htmlIsAutoClosed: 1273 * @doc: the HTML document 1274 * @elem: the HTML element 1275 * 1276 * The HTML DTD allows a tag to implicitly close other tags. 1277 * The list is kept in htmlStartClose array. This function checks 1278 * if a tag is autoclosed by one of it's child 1279 * 1280 * Returns 1 if autoclosed, 0 otherwise 1281 */ 1282 int 1283 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) { 1284 htmlNodePtr child; 1285 1286 if (elem == NULL) return(1); 1287 child = elem->children; 1288 while (child != NULL) { 1289 if (htmlAutoCloseTag(doc, elem->name, child)) return(1); 1290 child = child->next; 1291 } 1292 return(0); 1293 } 1294 1295 /** 1296 * htmlCheckImplied: 1297 * @ctxt: an HTML parser context 1298 * @newtag: The new tag name 1299 * 1300 * The HTML DTD allows a tag to exists only implicitly 1301 * called when a new tag has been detected and generates the 1302 * appropriates implicit tags if missing 1303 */ 1304 static void 1305 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { 1306 if (!htmlOmittedDefaultValue) 1307 return; 1308 if (xmlStrEqual(newtag, BAD_CAST"html")) 1309 return; 1310 if (ctxt->nameNr <= 0) { 1311 htmlnamePush(ctxt, BAD_CAST"html"); 1312 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1313 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL); 1314 
} 1315 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head"))) 1316 return; 1317 if ((ctxt->nameNr <= 1) && 1318 ((xmlStrEqual(newtag, BAD_CAST"script")) || 1319 (xmlStrEqual(newtag, BAD_CAST"style")) || 1320 (xmlStrEqual(newtag, BAD_CAST"meta")) || 1321 (xmlStrEqual(newtag, BAD_CAST"link")) || 1322 (xmlStrEqual(newtag, BAD_CAST"title")) || 1323 (xmlStrEqual(newtag, BAD_CAST"base")))) { 1324 /* 1325 * dropped OBJECT ... i you put it first BODY will be 1326 * assumed ! 1327 */ 1328 htmlnamePush(ctxt, BAD_CAST"head"); 1329 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1330 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL); 1331 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) && 1332 (!xmlStrEqual(newtag, BAD_CAST"frame")) && 1333 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) { 1334 int i; 1335 for (i = 0;i < ctxt->nameNr;i++) { 1336 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) { 1337 return; 1338 } 1339 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) { 1340 return; 1341 } 1342 } 1343 1344 htmlnamePush(ctxt, BAD_CAST"body"); 1345 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1346 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL); 1347 } 1348 } 1349 1350 /** 1351 * htmlCheckParagraph 1352 * @ctxt: an HTML parser context 1353 * 1354 * Check whether a p element need to be implied before inserting 1355 * characters in the current element. 1356 * 1357 * Returns 1 if a paragraph has been inserted, 0 if not and -1 1358 * in case of error. 
1359 */ 1360 1361 static int 1362 htmlCheckParagraph(htmlParserCtxtPtr ctxt) { 1363 const xmlChar *tag; 1364 int i; 1365 1366 if (ctxt == NULL) 1367 return(-1); 1368 tag = ctxt->name; 1369 if (tag == NULL) { 1370 htmlAutoClose(ctxt, BAD_CAST"p"); 1371 htmlCheckImplied(ctxt, BAD_CAST"p"); 1372 htmlnamePush(ctxt, BAD_CAST"p"); 1373 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1374 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1375 return(1); 1376 } 1377 if (!htmlOmittedDefaultValue) 1378 return(0); 1379 for (i = 0; htmlNoContentElements[i] != NULL; i++) { 1380 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) { 1381 htmlAutoClose(ctxt, BAD_CAST"p"); 1382 htmlCheckImplied(ctxt, BAD_CAST"p"); 1383 htmlnamePush(ctxt, BAD_CAST"p"); 1384 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1385 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1386 return(1); 1387 } 1388 } 1389 return(0); 1390 } 1391 1392 /** 1393 * htmlIsScriptAttribute: 1394 * @name: an attribute name 1395 * 1396 * Check if an attribute is of content type Script 1397 * 1398 * Returns 1 is the attribute is a script 0 otherwise 1399 */ 1400 int 1401 htmlIsScriptAttribute(const xmlChar *name) { 1402 unsigned int i; 1403 1404 if (name == NULL) 1405 return(0); 1406 /* 1407 * all script attributes start with 'on' 1408 */ 1409 if ((name[0] != 'o') || (name[1] != 'n')) 1410 return(0); 1411 for (i = 0; 1412 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]); 1413 i++) { 1414 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i])) 1415 return(1); 1416 } 1417 return(0); 1418 } 1419 1420 /************************************************************************ 1421 * * 1422 * The list of HTML predefined entities * 1423 * * 1424 ************************************************************************/ 1425 1426 1427 static const htmlEntityDesc html40EntitiesTable[] = { 1428 /* 1429 * the 4 absolute ones, plus apostrophe. 
1430 */ 1431 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" }, 1432 { 38, "amp", "ampersand, U+0026 ISOnum" }, 1433 { 39, "apos", "single quote" }, 1434 { 60, "lt", "less-than sign, U+003C ISOnum" }, 1435 { 62, "gt", "greater-than sign, U+003E ISOnum" }, 1436 1437 /* 1438 * A bunch still in the 128-255 range 1439 * Replacing them depend really on the charset used. 1440 */ 1441 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" }, 1442 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" }, 1443 { 162, "cent", "cent sign, U+00A2 ISOnum" }, 1444 { 163, "pound","pound sign, U+00A3 ISOnum" }, 1445 { 164, "curren","currency sign, U+00A4 ISOnum" }, 1446 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" }, 1447 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" }, 1448 { 167, "sect", "section sign, U+00A7 ISOnum" }, 1449 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" }, 1450 { 169, "copy", "copyright sign, U+00A9 ISOnum" }, 1451 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" }, 1452 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" }, 1453 { 172, "not", "not sign, U+00AC ISOnum" }, 1454 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" }, 1455 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" }, 1456 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" }, 1457 { 176, "deg", "degree sign, U+00B0 ISOnum" }, 1458 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" }, 1459 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" }, 1460 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" }, 1461 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" }, 1462 { 181, "micro","micro sign, U+00B5 ISOnum" }, 1463 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" }, 1464 { 183, "middot","middle dot 
= Georgian comma Greek middle dot, U+00B7 ISOnum" }, 1465 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" }, 1466 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" }, 1467 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" }, 1468 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" }, 1469 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" }, 1470 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" }, 1471 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" }, 1472 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" }, 1473 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" }, 1474 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" }, 1475 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" }, 1476 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" }, 1477 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" }, 1478 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" }, 1479 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" }, 1480 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" }, 1481 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" }, 1482 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" }, 1483 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" }, 1484 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" }, 1485 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" }, 1486 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" }, 1487 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" }, 1488 { 207, "Iuml", 
"latin capital letter I with diaeresis, U+00CF ISOlat1" }, 1489 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" }, 1490 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" }, 1491 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" }, 1492 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" }, 1493 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" }, 1494 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" }, 1495 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" }, 1496 { 215, "times","multiplication sign, U+00D7 ISOnum" }, 1497 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" }, 1498 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" }, 1499 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" }, 1500 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" }, 1501 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" }, 1502 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" }, 1503 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" }, 1504 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" }, 1505 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" }, 1506 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" }, 1507 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" }, 1508 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" }, 1509 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" }, 1510 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" }, 1511 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" }, 1512 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" }, 1513 { 232, "egrave","latin small 
letter e with grave, U+00E8 ISOlat1" }, 1514 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" }, 1515 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" }, 1516 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" }, 1517 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" }, 1518 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" }, 1519 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" }, 1520 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" }, 1521 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" }, 1522 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" }, 1523 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" }, 1524 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" }, 1525 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" }, 1526 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" }, 1527 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" }, 1528 { 247, "divide","division sign, U+00F7 ISOnum" }, 1529 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" }, 1530 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" }, 1531 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" }, 1532 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" }, 1533 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" }, 1534 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" }, 1535 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" }, 1536 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" }, 1537 1538 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" }, 1539 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" }, 1540 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" }, 1541 { 353, 
"scaron","latin small letter s with caron, U+0161 ISOlat2" }, 1542 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" }, 1543 1544 /* 1545 * Anything below should really be kept as entities references 1546 */ 1547 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" }, 1548 1549 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" }, 1550 { 732, "tilde","small tilde, U+02DC ISOdia" }, 1551 1552 { 913, "Alpha","greek capital letter alpha, U+0391" }, 1553 { 914, "Beta", "greek capital letter beta, U+0392" }, 1554 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" }, 1555 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" }, 1556 { 917, "Epsilon","greek capital letter epsilon, U+0395" }, 1557 { 918, "Zeta", "greek capital letter zeta, U+0396" }, 1558 { 919, "Eta", "greek capital letter eta, U+0397" }, 1559 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" }, 1560 { 921, "Iota", "greek capital letter iota, U+0399" }, 1561 { 922, "Kappa","greek capital letter kappa, U+039A" }, 1562 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" }, 1563 { 924, "Mu", "greek capital letter mu, U+039C" }, 1564 { 925, "Nu", "greek capital letter nu, U+039D" }, 1565 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" }, 1566 { 927, "Omicron","greek capital letter omicron, U+039F" }, 1567 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" }, 1568 { 929, "Rho", "greek capital letter rho, U+03A1" }, 1569 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" }, 1570 { 932, "Tau", "greek capital letter tau, U+03A4" }, 1571 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" }, 1572 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" }, 1573 { 935, "Chi", "greek capital letter chi, U+03A7" }, 1574 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" }, 1575 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" }, 1576 1577 { 945, "alpha","greek small letter 
alpha, U+03B1 ISOgrk3" }, 1578 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" }, 1579 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" }, 1580 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" }, 1581 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" }, 1582 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" }, 1583 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" }, 1584 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" }, 1585 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" }, 1586 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" }, 1587 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" }, 1588 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" }, 1589 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" }, 1590 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" }, 1591 { 959, "omicron","greek small letter omicron, U+03BF NEW" }, 1592 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" }, 1593 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" }, 1594 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" }, 1595 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" }, 1596 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" }, 1597 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" }, 1598 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" }, 1599 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" }, 1600 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" }, 1601 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" }, 1602 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" }, 1603 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" }, 1604 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" }, 1605 1606 { 8194, "ensp", "en space, U+2002 ISOpub" }, 1607 { 8195, "emsp", "em space, U+2003 ISOpub" }, 1608 { 8201, "thinsp","thin space, U+2009 ISOpub" }, 1609 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 
2070" }, 1610 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" }, 1611 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" }, 1612 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" }, 1613 { 8211, "ndash","en dash, U+2013 ISOpub" }, 1614 { 8212, "mdash","em dash, U+2014 ISOpub" }, 1615 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" }, 1616 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" }, 1617 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" }, 1618 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" }, 1619 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" }, 1620 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" }, 1621 { 8224, "dagger","dagger, U+2020 ISOpub" }, 1622 { 8225, "Dagger","double dagger, U+2021 ISOpub" }, 1623 1624 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" }, 1625 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" }, 1626 1627 { 8240, "permil","per mille sign, U+2030 ISOtech" }, 1628 1629 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" }, 1630 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" }, 1631 1632 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" }, 1633 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" }, 1634 1635 { 8254, "oline","overline = spacing overscore, U+203E NEW" }, 1636 { 8260, "frasl","fraction slash, U+2044 NEW" }, 1637 1638 { 8364, "euro", "euro sign, U+20AC NEW" }, 1639 1640 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" }, 1641 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" }, 1642 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" }, 1643 { 8482, "trade","trade mark sign, U+2122 ISOnum" }, 1644 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" }, 1645 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" }, 1646 { 
8593, "uarr", "upwards arrow, U+2191 ISOnum" }, 1647 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" }, 1648 { 8595, "darr", "downwards arrow, U+2193 ISOnum" }, 1649 { 8596, "harr", "left right arrow, U+2194 ISOamsa" }, 1650 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" }, 1651 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" }, 1652 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" }, 1653 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" }, 1654 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" }, 1655 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" }, 1656 1657 { 8704, "forall","for all, U+2200 ISOtech" }, 1658 { 8706, "part", "partial differential, U+2202 ISOtech" }, 1659 { 8707, "exist","there exists, U+2203 ISOtech" }, 1660 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" }, 1661 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" }, 1662 { 8712, "isin", "element of, U+2208 ISOtech" }, 1663 { 8713, "notin","not an element of, U+2209 ISOtech" }, 1664 { 8715, "ni", "contains as member, U+220B ISOtech" }, 1665 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" }, 1666 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" }, 1667 { 8722, "minus","minus sign, U+2212 ISOtech" }, 1668 { 8727, "lowast","asterisk operator, U+2217 ISOtech" }, 1669 { 8730, "radic","square root = radical sign, U+221A ISOtech" }, 1670 { 8733, "prop", "proportional to, U+221D ISOtech" }, 1671 { 8734, "infin","infinity, U+221E ISOtech" }, 1672 { 8736, "ang", "angle, U+2220 ISOamso" }, 1673 { 8743, "and", "logical and = wedge, U+2227 ISOtech" }, 1674 { 8744, "or", "logical or = vee, U+2228 ISOtech" }, 1675 { 8745, "cap", "intersection = cap, U+2229 ISOtech" }, 1676 { 8746, "cup", "union = cup, U+222A ISOtech" }, 1677 { 8747, "int", "integral, U+222B ISOtech" }, 1678 { 8756, "there4","therefore, U+2234 ISOtech" }, 1679 { 8764, "sim", "tilde operator = varies with = similar to, 
U+223C ISOtech" }, 1680 { 8773, "cong", "approximately equal to, U+2245 ISOtech" }, 1681 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" }, 1682 { 8800, "ne", "not equal to, U+2260 ISOtech" }, 1683 { 8801, "equiv","identical to, U+2261 ISOtech" }, 1684 { 8804, "le", "less-than or equal to, U+2264 ISOtech" }, 1685 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" }, 1686 { 8834, "sub", "subset of, U+2282 ISOtech" }, 1687 { 8835, "sup", "superset of, U+2283 ISOtech" }, 1688 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" }, 1689 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" }, 1690 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" }, 1691 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" }, 1692 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" }, 1693 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" }, 1694 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" }, 1695 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" }, 1696 { 8969, "rceil","right ceiling, U+2309 ISOamsc" }, 1697 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" }, 1698 { 8971, "rfloor","right floor, U+230B ISOamsc" }, 1699 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" }, 1700 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" }, 1701 { 9674, "loz", "lozenge, U+25CA ISOpub" }, 1702 1703 { 9824, "spades","black spade suit, U+2660 ISOpub" }, 1704 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" }, 1705 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" }, 1706 { 9830, "diams","black diamond suit, U+2666 ISOpub" }, 1707 1708 }; 1709 1710 /************************************************************************ 1711 * * 1712 * Commodity functions to handle entities * 1713 * * 1714 ************************************************************************/ 1715 1716 /* 1717 * Macro used to grow the current buffer. 
1718 */ 1719 #define growBuffer(buffer) { \ 1720 xmlChar *tmp; \ 1721 buffer##_size *= 2; \ 1722 tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \ 1723 if (tmp == NULL) { \ 1724 htmlErrMemory(ctxt, "growing buffer\n"); \ 1725 xmlFree(buffer); \ 1726 return(NULL); \ 1727 } \ 1728 buffer = tmp; \ 1729 } 1730 1731 /** 1732 * htmlEntityLookup: 1733 * @name: the entity name 1734 * 1735 * Lookup the given entity in EntitiesTable 1736 * 1737 * TODO: the linear scan is really ugly, an hash table is really needed. 1738 * 1739 * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 1740 */ 1741 const htmlEntityDesc * 1742 htmlEntityLookup(const xmlChar *name) { 1743 unsigned int i; 1744 1745 for (i = 0;i < (sizeof(html40EntitiesTable)/ 1746 sizeof(html40EntitiesTable[0]));i++) { 1747 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) { 1748 return((htmlEntityDescPtr) &html40EntitiesTable[i]); 1749 } 1750 } 1751 return(NULL); 1752 } 1753 1754 /** 1755 * htmlEntityValueLookup: 1756 * @value: the entity's unicode value 1757 * 1758 * Lookup the given entity in EntitiesTable 1759 * 1760 * TODO: the linear scan is really ugly, an hash table is really needed. 1761 * 1762 * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 
1763 */ 1764 const htmlEntityDesc * 1765 htmlEntityValueLookup(unsigned int value) { 1766 unsigned int i; 1767 1768 for (i = 0;i < (sizeof(html40EntitiesTable)/ 1769 sizeof(html40EntitiesTable[0]));i++) { 1770 if (html40EntitiesTable[i].value >= value) { 1771 if (html40EntitiesTable[i].value > value) 1772 break; 1773 return((htmlEntityDescPtr) &html40EntitiesTable[i]); 1774 } 1775 } 1776 return(NULL); 1777 } 1778 1779 /** 1780 * UTF8ToHtml: 1781 * @out: a pointer to an array of bytes to store the result 1782 * @outlen: the length of @out 1783 * @in: a pointer to an array of UTF-8 chars 1784 * @inlen: the length of @in 1785 * 1786 * Take a block of UTF-8 chars in and try to convert it to an ASCII 1787 * plus HTML entities block of chars out. 1788 * 1789 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 1790 * The value of @inlen after return is the number of octets consumed 1791 * as the return value is positive, else unpredictable. 1792 * The value of @outlen after return is the number of octets consumed. 
1793 */ 1794 int 1795 UTF8ToHtml(unsigned char* out, int *outlen, 1796 const unsigned char* in, int *inlen) { 1797 const unsigned char* processed = in; 1798 const unsigned char* outend; 1799 const unsigned char* outstart = out; 1800 const unsigned char* instart = in; 1801 const unsigned char* inend; 1802 unsigned int c, d; 1803 int trailing; 1804 1805 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1); 1806 if (in == NULL) { 1807 /* 1808 * initialization nothing to do 1809 */ 1810 *outlen = 0; 1811 *inlen = 0; 1812 return(0); 1813 } 1814 inend = in + (*inlen); 1815 outend = out + (*outlen); 1816 while (in < inend) { 1817 d = *in++; 1818 if (d < 0x80) { c= d; trailing= 0; } 1819 else if (d < 0xC0) { 1820 /* trailing byte in leading position */ 1821 *outlen = out - outstart; 1822 *inlen = processed - instart; 1823 return(-2); 1824 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 1825 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 1826 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 1827 else { 1828 /* no chance for this in Ascii */ 1829 *outlen = out - outstart; 1830 *inlen = processed - instart; 1831 return(-2); 1832 } 1833 1834 if (inend - in < trailing) { 1835 break; 1836 } 1837 1838 for ( ; trailing; trailing--) { 1839 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) 1840 break; 1841 c <<= 6; 1842 c |= d & 0x3F; 1843 } 1844 1845 /* assertion: c is a single UTF-4 value */ 1846 if (c < 0x80) { 1847 if (out + 1 >= outend) 1848 break; 1849 *out++ = c; 1850 } else { 1851 int len; 1852 const htmlEntityDesc * ent; 1853 const char *cp; 1854 char nbuf[16]; 1855 1856 /* 1857 * Try to lookup a predefined HTML entity for it 1858 */ 1859 1860 ent = htmlEntityValueLookup(c); 1861 if (ent == NULL) { 1862 snprintf(nbuf, sizeof(nbuf), "#%u", c); 1863 cp = nbuf; 1864 } 1865 else 1866 cp = ent->name; 1867 len = strlen(cp); 1868 if (out + 2 + len >= outend) 1869 break; 1870 *out++ = '&'; 1871 memcpy(out, cp, len); 1872 out += len; 1873 *out++ = ';'; 1874 } 
1875 processed = in; 1876 } 1877 *outlen = out - outstart; 1878 *inlen = processed - instart; 1879 return(0); 1880 } 1881 1882 /** 1883 * htmlEncodeEntities: 1884 * @out: a pointer to an array of bytes to store the result 1885 * @outlen: the length of @out 1886 * @in: a pointer to an array of UTF-8 chars 1887 * @inlen: the length of @in 1888 * @quoteChar: the quote character to escape (' or ") or zero. 1889 * 1890 * Take a block of UTF-8 chars in and try to convert it to an ASCII 1891 * plus HTML entities block of chars out. 1892 * 1893 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 1894 * The value of @inlen after return is the number of octets consumed 1895 * as the return value is positive, else unpredictable. 1896 * The value of @outlen after return is the number of octets consumed. 1897 */ 1898 int 1899 htmlEncodeEntities(unsigned char* out, int *outlen, 1900 const unsigned char* in, int *inlen, int quoteChar) { 1901 const unsigned char* processed = in; 1902 const unsigned char* outend; 1903 const unsigned char* outstart = out; 1904 const unsigned char* instart = in; 1905 const unsigned char* inend; 1906 unsigned int c, d; 1907 int trailing; 1908 1909 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) 1910 return(-1); 1911 outend = out + (*outlen); 1912 inend = in + (*inlen); 1913 while (in < inend) { 1914 d = *in++; 1915 if (d < 0x80) { c= d; trailing= 0; } 1916 else if (d < 0xC0) { 1917 /* trailing byte in leading position */ 1918 *outlen = out - outstart; 1919 *inlen = processed - instart; 1920 return(-2); 1921 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 1922 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 1923 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 1924 else { 1925 /* no chance for this in Ascii */ 1926 *outlen = out - outstart; 1927 *inlen = processed - instart; 1928 return(-2); 1929 } 1930 1931 if (inend - in < trailing) 1932 break; 1933 1934 while (trailing--) { 1935 if (((d= *in++) & 0xC0) 
!= 0x80) { 1936 *outlen = out - outstart; 1937 *inlen = processed - instart; 1938 return(-2); 1939 } 1940 c <<= 6; 1941 c |= d & 0x3F; 1942 } 1943 1944 /* assertion: c is a single UTF-4 value */ 1945 if ((c < 0x80) && (c != (unsigned int) quoteChar) && 1946 (c != '&') && (c != '<') && (c != '>')) { 1947 if (out >= outend) 1948 break; 1949 *out++ = c; 1950 } else { 1951 const htmlEntityDesc * ent; 1952 const char *cp; 1953 char nbuf[16]; 1954 int len; 1955 1956 /* 1957 * Try to lookup a predefined HTML entity for it 1958 */ 1959 ent = htmlEntityValueLookup(c); 1960 if (ent == NULL) { 1961 snprintf(nbuf, sizeof(nbuf), "#%u", c); 1962 cp = nbuf; 1963 } 1964 else 1965 cp = ent->name; 1966 len = strlen(cp); 1967 if (out + 2 + len > outend) 1968 break; 1969 *out++ = '&'; 1970 memcpy(out, cp, len); 1971 out += len; 1972 *out++ = ';'; 1973 } 1974 processed = in; 1975 } 1976 *outlen = out - outstart; 1977 *inlen = processed - instart; 1978 return(0); 1979 } 1980 1981 /************************************************************************ 1982 * * 1983 * Commodity functions to handle streams * 1984 * * 1985 ************************************************************************/ 1986 1987 /** 1988 * htmlNewInputStream: 1989 * @ctxt: an HTML parser context 1990 * 1991 * Create a new input stream structure 1992 * Returns the new input stream or NULL 1993 */ 1994 static htmlParserInputPtr 1995 htmlNewInputStream(htmlParserCtxtPtr ctxt) { 1996 htmlParserInputPtr input; 1997 1998 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput)); 1999 if (input == NULL) { 2000 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n"); 2001 return(NULL); 2002 } 2003 memset(input, 0, sizeof(htmlParserInput)); 2004 input->filename = NULL; 2005 input->directory = NULL; 2006 input->base = NULL; 2007 input->cur = NULL; 2008 input->buf = NULL; 2009 input->line = 1; 2010 input->col = 1; 2011 input->buf = NULL; 2012 input->free = NULL; 2013 input->version = NULL; 2014 input->consumed = 
0; 2015 input->length = 0; 2016 return(input); 2017 } 2018 2019 2020 /************************************************************************ 2021 * * 2022 * Commodity functions, cleanup needed ? * 2023 * * 2024 ************************************************************************/ 2025 /* 2026 * all tags allowing pc data from the html 4.01 loose dtd 2027 * NOTE: it might be more apropriate to integrate this information 2028 * into the html40ElementTable array but I don't want to risk any 2029 * binary incomptibility 2030 */ 2031 static const char *allowPCData[] = { 2032 "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big", 2033 "blockquote", "body", "button", "caption", "center", "cite", "code", 2034 "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2", 2035 "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend", 2036 "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp", 2037 "small", "span", "strike", "strong", "td", "th", "tt", "u", "var" 2038 }; 2039 2040 /** 2041 * areBlanks: 2042 * @ctxt: an HTML parser context 2043 * @str: a xmlChar * 2044 * @len: the size of @str 2045 * 2046 * Is this a sequence of blank chars that one can ignore ? 2047 * 2048 * Returns 1 if ignorable 0 otherwise. 
2049 */ 2050 2051 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) { 2052 unsigned int i; 2053 int j; 2054 xmlNodePtr lastChild; 2055 xmlDtdPtr dtd; 2056 2057 for (j = 0;j < len;j++) 2058 if (!(IS_BLANK_CH(str[j]))) return(0); 2059 2060 if (CUR == 0) return(1); 2061 if (CUR != '<') return(0); 2062 if (ctxt->name == NULL) 2063 return(1); 2064 if (xmlStrEqual(ctxt->name, BAD_CAST"html")) 2065 return(1); 2066 if (xmlStrEqual(ctxt->name, BAD_CAST"head")) 2067 return(1); 2068 2069 /* Only strip CDATA children of the body tag for strict HTML DTDs */ 2070 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) { 2071 dtd = xmlGetIntSubset(ctxt->myDoc); 2072 if (dtd != NULL && dtd->ExternalID != NULL) { 2073 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") || 2074 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN")) 2075 return(1); 2076 } 2077 } 2078 2079 if (ctxt->node == NULL) return(0); 2080 lastChild = xmlGetLastChild(ctxt->node); 2081 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE)) 2082 lastChild = lastChild->prev; 2083 if (lastChild == NULL) { 2084 if ((ctxt->node->type != XML_ELEMENT_NODE) && 2085 (ctxt->node->content != NULL)) return(0); 2086 /* keep ws in constructs like ...<b> </b>... 
2087 for all tags "b" allowing PCDATA */ 2088 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2089 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) { 2090 return(0); 2091 } 2092 } 2093 } else if (xmlNodeIsText(lastChild)) { 2094 return(0); 2095 } else { 2096 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p> 2097 for all tags "p" allowing PCDATA */ 2098 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2099 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) { 2100 return(0); 2101 } 2102 } 2103 } 2104 return(1); 2105 } 2106 2107 /** 2108 * htmlNewDocNoDtD: 2109 * @URI: URI for the dtd, or NULL 2110 * @ExternalID: the external ID of the DTD, or NULL 2111 * 2112 * Creates a new HTML document without a DTD node if @URI and @ExternalID 2113 * are NULL 2114 * 2115 * Returns a new document, do not initialize the DTD if not provided 2116 */ 2117 htmlDocPtr 2118 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) { 2119 xmlDocPtr cur; 2120 2121 /* 2122 * Allocate a new document and fill the fields. 
2123 */ 2124 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc)); 2125 if (cur == NULL) { 2126 htmlErrMemory(NULL, "HTML document creation failed\n"); 2127 return(NULL); 2128 } 2129 memset(cur, 0, sizeof(xmlDoc)); 2130 2131 cur->type = XML_HTML_DOCUMENT_NODE; 2132 cur->version = NULL; 2133 cur->intSubset = NULL; 2134 cur->doc = cur; 2135 cur->name = NULL; 2136 cur->children = NULL; 2137 cur->extSubset = NULL; 2138 cur->oldNs = NULL; 2139 cur->encoding = NULL; 2140 cur->standalone = 1; 2141 cur->compression = 0; 2142 cur->ids = NULL; 2143 cur->refs = NULL; 2144 cur->_private = NULL; 2145 cur->charset = XML_CHAR_ENCODING_UTF8; 2146 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT; 2147 if ((ExternalID != NULL) || 2148 (URI != NULL)) 2149 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI); 2150 return(cur); 2151 } 2152 2153 /** 2154 * htmlNewDoc: 2155 * @URI: URI for the dtd, or NULL 2156 * @ExternalID: the external ID of the DTD, or NULL 2157 * 2158 * Creates a new HTML document 2159 * 2160 * Returns a new document 2161 */ 2162 htmlDocPtr 2163 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { 2164 if ((URI == NULL) && (ExternalID == NULL)) 2165 return(htmlNewDocNoDtD( 2166 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd", 2167 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN")); 2168 2169 return(htmlNewDocNoDtD(URI, ExternalID)); 2170 } 2171 2172 2173 /************************************************************************ 2174 * * 2175 * The parser itself * 2176 * Relates to http://www.w3.org/TR/html40 * 2177 * * 2178 ************************************************************************/ 2179 2180 /************************************************************************ 2181 * * 2182 * The parser itself * 2183 * * 2184 ************************************************************************/ 2185 2186 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt); 2187 2188 /** 2189 * htmlParseHTMLName: 2190 * @ctxt: an HTML parser context 2191 
* 2192 * parse an HTML tag or attribute name, note that we convert it to lowercase 2193 * since HTML names are not case-sensitive. 2194 * 2195 * Returns the Tag Name parsed or NULL 2196 */ 2197 2198 static const xmlChar * 2199 htmlParseHTMLName(htmlParserCtxtPtr ctxt) { 2200 int i = 0; 2201 xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2202 2203 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') && 2204 (CUR != ':')) return(NULL); 2205 2206 while ((i < HTML_PARSER_BUFFER_SIZE) && 2207 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) || 2208 (CUR == ':') || (CUR == '-') || (CUR == '_'))) { 2209 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20; 2210 else loc[i] = CUR; 2211 i++; 2212 2213 NEXT; 2214 } 2215 2216 return(xmlDictLookup(ctxt->dict, loc, i)); 2217 } 2218 2219 2220 /** 2221 * htmlParseHTMLName_nonInvasive: 2222 * @ctxt: an HTML parser context 2223 * 2224 * parse an HTML tag or attribute name, note that we convert it to lowercase 2225 * since HTML names are not case-sensitive, this doesn't consume the data 2226 * from the stream, it's a look-ahead 2227 * 2228 * Returns the Tag Name parsed or NULL 2229 */ 2230 2231 static const xmlChar * 2232 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) { 2233 int i = 0; 2234 xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2235 2236 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') && 2237 (NXT(1) != ':')) return(NULL); 2238 2239 while ((i < HTML_PARSER_BUFFER_SIZE) && 2240 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) || 2241 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) { 2242 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20; 2243 else loc[i] = NXT(1+i); 2244 i++; 2245 } 2246 2247 return(xmlDictLookup(ctxt->dict, loc, i)); 2248 } 2249 2250 2251 /** 2252 * htmlParseName: 2253 * @ctxt: an HTML parser context 2254 * 2255 * parse an HTML name, this routine is case sensitive. 
2256 * 2257 * Returns the Name parsed or NULL 2258 */ 2259 2260 static const xmlChar * 2261 htmlParseName(htmlParserCtxtPtr ctxt) { 2262 const xmlChar *in; 2263 const xmlChar *ret; 2264 int count = 0; 2265 2266 GROW; 2267 2268 /* 2269 * Accelerator for simple ASCII names 2270 */ 2271 in = ctxt->input->cur; 2272 if (((*in >= 0x61) && (*in <= 0x7A)) || 2273 ((*in >= 0x41) && (*in <= 0x5A)) || 2274 (*in == '_') || (*in == ':')) { 2275 in++; 2276 while (((*in >= 0x61) && (*in <= 0x7A)) || 2277 ((*in >= 0x41) && (*in <= 0x5A)) || 2278 ((*in >= 0x30) && (*in <= 0x39)) || 2279 (*in == '_') || (*in == '-') || 2280 (*in == ':') || (*in == '.')) 2281 in++; 2282 if ((*in > 0) && (*in < 0x80)) { 2283 count = in - ctxt->input->cur; 2284 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count); 2285 ctxt->input->cur = in; 2286 ctxt->nbChars += count; 2287 ctxt->input->col += count; 2288 return(ret); 2289 } 2290 } 2291 return(htmlParseNameComplex(ctxt)); 2292 } 2293 2294 static const xmlChar * 2295 htmlParseNameComplex(xmlParserCtxtPtr ctxt) { 2296 int len = 0, l; 2297 int c; 2298 int count = 0; 2299 2300 /* 2301 * Handler for more complex cases 2302 */ 2303 GROW; 2304 c = CUR_CHAR(l); 2305 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */ 2306 (!IS_LETTER(c) && (c != '_') && 2307 (c != ':'))) { 2308 return(NULL); 2309 } 2310 2311 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */ 2312 ((IS_LETTER(c)) || (IS_DIGIT(c)) || 2313 (c == '.') || (c == '-') || 2314 (c == '_') || (c == ':') || 2315 (IS_COMBINING(c)) || 2316 (IS_EXTENDER(c)))) { 2317 if (count++ > 100) { 2318 count = 0; 2319 GROW; 2320 } 2321 len += l; 2322 NEXTL(l); 2323 c = CUR_CHAR(l); 2324 } 2325 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len)); 2326 } 2327 2328 2329 /** 2330 * htmlParseHTMLAttribute: 2331 * @ctxt: an HTML parser context 2332 * @stop: a char stop value 2333 * 2334 * parse an HTML attribute value till the stop (quote), if 2335 * stop is 0 then it stops 
at the first space 2336 * 2337 * Returns the attribute parsed or NULL 2338 */ 2339 2340 static xmlChar * 2341 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) { 2342 xmlChar *buffer = NULL; 2343 int buffer_size = 0; 2344 xmlChar *out = NULL; 2345 const xmlChar *name = NULL; 2346 const xmlChar *cur = NULL; 2347 const htmlEntityDesc * ent; 2348 2349 /* 2350 * allocate a translation buffer. 2351 */ 2352 buffer_size = HTML_PARSER_BUFFER_SIZE; 2353 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar)); 2354 if (buffer == NULL) { 2355 htmlErrMemory(ctxt, "buffer allocation failed\n"); 2356 return(NULL); 2357 } 2358 out = buffer; 2359 2360 /* 2361 * Ok loop until we reach one of the ending chars 2362 */ 2363 while ((CUR != 0) && (CUR != stop)) { 2364 if ((stop == 0) && (CUR == '>')) break; 2365 if ((stop == 0) && (IS_BLANK_CH(CUR))) break; 2366 if (CUR == '&') { 2367 if (NXT(1) == '#') { 2368 unsigned int c; 2369 int bits; 2370 2371 c = htmlParseCharRef(ctxt); 2372 if (c < 0x80) 2373 { *out++ = c; bits= -6; } 2374 else if (c < 0x800) 2375 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2376 else if (c < 0x10000) 2377 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2378 else 2379 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2380 2381 for ( ; bits >= 0; bits-= 6) { 2382 *out++ = ((c >> bits) & 0x3F) | 0x80; 2383 } 2384 2385 if (out - buffer > buffer_size - 100) { 2386 int indx = out - buffer; 2387 2388 growBuffer(buffer); 2389 out = &buffer[indx]; 2390 } 2391 } else { 2392 ent = htmlParseEntityRef(ctxt, &name); 2393 if (name == NULL) { 2394 *out++ = '&'; 2395 if (out - buffer > buffer_size - 100) { 2396 int indx = out - buffer; 2397 2398 growBuffer(buffer); 2399 out = &buffer[indx]; 2400 } 2401 } else if (ent == NULL) { 2402 *out++ = '&'; 2403 cur = name; 2404 while (*cur != 0) { 2405 if (out - buffer > buffer_size - 100) { 2406 int indx = out - buffer; 2407 2408 growBuffer(buffer); 2409 out = &buffer[indx]; 2410 } 2411 *out++ = *cur++; 
2412 } 2413 } else { 2414 unsigned int c; 2415 int bits; 2416 2417 if (out - buffer > buffer_size - 100) { 2418 int indx = out - buffer; 2419 2420 growBuffer(buffer); 2421 out = &buffer[indx]; 2422 } 2423 c = ent->value; 2424 if (c < 0x80) 2425 { *out++ = c; bits= -6; } 2426 else if (c < 0x800) 2427 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2428 else if (c < 0x10000) 2429 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2430 else 2431 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2432 2433 for ( ; bits >= 0; bits-= 6) { 2434 *out++ = ((c >> bits) & 0x3F) | 0x80; 2435 } 2436 } 2437 } 2438 } else { 2439 unsigned int c; 2440 int bits, l; 2441 2442 if (out - buffer > buffer_size - 100) { 2443 int indx = out - buffer; 2444 2445 growBuffer(buffer); 2446 out = &buffer[indx]; 2447 } 2448 c = CUR_CHAR(l); 2449 if (c < 0x80) 2450 { *out++ = c; bits= -6; } 2451 else if (c < 0x800) 2452 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2453 else if (c < 0x10000) 2454 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2455 else 2456 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2457 2458 for ( ; bits >= 0; bits-= 6) { 2459 *out++ = ((c >> bits) & 0x3F) | 0x80; 2460 } 2461 NEXT; 2462 } 2463 } 2464 *out++ = 0; 2465 return(buffer); 2466 } 2467 2468 /** 2469 * htmlParseEntityRef: 2470 * @ctxt: an HTML parser context 2471 * @str: location to store the entity name 2472 * 2473 * parse an HTML ENTITY references 2474 * 2475 * [68] EntityRef ::= '&' Name ';' 2476 * 2477 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise, 2478 * if non-NULL *str will have to be freed by the caller. 
2479 */ 2480 const htmlEntityDesc * 2481 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) { 2482 const xmlChar *name; 2483 const htmlEntityDesc * ent = NULL; 2484 2485 if (str != NULL) *str = NULL; 2486 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL); 2487 2488 if (CUR == '&') { 2489 NEXT; 2490 name = htmlParseName(ctxt); 2491 if (name == NULL) { 2492 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 2493 "htmlParseEntityRef: no name\n", NULL, NULL); 2494 } else { 2495 GROW; 2496 if (CUR == ';') { 2497 if (str != NULL) 2498 *str = name; 2499 2500 /* 2501 * Lookup the entity in the table. 2502 */ 2503 ent = htmlEntityLookup(name); 2504 if (ent != NULL) /* OK that's ugly !!! */ 2505 NEXT; 2506 } else { 2507 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING, 2508 "htmlParseEntityRef: expecting ';'\n", 2509 NULL, NULL); 2510 if (str != NULL) 2511 *str = name; 2512 } 2513 } 2514 } 2515 return(ent); 2516 } 2517 2518 /** 2519 * htmlParseAttValue: 2520 * @ctxt: an HTML parser context 2521 * 2522 * parse a value for an attribute 2523 * Note: the parser won't do substitution of entities here, this 2524 * will be handled later in xmlStringGetNodeList, unless it was 2525 * asked for ctxt->replaceEntities != 0 2526 * 2527 * Returns the AttValue parsed or NULL. 
2528 */ 2529 2530 static xmlChar * 2531 htmlParseAttValue(htmlParserCtxtPtr ctxt) { 2532 xmlChar *ret = NULL; 2533 2534 if (CUR == '"') { 2535 NEXT; 2536 ret = htmlParseHTMLAttribute(ctxt, '"'); 2537 if (CUR != '"') { 2538 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2539 "AttValue: \" expected\n", NULL, NULL); 2540 } else 2541 NEXT; 2542 } else if (CUR == '\'') { 2543 NEXT; 2544 ret = htmlParseHTMLAttribute(ctxt, '\''); 2545 if (CUR != '\'') { 2546 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2547 "AttValue: ' expected\n", NULL, NULL); 2548 } else 2549 NEXT; 2550 } else { 2551 /* 2552 * That's an HTMLism, the attribute value may not be quoted 2553 */ 2554 ret = htmlParseHTMLAttribute(ctxt, 0); 2555 if (ret == NULL) { 2556 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE, 2557 "AttValue: no value found\n", NULL, NULL); 2558 } 2559 } 2560 return(ret); 2561 } 2562 2563 /** 2564 * htmlParseSystemLiteral: 2565 * @ctxt: an HTML parser context 2566 * 2567 * parse an HTML Literal 2568 * 2569 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") 2570 * 2571 * Returns the SystemLiteral parsed or NULL 2572 */ 2573 2574 static xmlChar * 2575 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { 2576 const xmlChar *q; 2577 xmlChar *ret = NULL; 2578 2579 if (CUR == '"') { 2580 NEXT; 2581 q = CUR_PTR; 2582 while ((IS_CHAR_CH(CUR)) && (CUR != '"')) 2583 NEXT; 2584 if (!IS_CHAR_CH(CUR)) { 2585 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2586 "Unfinished SystemLiteral\n", NULL, NULL); 2587 } else { 2588 ret = xmlStrndup(q, CUR_PTR - q); 2589 NEXT; 2590 } 2591 } else if (CUR == '\'') { 2592 NEXT; 2593 q = CUR_PTR; 2594 while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) 2595 NEXT; 2596 if (!IS_CHAR_CH(CUR)) { 2597 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2598 "Unfinished SystemLiteral\n", NULL, NULL); 2599 } else { 2600 ret = xmlStrndup(q, CUR_PTR - q); 2601 NEXT; 2602 } 2603 } else { 2604 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2605 " or ' expected\n", 
NULL, NULL); 2606 } 2607 2608 return(ret); 2609 } 2610 2611 /** 2612 * htmlParsePubidLiteral: 2613 * @ctxt: an HTML parser context 2614 * 2615 * parse an HTML public literal 2616 * 2617 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" 2618 * 2619 * Returns the PubidLiteral parsed or NULL. 2620 */ 2621 2622 static xmlChar * 2623 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { 2624 const xmlChar *q; 2625 xmlChar *ret = NULL; 2626 /* 2627 * Name ::= (Letter | '_') (NameChar)* 2628 */ 2629 if (CUR == '"') { 2630 NEXT; 2631 q = CUR_PTR; 2632 while (IS_PUBIDCHAR_CH(CUR)) NEXT; 2633 if (CUR != '"') { 2634 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2635 "Unfinished PubidLiteral\n", NULL, NULL); 2636 } else { 2637 ret = xmlStrndup(q, CUR_PTR - q); 2638 NEXT; 2639 } 2640 } else if (CUR == '\'') { 2641 NEXT; 2642 q = CUR_PTR; 2643 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')) 2644 NEXT; 2645 if (CUR != '\'') { 2646 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2647 "Unfinished PubidLiteral\n", NULL, NULL); 2648 } else { 2649 ret = xmlStrndup(q, CUR_PTR - q); 2650 NEXT; 2651 } 2652 } else { 2653 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2654 "PubidLiteral \" or ' expected\n", NULL, NULL); 2655 } 2656 2657 return(ret); 2658 } 2659 2660 /** 2661 * htmlParseScript: 2662 * @ctxt: an HTML parser context 2663 * 2664 * parse the content of an HTML SCRIPT or STYLE element 2665 * http://www.w3.org/TR/html4/sgml/dtd.html#Script 2666 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet 2667 * http://www.w3.org/TR/html4/types.html#type-script 2668 * http://www.w3.org/TR/html4/types.html#h-6.15 2669 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1 2670 * 2671 * Script data ( %Script; in the DTD) can be the content of the SCRIPT 2672 * element and the value of intrinsic event attributes. User agents must 2673 * not evaluate script data as HTML markup but instead must pass it on as 2674 * data to a script engine. 
2675 * NOTES: 2676 * - The content is passed like CDATA 2677 * - the attributes for style and scripting "onXXX" are also described 2678 * as CDATA but SGML allows entities references in attributes so their 2679 * processing is identical as other attributes 2680 */ 2681 static void 2682 htmlParseScript(htmlParserCtxtPtr ctxt) { 2683 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 2684 int nbchar = 0; 2685 int cur,l; 2686 2687 SHRINK; 2688 cur = CUR_CHAR(l); 2689 while (IS_CHAR_CH(cur)) { 2690 if ((cur == '<') && (NXT(1) == '/')) { 2691 /* 2692 * One should break here, the specification is clear: 2693 * Authors should therefore escape "</" within the content. 2694 * Escape mechanisms are specific to each scripting or 2695 * style sheet language. 2696 * 2697 * In recovery mode, only break if end tag match the 2698 * current tag, effectively ignoring all tags inside the 2699 * script/style block and treating the entire block as 2700 * CDATA. 2701 */ 2702 if (ctxt->recovery) { 2703 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2, 2704 xmlStrlen(ctxt->name)) == 0) 2705 { 2706 break; /* while */ 2707 } else { 2708 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 2709 "Element %s embeds close tag\n", 2710 ctxt->name, NULL); 2711 } 2712 } else { 2713 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) || 2714 ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) 2715 { 2716 break; /* while */ 2717 } 2718 } 2719 } 2720 COPY_BUF(l,buf,nbchar,cur); 2721 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 2722 if (ctxt->sax->cdataBlock!= NULL) { 2723 /* 2724 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2725 */ 2726 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2727 } else if (ctxt->sax->characters != NULL) { 2728 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2729 } 2730 nbchar = 0; 2731 } 2732 GROW; 2733 NEXTL(l); 2734 cur = CUR_CHAR(l); 2735 } 2736 2737 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) { 2738 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 2739 "Invalid 
char in CDATA 0x%X\n", cur); 2740 NEXT; 2741 } 2742 2743 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2744 if (ctxt->sax->cdataBlock!= NULL) { 2745 /* 2746 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2747 */ 2748 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2749 } else if (ctxt->sax->characters != NULL) { 2750 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2751 } 2752 } 2753 } 2754 2755 2756 /** 2757 * htmlParseCharData: 2758 * @ctxt: an HTML parser context 2759 * 2760 * parse a CharData section. 2761 * if we are within a CDATA section ']]>' marks an end of section. 2762 * 2763 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) 2764 */ 2765 2766 static void 2767 htmlParseCharData(htmlParserCtxtPtr ctxt) { 2768 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 2769 int nbchar = 0; 2770 int cur, l; 2771 int chunk = 0; 2772 2773 SHRINK; 2774 cur = CUR_CHAR(l); 2775 while (((cur != '<') || (ctxt->token == '<')) && 2776 ((cur != '&') || (ctxt->token == '&')) && 2777 (cur != 0)) { 2778 if (!(IS_CHAR(cur))) { 2779 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 2780 "Invalid char in CDATA 0x%X\n", cur); 2781 } else { 2782 COPY_BUF(l,buf,nbchar,cur); 2783 } 2784 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 2785 /* 2786 * Ok the segment is to be consumed as chars. 
2787 */ 2788 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2789 if (areBlanks(ctxt, buf, nbchar)) { 2790 if (ctxt->sax->ignorableWhitespace != NULL) 2791 ctxt->sax->ignorableWhitespace(ctxt->userData, 2792 buf, nbchar); 2793 } else { 2794 htmlCheckParagraph(ctxt); 2795 if (ctxt->sax->characters != NULL) 2796 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2797 } 2798 } 2799 nbchar = 0; 2800 } 2801 NEXTL(l); 2802 chunk++; 2803 if (chunk > HTML_PARSER_BUFFER_SIZE) { 2804 chunk = 0; 2805 SHRINK; 2806 GROW; 2807 } 2808 cur = CUR_CHAR(l); 2809 if (cur == 0) { 2810 SHRINK; 2811 GROW; 2812 cur = CUR_CHAR(l); 2813 } 2814 } 2815 if (nbchar != 0) { 2816 buf[nbchar] = 0; 2817 2818 /* 2819 * Ok the segment is to be consumed as chars. 2820 */ 2821 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2822 if (areBlanks(ctxt, buf, nbchar)) { 2823 if (ctxt->sax->ignorableWhitespace != NULL) 2824 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar); 2825 } else { 2826 htmlCheckParagraph(ctxt); 2827 if (ctxt->sax->characters != NULL) 2828 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2829 } 2830 } 2831 } else { 2832 /* 2833 * Loop detection 2834 */ 2835 if (cur == 0) 2836 ctxt->instate = XML_PARSER_EOF; 2837 } 2838 } 2839 2840 /** 2841 * htmlParseExternalID: 2842 * @ctxt: an HTML parser context 2843 * @publicID: a xmlChar** receiving PubidLiteral 2844 * 2845 * Parse an External ID or a Public ID 2846 * 2847 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral 2848 * | 'PUBLIC' S PubidLiteral S SystemLiteral 2849 * 2850 * [83] PublicID ::= 'PUBLIC' S PubidLiteral 2851 * 2852 * Returns the function returns SystemLiteral and in the second 2853 * case publicID receives PubidLiteral, is strict is off 2854 * it is possible to return NULL and have publicID set. 
2855 */ 2856 2857 static xmlChar * 2858 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) { 2859 xmlChar *URI = NULL; 2860 2861 if ((UPPER == 'S') && (UPP(1) == 'Y') && 2862 (UPP(2) == 'S') && (UPP(3) == 'T') && 2863 (UPP(4) == 'E') && (UPP(5) == 'M')) { 2864 SKIP(6); 2865 if (!IS_BLANK_CH(CUR)) { 2866 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 2867 "Space required after 'SYSTEM'\n", NULL, NULL); 2868 } 2869 SKIP_BLANKS; 2870 URI = htmlParseSystemLiteral(ctxt); 2871 if (URI == NULL) { 2872 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED, 2873 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL); 2874 } 2875 } else if ((UPPER == 'P') && (UPP(1) == 'U') && 2876 (UPP(2) == 'B') && (UPP(3) == 'L') && 2877 (UPP(4) == 'I') && (UPP(5) == 'C')) { 2878 SKIP(6); 2879 if (!IS_BLANK_CH(CUR)) { 2880 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 2881 "Space required after 'PUBLIC'\n", NULL, NULL); 2882 } 2883 SKIP_BLANKS; 2884 *publicID = htmlParsePubidLiteral(ctxt); 2885 if (*publicID == NULL) { 2886 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED, 2887 "htmlParseExternalID: PUBLIC, no Public Identifier\n", 2888 NULL, NULL); 2889 } 2890 SKIP_BLANKS; 2891 if ((CUR == '"') || (CUR == '\'')) { 2892 URI = htmlParseSystemLiteral(ctxt); 2893 } 2894 } 2895 return(URI); 2896 } 2897 2898 /** 2899 * xmlParsePI: 2900 * @ctxt: an XML parser context 2901 * 2902 * parse an XML Processing Instruction. 2903 * 2904 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' 2905 */ 2906 static void 2907 htmlParsePI(htmlParserCtxtPtr ctxt) { 2908 xmlChar *buf = NULL; 2909 int len = 0; 2910 int size = HTML_PARSER_BUFFER_SIZE; 2911 int cur, l; 2912 const xmlChar *target; 2913 xmlParserInputState state; 2914 int count = 0; 2915 2916 if ((RAW == '<') && (NXT(1) == '?')) { 2917 state = ctxt->instate; 2918 ctxt->instate = XML_PARSER_PI; 2919 /* 2920 * this is a Processing Instruction. 
2921 */ 2922 SKIP(2); 2923 SHRINK; 2924 2925 /* 2926 * Parse the target name and check for special support like 2927 * namespace. 2928 */ 2929 target = htmlParseName(ctxt); 2930 if (target != NULL) { 2931 if (RAW == '>') { 2932 SKIP(1); 2933 2934 /* 2935 * SAX: PI detected. 2936 */ 2937 if ((ctxt->sax) && (!ctxt->disableSAX) && 2938 (ctxt->sax->processingInstruction != NULL)) 2939 ctxt->sax->processingInstruction(ctxt->userData, 2940 target, NULL); 2941 ctxt->instate = state; 2942 return; 2943 } 2944 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 2945 if (buf == NULL) { 2946 htmlErrMemory(ctxt, NULL); 2947 ctxt->instate = state; 2948 return; 2949 } 2950 cur = CUR; 2951 if (!IS_BLANK(cur)) { 2952 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 2953 "ParsePI: PI %s space expected\n", target, NULL); 2954 } 2955 SKIP_BLANKS; 2956 cur = CUR_CHAR(l); 2957 while (IS_CHAR(cur) && (cur != '>')) { 2958 if (len + 5 >= size) { 2959 xmlChar *tmp; 2960 2961 size *= 2; 2962 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 2963 if (tmp == NULL) { 2964 htmlErrMemory(ctxt, NULL); 2965 xmlFree(buf); 2966 ctxt->instate = state; 2967 return; 2968 } 2969 buf = tmp; 2970 } 2971 count++; 2972 if (count > 50) { 2973 GROW; 2974 count = 0; 2975 } 2976 COPY_BUF(l,buf,len,cur); 2977 NEXTL(l); 2978 cur = CUR_CHAR(l); 2979 if (cur == 0) { 2980 SHRINK; 2981 GROW; 2982 cur = CUR_CHAR(l); 2983 } 2984 } 2985 buf[len] = 0; 2986 if (cur != '>') { 2987 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED, 2988 "ParsePI: PI %s never end ...\n", target, NULL); 2989 } else { 2990 SKIP(1); 2991 2992 /* 2993 * SAX: PI detected. 
2994 */ 2995 if ((ctxt->sax) && (!ctxt->disableSAX) && 2996 (ctxt->sax->processingInstruction != NULL)) 2997 ctxt->sax->processingInstruction(ctxt->userData, 2998 target, buf); 2999 } 3000 xmlFree(buf); 3001 } else { 3002 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED, 3003 "PI is not started correctly", NULL, NULL); 3004 } 3005 ctxt->instate = state; 3006 } 3007 } 3008 3009 /** 3010 * htmlParseComment: 3011 * @ctxt: an HTML parser context 3012 * 3013 * Parse an XML (SGML) comment <!-- .... --> 3014 * 3015 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' 3016 */ 3017 static void 3018 htmlParseComment(htmlParserCtxtPtr ctxt) { 3019 xmlChar *buf = NULL; 3020 int len; 3021 int size = HTML_PARSER_BUFFER_SIZE; 3022 int q, ql; 3023 int r, rl; 3024 int cur, l; 3025 xmlParserInputState state; 3026 3027 /* 3028 * Check that there is a comment right here. 3029 */ 3030 if ((RAW != '<') || (NXT(1) != '!') || 3031 (NXT(2) != '-') || (NXT(3) != '-')) return; 3032 3033 state = ctxt->instate; 3034 ctxt->instate = XML_PARSER_COMMENT; 3035 SHRINK; 3036 SKIP(4); 3037 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 3038 if (buf == NULL) { 3039 htmlErrMemory(ctxt, "buffer allocation failed\n"); 3040 ctxt->instate = state; 3041 return; 3042 } 3043 q = CUR_CHAR(ql); 3044 NEXTL(ql); 3045 r = CUR_CHAR(rl); 3046 NEXTL(rl); 3047 cur = CUR_CHAR(l); 3048 len = 0; 3049 while (IS_CHAR(cur) && 3050 ((cur != '>') || 3051 (r != '-') || (q != '-'))) { 3052 if (len + 5 >= size) { 3053 xmlChar *tmp; 3054 3055 size *= 2; 3056 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 3057 if (tmp == NULL) { 3058 xmlFree(buf); 3059 htmlErrMemory(ctxt, "growing buffer failed\n"); 3060 ctxt->instate = state; 3061 return; 3062 } 3063 buf = tmp; 3064 } 3065 COPY_BUF(ql,buf,len,q); 3066 q = r; 3067 ql = rl; 3068 r = cur; 3069 rl = l; 3070 NEXTL(l); 3071 cur = CUR_CHAR(l); 3072 if (cur == 0) { 3073 SHRINK; 3074 GROW; 3075 cur = CUR_CHAR(l); 3076 } 3077 } 3078 buf[len] = 0; 3079 if 
(!IS_CHAR(cur)) { 3080 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, 3081 "Comment not terminated \n<!--%.50s\n", buf, NULL); 3082 xmlFree(buf); 3083 } else { 3084 NEXT; 3085 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) && 3086 (!ctxt->disableSAX)) 3087 ctxt->sax->comment(ctxt->userData, buf); 3088 xmlFree(buf); 3089 } 3090 ctxt->instate = state; 3091 } 3092 3093 /** 3094 * htmlParseCharRef: 3095 * @ctxt: an HTML parser context 3096 * 3097 * parse Reference declarations 3098 * 3099 * [66] CharRef ::= '&#' [0-9]+ ';' | 3100 * '&#x' [0-9a-fA-F]+ ';' 3101 * 3102 * Returns the value parsed (as an int) 3103 */ 3104 int 3105 htmlParseCharRef(htmlParserCtxtPtr ctxt) { 3106 int val = 0; 3107 3108 if ((ctxt == NULL) || (ctxt->input == NULL)) { 3109 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3110 "htmlParseCharRef: context error\n", 3111 NULL, NULL); 3112 return(0); 3113 } 3114 if ((CUR == '&') && (NXT(1) == '#') && 3115 ((NXT(2) == 'x') || NXT(2) == 'X')) { 3116 SKIP(3); 3117 while (CUR != ';') { 3118 if ((CUR >= '0') && (CUR <= '9')) 3119 val = val * 16 + (CUR - '0'); 3120 else if ((CUR >= 'a') && (CUR <= 'f')) 3121 val = val * 16 + (CUR - 'a') + 10; 3122 else if ((CUR >= 'A') && (CUR <= 'F')) 3123 val = val * 16 + (CUR - 'A') + 10; 3124 else { 3125 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF, 3126 "htmlParseCharRef: missing semicolumn\n", 3127 NULL, NULL); 3128 break; 3129 } 3130 NEXT; 3131 } 3132 if (CUR == ';') 3133 NEXT; 3134 } else if ((CUR == '&') && (NXT(1) == '#')) { 3135 SKIP(2); 3136 while (CUR != ';') { 3137 if ((CUR >= '0') && (CUR <= '9')) 3138 val = val * 10 + (CUR - '0'); 3139 else { 3140 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF, 3141 "htmlParseCharRef: missing semicolumn\n", 3142 NULL, NULL); 3143 break; 3144 } 3145 NEXT; 3146 } 3147 if (CUR == ';') 3148 NEXT; 3149 } else { 3150 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF, 3151 "htmlParseCharRef: invalid value\n", NULL, NULL); 3152 } 3153 /* 3154 * Check the value IS_CHAR ... 
3155 */ 3156 if (IS_CHAR(val)) { 3157 return(val); 3158 } else { 3159 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 3160 "htmlParseCharRef: invalid xmlChar value %d\n", 3161 val); 3162 } 3163 return(0); 3164 } 3165 3166 3167 /** 3168 * htmlParseDocTypeDecl: 3169 * @ctxt: an HTML parser context 3170 * 3171 * parse a DOCTYPE declaration 3172 * 3173 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? 3174 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>' 3175 */ 3176 3177 static void 3178 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { 3179 const xmlChar *name; 3180 xmlChar *ExternalID = NULL; 3181 xmlChar *URI = NULL; 3182 3183 /* 3184 * We know that '<!DOCTYPE' has been detected. 3185 */ 3186 SKIP(9); 3187 3188 SKIP_BLANKS; 3189 3190 /* 3191 * Parse the DOCTYPE name. 3192 */ 3193 name = htmlParseName(ctxt); 3194 if (name == NULL) { 3195 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3196 "htmlParseDocTypeDecl : no DOCTYPE name !\n", 3197 NULL, NULL); 3198 } 3199 /* 3200 * Check that upper(name) == "HTML" !!!!!!!!!!!!! 3201 */ 3202 3203 SKIP_BLANKS; 3204 3205 /* 3206 * Check for SystemID and ExternalID 3207 */ 3208 URI = htmlParseExternalID(ctxt, &ExternalID); 3209 SKIP_BLANKS; 3210 3211 /* 3212 * We should be at the end of the DOCTYPE declaration. 3213 */ 3214 if (CUR != '>') { 3215 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED, 3216 "DOCTYPE improperly terminated\n", NULL, NULL); 3217 /* We shouldn't try to resynchronize ... 
*/ 3218 } 3219 NEXT; 3220 3221 /* 3222 * Create or update the document accordingly to the DOCTYPE 3223 */ 3224 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) && 3225 (!ctxt->disableSAX)) 3226 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI); 3227 3228 /* 3229 * Cleanup, since we don't use all those identifiers 3230 */ 3231 if (URI != NULL) xmlFree(URI); 3232 if (ExternalID != NULL) xmlFree(ExternalID); 3233 } 3234 3235 /** 3236 * htmlParseAttribute: 3237 * @ctxt: an HTML parser context 3238 * @value: a xmlChar ** used to store the value of the attribute 3239 * 3240 * parse an attribute 3241 * 3242 * [41] Attribute ::= Name Eq AttValue 3243 * 3244 * [25] Eq ::= S? '=' S? 3245 * 3246 * With namespace: 3247 * 3248 * [NS 11] Attribute ::= QName Eq AttValue 3249 * 3250 * Also the case QName == xmlns:??? is handled independently as a namespace 3251 * definition. 3252 * 3253 * Returns the attribute name, and the value in *value. 3254 */ 3255 3256 static const xmlChar * 3257 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { 3258 const xmlChar *name; 3259 xmlChar *val = NULL; 3260 3261 *value = NULL; 3262 name = htmlParseHTMLName(ctxt); 3263 if (name == NULL) { 3264 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3265 "error parsing attribute name\n", NULL, NULL); 3266 return(NULL); 3267 } 3268 3269 /* 3270 * read the value 3271 */ 3272 SKIP_BLANKS; 3273 if (CUR == '=') { 3274 NEXT; 3275 SKIP_BLANKS; 3276 val = htmlParseAttValue(ctxt); 3277 } else if (htmlIsBooleanAttr(name)) { 3278 /* 3279 * assume a minimized attribute 3280 */ 3281 val = xmlStrdup(name); 3282 } 3283 3284 *value = val; 3285 return(name); 3286 } 3287 3288 /** 3289 * htmlCheckEncoding: 3290 * @ctxt: an HTML parser context 3291 * @attvalue: the attribute value 3292 * 3293 * Checks an http-equiv attribute from a Meta tag to detect 3294 * the encoding 3295 * If a new encoding is detected the parser is switched to decode 3296 * it and pass UTF8 3297 */ 3298 static void 3299 
htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { 3300 const xmlChar *encoding; 3301 3302 if ((ctxt == NULL) || (attvalue == NULL)) 3303 return; 3304 3305 /* do not change encoding */ 3306 if (ctxt->input->encoding != NULL) 3307 return; 3308 3309 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset="); 3310 if (encoding != NULL) { 3311 encoding += 8; 3312 } else { 3313 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset ="); 3314 if (encoding != NULL) 3315 encoding += 9; 3316 } 3317 if (encoding != NULL) { 3318 xmlCharEncoding enc; 3319 xmlCharEncodingHandlerPtr handler; 3320 3321 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; 3322 3323 if (ctxt->input->encoding != NULL) 3324 xmlFree((xmlChar *) ctxt->input->encoding); 3325 ctxt->input->encoding = xmlStrdup(encoding); 3326 3327 enc = xmlParseCharEncoding((const char *) encoding); 3328 /* 3329 * registered set of known encodings 3330 */ 3331 if (enc != XML_CHAR_ENCODING_ERROR) { 3332 if (((enc == XML_CHAR_ENCODING_UTF16LE) || 3333 (enc == XML_CHAR_ENCODING_UTF16BE) || 3334 (enc == XML_CHAR_ENCODING_UCS4LE) || 3335 (enc == XML_CHAR_ENCODING_UCS4BE)) && 3336 (ctxt->input->buf != NULL) && 3337 (ctxt->input->buf->encoder == NULL)) { 3338 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 3339 "htmlCheckEncoding: wrong encoding meta\n", 3340 NULL, NULL); 3341 } else { 3342 xmlSwitchEncoding(ctxt, enc); 3343 } 3344 ctxt->charset = XML_CHAR_ENCODING_UTF8; 3345 } else { 3346 /* 3347 * fallback for unknown encodings 3348 */ 3349 handler = xmlFindCharEncodingHandler((const char *) encoding); 3350 if (handler != NULL) { 3351 xmlSwitchToEncoding(ctxt, handler); 3352 ctxt->charset = XML_CHAR_ENCODING_UTF8; 3353 } else { 3354 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; 3355 } 3356 } 3357 3358 if ((ctxt->input->buf != NULL) && 3359 (ctxt->input->buf->encoder != NULL) && 3360 (ctxt->input->buf->raw != NULL) && 3361 (ctxt->input->buf->buffer != NULL)) { 3362 int nbchars; 3363 int processed; 3364 3365 /* 
3366 * convert as much as possible to the parser reading buffer. 3367 */ 3368 processed = ctxt->input->cur - ctxt->input->base; 3369 xmlBufferShrink(ctxt->input->buf->buffer, processed); 3370 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder, 3371 ctxt->input->buf->buffer, 3372 ctxt->input->buf->raw); 3373 if (nbchars < 0) { 3374 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 3375 "htmlCheckEncoding: encoder error\n", 3376 NULL, NULL); 3377 } 3378 ctxt->input->base = 3379 ctxt->input->cur = ctxt->input->buf->buffer->content; 3380 } 3381 } 3382 } 3383 3384 /** 3385 * htmlCheckMeta: 3386 * @ctxt: an HTML parser context 3387 * @atts: the attributes values 3388 * 3389 * Checks an attributes from a Meta tag 3390 */ 3391 static void 3392 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) { 3393 int i; 3394 const xmlChar *att, *value; 3395 int http = 0; 3396 const xmlChar *content = NULL; 3397 3398 if ((ctxt == NULL) || (atts == NULL)) 3399 return; 3400 3401 i = 0; 3402 att = atts[i++]; 3403 while (att != NULL) { 3404 value = atts[i++]; 3405 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv")) 3406 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 3407 http = 1; 3408 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content"))) 3409 content = value; 3410 att = atts[i++]; 3411 } 3412 if ((http) && (content != NULL)) 3413 htmlCheckEncoding(ctxt, content); 3414 3415 } 3416 3417 /** 3418 * htmlParseStartTag: 3419 * @ctxt: an HTML parser context 3420 * 3421 * parse a start of tag either for rule element or 3422 * EmptyElement. In both case we don't parse the tag closing chars. 3423 * 3424 * [40] STag ::= '<' Name (S Attribute)* S? '>' 3425 * 3426 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' 3427 * 3428 * With namespace: 3429 * 3430 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>' 3431 * 3432 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? 
'/>' 3433 * 3434 * Returns 0 in case of success, -1 in case of error and 1 if discarded 3435 */ 3436 3437 static int 3438 htmlParseStartTag(htmlParserCtxtPtr ctxt) { 3439 const xmlChar *name; 3440 const xmlChar *attname; 3441 xmlChar *attvalue; 3442 const xmlChar **atts; 3443 int nbatts = 0; 3444 int maxatts; 3445 int meta = 0; 3446 int i; 3447 int discardtag = 0; 3448 3449 if ((ctxt == NULL) || (ctxt->input == NULL)) { 3450 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3451 "htmlParseStartTag: context error\n", NULL, NULL); 3452 return -1; 3453 } 3454 if (CUR != '<') return -1; 3455 NEXT; 3456 3457 atts = ctxt->atts; 3458 maxatts = ctxt->maxatts; 3459 3460 GROW; 3461 name = htmlParseHTMLName(ctxt); 3462 if (name == NULL) { 3463 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3464 "htmlParseStartTag: invalid element name\n", 3465 NULL, NULL); 3466 /* Dump the bogus tag like browsers do */ 3467 while ((IS_CHAR_CH(CUR)) && (CUR != '>')) 3468 NEXT; 3469 return -1; 3470 } 3471 if (xmlStrEqual(name, BAD_CAST"meta")) 3472 meta = 1; 3473 3474 /* 3475 * Check for auto-closure of HTML elements. 3476 */ 3477 htmlAutoClose(ctxt, name); 3478 3479 /* 3480 * Check for implied HTML elements. 
3481 */ 3482 htmlCheckImplied(ctxt, name); 3483 3484 /* 3485 * Avoid html at any level > 0, head at any level != 1 3486 * or any attempt to recurse body 3487 */ 3488 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) { 3489 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3490 "htmlParseStartTag: misplaced <html> tag\n", 3491 name, NULL); 3492 discardtag = 1; 3493 ctxt->depth++; 3494 } 3495 if ((ctxt->nameNr != 1) && 3496 (xmlStrEqual(name, BAD_CAST"head"))) { 3497 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3498 "htmlParseStartTag: misplaced <head> tag\n", 3499 name, NULL); 3500 discardtag = 1; 3501 ctxt->depth++; 3502 } 3503 if (xmlStrEqual(name, BAD_CAST"body")) { 3504 int indx; 3505 for (indx = 0;indx < ctxt->nameNr;indx++) { 3506 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) { 3507 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3508 "htmlParseStartTag: misplaced <body> tag\n", 3509 name, NULL); 3510 discardtag = 1; 3511 ctxt->depth++; 3512 } 3513 } 3514 } 3515 3516 /* 3517 * Now parse the attributes, it ends up with the ending 3518 * 3519 * (S Attribute)* S? 
3520 */ 3521 SKIP_BLANKS; 3522 while ((IS_CHAR_CH(CUR)) && 3523 (CUR != '>') && 3524 ((CUR != '/') || (NXT(1) != '>'))) { 3525 long cons = ctxt->nbChars; 3526 3527 GROW; 3528 attname = htmlParseAttribute(ctxt, &attvalue); 3529 if (attname != NULL) { 3530 3531 /* 3532 * Well formedness requires at most one declaration of an attribute 3533 */ 3534 for (i = 0; i < nbatts;i += 2) { 3535 if (xmlStrEqual(atts[i], attname)) { 3536 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED, 3537 "Attribute %s redefined\n", attname, NULL); 3538 if (attvalue != NULL) 3539 xmlFree(attvalue); 3540 goto failed; 3541 } 3542 } 3543 3544 /* 3545 * Add the pair to atts 3546 */ 3547 if (atts == NULL) { 3548 maxatts = 22; /* allow for 10 attrs by default */ 3549 atts = (const xmlChar **) 3550 xmlMalloc(maxatts * sizeof(xmlChar *)); 3551 if (atts == NULL) { 3552 htmlErrMemory(ctxt, NULL); 3553 if (attvalue != NULL) 3554 xmlFree(attvalue); 3555 goto failed; 3556 } 3557 ctxt->atts = atts; 3558 ctxt->maxatts = maxatts; 3559 } else if (nbatts + 4 > maxatts) { 3560 const xmlChar **n; 3561 3562 maxatts *= 2; 3563 n = (const xmlChar **) xmlRealloc((void *) atts, 3564 maxatts * sizeof(const xmlChar *)); 3565 if (n == NULL) { 3566 htmlErrMemory(ctxt, NULL); 3567 if (attvalue != NULL) 3568 xmlFree(attvalue); 3569 goto failed; 3570 } 3571 atts = n; 3572 ctxt->atts = atts; 3573 ctxt->maxatts = maxatts; 3574 } 3575 atts[nbatts++] = attname; 3576 atts[nbatts++] = attvalue; 3577 atts[nbatts] = NULL; 3578 atts[nbatts + 1] = NULL; 3579 } 3580 else { 3581 if (attvalue != NULL) 3582 xmlFree(attvalue); 3583 /* Dump the bogus attribute string up to the next blank or 3584 * the end of the tag. 
*/ 3585 while ((IS_CHAR_CH(CUR)) && 3586 !(IS_BLANK_CH(CUR)) && (CUR != '>') && 3587 ((CUR != '/') || (NXT(1) != '>'))) 3588 NEXT; 3589 } 3590 3591 failed: 3592 SKIP_BLANKS; 3593 if (cons == ctxt->nbChars) { 3594 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3595 "htmlParseStartTag: problem parsing attributes\n", 3596 NULL, NULL); 3597 break; 3598 } 3599 } 3600 3601 /* 3602 * Handle specific association to the META tag 3603 */ 3604 if (meta && (nbatts != 0)) 3605 htmlCheckMeta(ctxt, atts); 3606 3607 /* 3608 * SAX: Start of Element ! 3609 */ 3610 if (!discardtag) { 3611 htmlnamePush(ctxt, name); 3612 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) { 3613 if (nbatts != 0) 3614 ctxt->sax->startElement(ctxt->userData, name, atts); 3615 else 3616 ctxt->sax->startElement(ctxt->userData, name, NULL); 3617 } 3618 } 3619 3620 if (atts != NULL) { 3621 for (i = 1;i < nbatts;i += 2) { 3622 if (atts[i] != NULL) 3623 xmlFree((xmlChar *) atts[i]); 3624 } 3625 } 3626 3627 return(discardtag); 3628 } 3629 3630 /** 3631 * htmlParseEndTag: 3632 * @ctxt: an HTML parser context 3633 * 3634 * parse an end of tag 3635 * 3636 * [42] ETag ::= '</' Name S? '>' 3637 * 3638 * With namespace 3639 * 3640 * [NS 9] ETag ::= '</' QName S? '>' 3641 * 3642 * Returns 1 if the current level should be closed. 3643 */ 3644 3645 static int 3646 htmlParseEndTag(htmlParserCtxtPtr ctxt) 3647 { 3648 const xmlChar *name; 3649 const xmlChar *oldname; 3650 int i, ret; 3651 3652 if ((CUR != '<') || (NXT(1) != '/')) { 3653 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED, 3654 "htmlParseEndTag: '</' not found\n", NULL, NULL); 3655 return (0); 3656 } 3657 SKIP(2); 3658 3659 name = htmlParseHTMLName(ctxt); 3660 if (name == NULL) 3661 return (0); 3662 /* 3663 * We should definitely be at the ending "S? 
'>'" part 3664 */ 3665 SKIP_BLANKS; 3666 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) { 3667 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 3668 "End tag : expected '>'\n", NULL, NULL); 3669 if (ctxt->recovery) { 3670 /* 3671 * We're not at the ending > !! 3672 * Error, unless in recover mode where we search forwards 3673 * until we find a > 3674 */ 3675 while (CUR != '\0' && CUR != '>') NEXT; 3676 NEXT; 3677 } 3678 } else 3679 NEXT; 3680 3681 /* 3682 * if we ignored misplaced tags in htmlParseStartTag don't pop them 3683 * out now. 3684 */ 3685 if ((ctxt->depth > 0) && 3686 (xmlStrEqual(name, BAD_CAST "html") || 3687 xmlStrEqual(name, BAD_CAST "body") || 3688 xmlStrEqual(name, BAD_CAST "head"))) { 3689 ctxt->depth--; 3690 return (0); 3691 } 3692 3693 /* 3694 * If the name read is not one of the element in the parsing stack 3695 * then return, it's just an error. 3696 */ 3697 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 3698 if (xmlStrEqual(name, ctxt->nameTab[i])) 3699 break; 3700 } 3701 if (i < 0) { 3702 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 3703 "Unexpected end tag : %s\n", name, NULL); 3704 return (0); 3705 } 3706 3707 3708 /* 3709 * Check for auto-closure of HTML elements. 3710 */ 3711 3712 htmlAutoCloseOnClose(ctxt, name); 3713 3714 /* 3715 * Well formedness constraints, opening and closing must match. 3716 * With the exception that the autoclose may have popped stuff out 3717 * of the stack. 
3718 */ 3719 if (!xmlStrEqual(name, ctxt->name)) { 3720 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) { 3721 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 3722 "Opening and ending tag mismatch: %s and %s\n", 3723 name, ctxt->name); 3724 } 3725 } 3726 3727 /* 3728 * SAX: End of Tag 3729 */ 3730 oldname = ctxt->name; 3731 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) { 3732 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 3733 ctxt->sax->endElement(ctxt->userData, name); 3734 htmlnamePop(ctxt); 3735 ret = 1; 3736 } else { 3737 ret = 0; 3738 } 3739 3740 return (ret); 3741 } 3742 3743 3744 /** 3745 * htmlParseReference: 3746 * @ctxt: an HTML parser context 3747 * 3748 * parse and handle entity references in content, 3749 * this will end-up in a call to character() since this is either a 3750 * CharRef, or a predefined entity. 3751 */ 3752 static void 3753 htmlParseReference(htmlParserCtxtPtr ctxt) { 3754 const htmlEntityDesc * ent; 3755 xmlChar out[6]; 3756 const xmlChar *name; 3757 if (CUR != '&') return; 3758 3759 if (NXT(1) == '#') { 3760 unsigned int c; 3761 int bits, i = 0; 3762 3763 c = htmlParseCharRef(ctxt); 3764 if (c == 0) 3765 return; 3766 3767 if (c < 0x80) { out[i++]= c; bits= -6; } 3768 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } 3769 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } 3770 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } 3771 3772 for ( ; bits >= 0; bits-= 6) { 3773 out[i++]= ((c >> bits) & 0x3F) | 0x80; 3774 } 3775 out[i] = 0; 3776 3777 htmlCheckParagraph(ctxt); 3778 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 3779 ctxt->sax->characters(ctxt->userData, out, i); 3780 } else { 3781 ent = htmlParseEntityRef(ctxt, &name); 3782 if (name == NULL) { 3783 htmlCheckParagraph(ctxt); 3784 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 3785 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); 3786 return; 3787 } 3788 if ((ent == 
NULL) || !(ent->value > 0)) { 3789 htmlCheckParagraph(ctxt); 3790 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) { 3791 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); 3792 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name)); 3793 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */ 3794 } 3795 } else { 3796 unsigned int c; 3797 int bits, i = 0; 3798 3799 c = ent->value; 3800 if (c < 0x80) 3801 { out[i++]= c; bits= -6; } 3802 else if (c < 0x800) 3803 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } 3804 else if (c < 0x10000) 3805 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } 3806 else 3807 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } 3808 3809 for ( ; bits >= 0; bits-= 6) { 3810 out[i++]= ((c >> bits) & 0x3F) | 0x80; 3811 } 3812 out[i] = 0; 3813 3814 htmlCheckParagraph(ctxt); 3815 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 3816 ctxt->sax->characters(ctxt->userData, out, i); 3817 } 3818 } 3819 } 3820 3821 /** 3822 * htmlParseContent: 3823 * @ctxt: an HTML parser context 3824 * 3825 * Parse a content: comment, sub-element, reference or text. 3826 */ 3827 3828 static void 3829 htmlParseContent(htmlParserCtxtPtr ctxt) { 3830 xmlChar *currentNode; 3831 int depth; 3832 const xmlChar *name; 3833 3834 currentNode = xmlStrdup(ctxt->name); 3835 depth = ctxt->nameNr; 3836 while (1) { 3837 long cons = ctxt->nbChars; 3838 3839 GROW; 3840 /* 3841 * Our tag or one of it's parent or children is ending. 
3842 */ 3843 if ((CUR == '<') && (NXT(1) == '/')) { 3844 if (htmlParseEndTag(ctxt) && 3845 ((currentNode != NULL) || (ctxt->nameNr == 0))) { 3846 if (currentNode != NULL) 3847 xmlFree(currentNode); 3848 return; 3849 } 3850 continue; /* while */ 3851 } 3852 3853 else if ((CUR == '<') && 3854 ((IS_ASCII_LETTER(NXT(1))) || 3855 (NXT(1) == '_') || (NXT(1) == ':'))) { 3856 name = htmlParseHTMLName_nonInvasive(ctxt); 3857 if (name == NULL) { 3858 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3859 "htmlParseStartTag: invalid element name\n", 3860 NULL, NULL); 3861 /* Dump the bogus tag like browsers do */ 3862 while ((IS_CHAR_CH(CUR)) && (CUR != '>')) 3863 NEXT; 3864 3865 if (currentNode != NULL) 3866 xmlFree(currentNode); 3867 return; 3868 } 3869 3870 if (ctxt->name != NULL) { 3871 if (htmlCheckAutoClose(name, ctxt->name) == 1) { 3872 htmlAutoClose(ctxt, name); 3873 continue; 3874 } 3875 } 3876 } 3877 3878 /* 3879 * Has this node been popped out during parsing of 3880 * the next element 3881 */ 3882 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) && 3883 (!xmlStrEqual(currentNode, ctxt->name))) 3884 { 3885 if (currentNode != NULL) xmlFree(currentNode); 3886 return; 3887 } 3888 3889 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) || 3890 (xmlStrEqual(currentNode, BAD_CAST"style")))) { 3891 /* 3892 * Handle SCRIPT/STYLE separately 3893 */ 3894 htmlParseScript(ctxt); 3895 } else { 3896 /* 3897 * Sometimes DOCTYPE arrives in the middle of the document 3898 */ 3899 if ((CUR == '<') && (NXT(1) == '!') && 3900 (UPP(2) == 'D') && (UPP(3) == 'O') && 3901 (UPP(4) == 'C') && (UPP(5) == 'T') && 3902 (UPP(6) == 'Y') && (UPP(7) == 'P') && 3903 (UPP(8) == 'E')) { 3904 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3905 "Misplaced DOCTYPE declaration\n", 3906 BAD_CAST "DOCTYPE" , NULL); 3907 htmlParseDocTypeDecl(ctxt); 3908 } 3909 3910 /* 3911 * First case : a comment 3912 */ 3913 if ((CUR == '<') && (NXT(1) == '!') && 3914 (NXT(2) == '-') && (NXT(3) == '-')) { 3915 
htmlParseComment(ctxt); 3916 } 3917 3918 /* 3919 * Second case : a Processing Instruction. 3920 */ 3921 else if ((CUR == '<') && (NXT(1) == '?')) { 3922 htmlParsePI(ctxt); 3923 } 3924 3925 /* 3926 * Third case : a sub-element. 3927 */ 3928 else if (CUR == '<') { 3929 htmlParseElement(ctxt); 3930 } 3931 3932 /* 3933 * Fourth case : a reference. If if has not been resolved, 3934 * parsing returns it's Name, create the node 3935 */ 3936 else if (CUR == '&') { 3937 htmlParseReference(ctxt); 3938 } 3939 3940 /* 3941 * Fifth case : end of the resource 3942 */ 3943 else if (CUR == 0) { 3944 htmlAutoCloseOnEnd(ctxt); 3945 break; 3946 } 3947 3948 /* 3949 * Last case, text. Note that References are handled directly. 3950 */ 3951 else { 3952 htmlParseCharData(ctxt); 3953 } 3954 3955 if (cons == ctxt->nbChars) { 3956 if (ctxt->node != NULL) { 3957 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3958 "detected an error in element content\n", 3959 NULL, NULL); 3960 } 3961 break; 3962 } 3963 } 3964 GROW; 3965 } 3966 if (currentNode != NULL) xmlFree(currentNode); 3967 } 3968 3969 /** 3970 * htmlParseContent: 3971 * @ctxt: an HTML parser context 3972 * 3973 * Parse a content: comment, sub-element, reference or text. 
3974 */ 3975 3976 void 3977 __htmlParseContent(void *ctxt) { 3978 if (ctxt != NULL) 3979 htmlParseContent((htmlParserCtxtPtr) ctxt); 3980 } 3981 3982 /** 3983 * htmlParseElement: 3984 * @ctxt: an HTML parser context 3985 * 3986 * parse an HTML element, this is highly recursive 3987 * 3988 * [39] element ::= EmptyElemTag | STag content ETag 3989 * 3990 * [41] Attribute ::= Name Eq AttValue 3991 */ 3992 3993 void 3994 htmlParseElement(htmlParserCtxtPtr ctxt) { 3995 const xmlChar *name; 3996 xmlChar *currentNode = NULL; 3997 const htmlElemDesc * info; 3998 htmlParserNodeInfo node_info; 3999 int failed; 4000 int depth; 4001 const xmlChar *oldptr; 4002 4003 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4004 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4005 "htmlParseElement: context error\n", NULL, NULL); 4006 return; 4007 } 4008 /* Capture start position */ 4009 if (ctxt->record_info) { 4010 node_info.begin_pos = ctxt->input->consumed + 4011 (CUR_PTR - ctxt->input->base); 4012 node_info.begin_line = ctxt->input->line; 4013 } 4014 4015 failed = htmlParseStartTag(ctxt); 4016 name = ctxt->name; 4017 if ((failed == -1) || (name == NULL)) { 4018 if (CUR == '>') 4019 NEXT; 4020 return; 4021 } 4022 4023 /* 4024 * Lookup the info for that element. 4025 */ 4026 info = htmlTagLookup(name); 4027 if (info == NULL) { 4028 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 4029 "Tag %s invalid\n", name, NULL); 4030 } 4031 4032 /* 4033 * Check for an Empty Element labeled the XML/SGML way 4034 */ 4035 if ((CUR == '/') && (NXT(1) == '>')) { 4036 SKIP(2); 4037 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4038 ctxt->sax->endElement(ctxt->userData, name); 4039 htmlnamePop(ctxt); 4040 return; 4041 } 4042 4043 if (CUR == '>') { 4044 NEXT; 4045 } else { 4046 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 4047 "Couldn't find end of Start Tag %s\n", name, NULL); 4048 4049 /* 4050 * end of parsing of this node. 
4051 */ 4052 if (xmlStrEqual(name, ctxt->name)) { 4053 nodePop(ctxt); 4054 htmlnamePop(ctxt); 4055 } 4056 4057 /* 4058 * Capture end position and add node 4059 */ 4060 if (ctxt->record_info) { 4061 node_info.end_pos = ctxt->input->consumed + 4062 (CUR_PTR - ctxt->input->base); 4063 node_info.end_line = ctxt->input->line; 4064 node_info.node = ctxt->node; 4065 xmlParserAddNodeInfo(ctxt, &node_info); 4066 } 4067 return; 4068 } 4069 4070 /* 4071 * Check for an Empty Element from DTD definition 4072 */ 4073 if ((info != NULL) && (info->empty)) { 4074 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4075 ctxt->sax->endElement(ctxt->userData, name); 4076 htmlnamePop(ctxt); 4077 return; 4078 } 4079 4080 /* 4081 * Parse the content of the element: 4082 */ 4083 currentNode = xmlStrdup(ctxt->name); 4084 depth = ctxt->nameNr; 4085 while (IS_CHAR_CH(CUR)) { 4086 oldptr = ctxt->input->cur; 4087 htmlParseContent(ctxt); 4088 if (oldptr==ctxt->input->cur) break; 4089 if (ctxt->nameNr < depth) break; 4090 } 4091 4092 /* 4093 * Capture end position and add node 4094 */ 4095 if ( currentNode != NULL && ctxt->record_info ) { 4096 node_info.end_pos = ctxt->input->consumed + 4097 (CUR_PTR - ctxt->input->base); 4098 node_info.end_line = ctxt->input->line; 4099 node_info.node = ctxt->node; 4100 xmlParserAddNodeInfo(ctxt, &node_info); 4101 } 4102 if (!IS_CHAR_CH(CUR)) { 4103 htmlAutoCloseOnEnd(ctxt); 4104 } 4105 4106 if (currentNode != NULL) 4107 xmlFree(currentNode); 4108 } 4109 4110 /** 4111 * htmlParseDocument: 4112 * @ctxt: an HTML parser context 4113 * 4114 * parse an HTML document (and build a tree if using the standard SAX 4115 * interface). 4116 * 4117 * Returns 0, -1 in case of error. the parser context is augmented 4118 * as a result of the parsing. 
4119 */ 4120 4121 int 4122 htmlParseDocument(htmlParserCtxtPtr ctxt) { 4123 xmlChar start[4]; 4124 xmlCharEncoding enc; 4125 xmlDtdPtr dtd; 4126 4127 xmlInitParser(); 4128 4129 htmlDefaultSAXHandlerInit(); 4130 4131 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4132 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4133 "htmlParseDocument: context error\n", NULL, NULL); 4134 return(XML_ERR_INTERNAL_ERROR); 4135 } 4136 ctxt->html = 1; 4137 GROW; 4138 /* 4139 * SAX: beginning of the document processing. 4140 */ 4141 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) 4142 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator); 4143 4144 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) && 4145 ((ctxt->input->end - ctxt->input->cur) >= 4)) { 4146 /* 4147 * Get the 4 first bytes and decode the charset 4148 * if enc != XML_CHAR_ENCODING_NONE 4149 * plug some encoding conversion routines. 4150 */ 4151 start[0] = RAW; 4152 start[1] = NXT(1); 4153 start[2] = NXT(2); 4154 start[3] = NXT(3); 4155 enc = xmlDetectCharEncoding(&start[0], 4); 4156 if (enc != XML_CHAR_ENCODING_NONE) { 4157 xmlSwitchEncoding(ctxt, enc); 4158 } 4159 } 4160 4161 /* 4162 * Wipe out everything which is before the first '<' 4163 */ 4164 SKIP_BLANKS; 4165 if (CUR == 0) { 4166 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY, 4167 "Document is empty\n", NULL, NULL); 4168 } 4169 4170 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX)) 4171 ctxt->sax->startDocument(ctxt->userData); 4172 4173 4174 /* 4175 * Parse possible comments and PIs before any content 4176 */ 4177 while (((CUR == '<') && (NXT(1) == '!') && 4178 (NXT(2) == '-') && (NXT(3) == '-')) || 4179 ((CUR == '<') && (NXT(1) == '?'))) { 4180 htmlParseComment(ctxt); 4181 htmlParsePI(ctxt); 4182 SKIP_BLANKS; 4183 } 4184 4185 4186 /* 4187 * Then possibly doc type declaration(s) and more Misc 4188 * (doctypedecl Misc*)? 
4189 */ 4190 if ((CUR == '<') && (NXT(1) == '!') && 4191 (UPP(2) == 'D') && (UPP(3) == 'O') && 4192 (UPP(4) == 'C') && (UPP(5) == 'T') && 4193 (UPP(6) == 'Y') && (UPP(7) == 'P') && 4194 (UPP(8) == 'E')) { 4195 htmlParseDocTypeDecl(ctxt); 4196 } 4197 SKIP_BLANKS; 4198 4199 /* 4200 * Parse possible comments and PIs before any content 4201 */ 4202 while (((CUR == '<') && (NXT(1) == '!') && 4203 (NXT(2) == '-') && (NXT(3) == '-')) || 4204 ((CUR == '<') && (NXT(1) == '?'))) { 4205 htmlParseComment(ctxt); 4206 htmlParsePI(ctxt); 4207 SKIP_BLANKS; 4208 } 4209 4210 /* 4211 * Time to start parsing the tree itself 4212 */ 4213 htmlParseContent(ctxt); 4214 4215 /* 4216 * autoclose 4217 */ 4218 if (CUR == 0) 4219 htmlAutoCloseOnEnd(ctxt); 4220 4221 4222 /* 4223 * SAX: end of the document processing. 4224 */ 4225 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 4226 ctxt->sax->endDocument(ctxt->userData); 4227 4228 if (ctxt->myDoc != NULL) { 4229 dtd = xmlGetIntSubset(ctxt->myDoc); 4230 if (dtd == NULL) 4231 ctxt->myDoc->intSubset = 4232 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 4233 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", 4234 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); 4235 } 4236 if (! 
ctxt->wellFormed) return(-1); 4237 return(0); 4238 } 4239 4240 4241 /************************************************************************ 4242 * * 4243 * Parser contexts handling * 4244 * * 4245 ************************************************************************/ 4246 4247 /** 4248 * htmlInitParserCtxt: 4249 * @ctxt: an HTML parser context 4250 * 4251 * Initialize a parser context 4252 * 4253 * Returns 0 in case of success and -1 in case of error 4254 */ 4255 4256 static int 4257 htmlInitParserCtxt(htmlParserCtxtPtr ctxt) 4258 { 4259 htmlSAXHandler *sax; 4260 4261 if (ctxt == NULL) return(-1); 4262 memset(ctxt, 0, sizeof(htmlParserCtxt)); 4263 4264 ctxt->dict = xmlDictCreate(); 4265 if (ctxt->dict == NULL) { 4266 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4267 return(-1); 4268 } 4269 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler)); 4270 if (sax == NULL) { 4271 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4272 return(-1); 4273 } 4274 else 4275 memset(sax, 0, sizeof(htmlSAXHandler)); 4276 4277 /* Allocate the Input stack */ 4278 ctxt->inputTab = (htmlParserInputPtr *) 4279 xmlMalloc(5 * sizeof(htmlParserInputPtr)); 4280 if (ctxt->inputTab == NULL) { 4281 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4282 ctxt->inputNr = 0; 4283 ctxt->inputMax = 0; 4284 ctxt->input = NULL; 4285 return(-1); 4286 } 4287 ctxt->inputNr = 0; 4288 ctxt->inputMax = 5; 4289 ctxt->input = NULL; 4290 ctxt->version = NULL; 4291 ctxt->encoding = NULL; 4292 ctxt->standalone = -1; 4293 ctxt->instate = XML_PARSER_START; 4294 4295 /* Allocate the Node stack */ 4296 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr)); 4297 if (ctxt->nodeTab == NULL) { 4298 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4299 ctxt->nodeNr = 0; 4300 ctxt->nodeMax = 0; 4301 ctxt->node = NULL; 4302 ctxt->inputNr = 0; 4303 ctxt->inputMax = 0; 4304 ctxt->input = NULL; 4305 return(-1); 4306 } 4307 ctxt->nodeNr = 0; 4308 
ctxt->nodeMax = 10; 4309 ctxt->node = NULL; 4310 4311 /* Allocate the Name stack */ 4312 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *)); 4313 if (ctxt->nameTab == NULL) { 4314 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4315 ctxt->nameNr = 0; 4316 ctxt->nameMax = 10; 4317 ctxt->name = NULL; 4318 ctxt->nodeNr = 0; 4319 ctxt->nodeMax = 0; 4320 ctxt->node = NULL; 4321 ctxt->inputNr = 0; 4322 ctxt->inputMax = 0; 4323 ctxt->input = NULL; 4324 return(-1); 4325 } 4326 ctxt->nameNr = 0; 4327 ctxt->nameMax = 10; 4328 ctxt->name = NULL; 4329 4330 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler; 4331 else { 4332 ctxt->sax = sax; 4333 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 4334 } 4335 ctxt->userData = ctxt; 4336 ctxt->myDoc = NULL; 4337 ctxt->wellFormed = 1; 4338 ctxt->replaceEntities = 0; 4339 ctxt->linenumbers = xmlLineNumbersDefaultValue; 4340 ctxt->html = 1; 4341 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0; 4342 ctxt->vctxt.userData = ctxt; 4343 ctxt->vctxt.error = xmlParserValidityError; 4344 ctxt->vctxt.warning = xmlParserValidityWarning; 4345 ctxt->record_info = 0; 4346 ctxt->validate = 0; 4347 ctxt->nbChars = 0; 4348 ctxt->checkIndex = 0; 4349 ctxt->catalogs = NULL; 4350 xmlInitNodeInfoSeq(&ctxt->node_seq); 4351 return(0); 4352 } 4353 4354 /** 4355 * htmlFreeParserCtxt: 4356 * @ctxt: an HTML parser context 4357 * 4358 * Free all the memory used by a parser context. However the parsed 4359 * document in ctxt->myDoc is not freed. 4360 */ 4361 4362 void 4363 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt) 4364 { 4365 xmlFreeParserCtxt(ctxt); 4366 } 4367 4368 /** 4369 * htmlNewParserCtxt: 4370 * 4371 * Allocate and initialize a new parser context. 
4372 * 4373 * Returns the htmlParserCtxtPtr or NULL in case of allocation error 4374 */ 4375 4376 htmlParserCtxtPtr 4377 htmlNewParserCtxt(void) 4378 { 4379 xmlParserCtxtPtr ctxt; 4380 4381 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt)); 4382 if (ctxt == NULL) { 4383 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n"); 4384 return(NULL); 4385 } 4386 memset(ctxt, 0, sizeof(xmlParserCtxt)); 4387 if (htmlInitParserCtxt(ctxt) < 0) { 4388 htmlFreeParserCtxt(ctxt); 4389 return(NULL); 4390 } 4391 return(ctxt); 4392 } 4393 4394 /** 4395 * htmlCreateMemoryParserCtxt: 4396 * @buffer: a pointer to a char array 4397 * @size: the size of the array 4398 * 4399 * Create a parser context for an HTML in-memory document. 4400 * 4401 * Returns the new parser context or NULL 4402 */ 4403 htmlParserCtxtPtr 4404 htmlCreateMemoryParserCtxt(const char *buffer, int size) { 4405 xmlParserCtxtPtr ctxt; 4406 xmlParserInputPtr input; 4407 xmlParserInputBufferPtr buf; 4408 4409 if (buffer == NULL) 4410 return(NULL); 4411 if (size <= 0) 4412 return(NULL); 4413 4414 ctxt = htmlNewParserCtxt(); 4415 if (ctxt == NULL) 4416 return(NULL); 4417 4418 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); 4419 if (buf == NULL) return(NULL); 4420 4421 input = xmlNewInputStream(ctxt); 4422 if (input == NULL) { 4423 xmlFreeParserCtxt(ctxt); 4424 return(NULL); 4425 } 4426 4427 input->filename = NULL; 4428 input->buf = buf; 4429 input->base = input->buf->buffer->content; 4430 input->cur = input->buf->buffer->content; 4431 input->end = &input->buf->buffer->content[input->buf->buffer->use]; 4432 4433 inputPush(ctxt, input); 4434 return(ctxt); 4435 } 4436 4437 /** 4438 * htmlCreateDocParserCtxt: 4439 * @cur: a pointer to an array of xmlChar 4440 * @encoding: a free form C string describing the HTML document encoding, or NULL 4441 * 4442 * Create a parser context for an HTML document. 
4443 * 4444 * TODO: check the need to add encoding handling there 4445 * 4446 * Returns the new parser context or NULL 4447 */ 4448 static htmlParserCtxtPtr 4449 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) { 4450 int len; 4451 htmlParserCtxtPtr ctxt; 4452 4453 if (cur == NULL) 4454 return(NULL); 4455 len = xmlStrlen(cur); 4456 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len); 4457 if (ctxt == NULL) 4458 return(NULL); 4459 4460 if (encoding != NULL) { 4461 xmlCharEncoding enc; 4462 xmlCharEncodingHandlerPtr handler; 4463 4464 if (ctxt->input->encoding != NULL) 4465 xmlFree((xmlChar *) ctxt->input->encoding); 4466 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding); 4467 4468 enc = xmlParseCharEncoding(encoding); 4469 /* 4470 * registered set of known encodings 4471 */ 4472 if (enc != XML_CHAR_ENCODING_ERROR) { 4473 xmlSwitchEncoding(ctxt, enc); 4474 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { 4475 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 4476 "Unsupported encoding %s\n", 4477 (const xmlChar *) encoding, NULL); 4478 } 4479 } else { 4480 /* 4481 * fallback for unknown encodings 4482 */ 4483 handler = xmlFindCharEncodingHandler((const char *) encoding); 4484 if (handler != NULL) { 4485 xmlSwitchToEncoding(ctxt, handler); 4486 } else { 4487 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 4488 "Unsupported encoding %s\n", 4489 (const xmlChar *) encoding, NULL); 4490 } 4491 } 4492 } 4493 return(ctxt); 4494 } 4495 4496 #ifdef LIBXML_PUSH_ENABLED 4497 /************************************************************************ 4498 * * 4499 * Progressive parsing interfaces * 4500 * * 4501 ************************************************************************/ 4502 4503 /** 4504 * htmlParseLookupSequence: 4505 * @ctxt: an HTML parser context 4506 * @first: the first char to lookup 4507 * @next: the next char to lookup or zero 4508 * @third: the next char to lookup or zero 4509 * @comment: flag to force checking inside 
comments
 *
 * Try to find if a sequence (first, next, third) or just (first next) or
 * (first) is available in the input stream.
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
 * to avoid rescanning sequences of bytes, it DOES change the state of the
 * parser, do not use liberally.
 * This is basically similar to xmlParseLookupSequence()
 *
 * The scan starts at the larger of the current input offset and
 * ctxt->checkIndex.  Unless the last argument is non-zero, the bytes
 * between "<!--" and "-->" are skipped and never matched.  On success
 * ctxt->checkIndex is reset to 0; when the scan runs off the end of the
 * data it is set to the offset reached.  The early return taken while
 * inside a comment leaves ctxt->checkIndex untouched.
 *
 * Returns the index to the current parsing point if the full sequence
 *      is available, -1 otherwise.
 */
static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
                        xmlChar next, xmlChar third, int iscomment) {
    int base, len;
    htmlParserInputPtr in;
    const xmlChar *buf;
    int incomment = 0;      /* set while scanning inside <!-- ... --> */

    in = ctxt->input;
    if (in == NULL) return(-1);
    base = in->cur - in->base;
    if (base < 0) return(-1);
    /* resume where the previous unsuccessful lookup stopped */
    if (ctxt->checkIndex > base)
        base = ctxt->checkIndex;
    if (in->buf == NULL) {
        buf = in->base;
        len = in->length;
    } else {
        buf = in->buf->buffer->content;
        len = in->buf->buffer->use;
    }
    /* take into account the sequence length */
    if (third) len -= 2;
    else if (next) len --;
    for (;base < len;base++) {
        if (!incomment && (base + 4 < len) && !iscomment) {
            if ((buf[base] == '<') && (buf[base + 1] == '!') &&
                (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
                incomment = 1;
                /* do not increment past <! - some people use <!--> */
                base += 2;
            }
        }
        if (incomment) {
            /* not enough bytes left to hold "-->" */
            if (base + 3 > len)
                return(-1);
            if ((buf[base] == '-') && (buf[base + 1] == '-') &&
                (buf[base + 2] == '>')) {
                incomment = 0;
                base += 2;
            }
            continue;
        }
        if (buf[base] == first) {
            if (third != 0) {
                if ((buf[base + 1] != next) ||
                    (buf[base + 2] != third)) continue;
            } else if (next != 0) {
                if (buf[base + 1] != next) continue;
            }
            ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
            if (next == 0)
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c' found at %d\n",
                        first, base);
            else if (third == 0)
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c%c' found at %d\n",
                        first, next, base);
            else
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c%c%c' found at %d\n",
                        first, next, third, base);
#endif
            /* offset relative to the current parsing position */
            return(base - (in->cur - in->base));
        }
    }
    /* remember how far we got so the next call does not rescan */
    ctxt->checkIndex = base;
#ifdef DEBUG_PUSH
    if (next == 0)
        xmlGenericError(xmlGenericErrorContext,
                "HPP: lookup '%c' failed\n", first);
    else if (third == 0)
        xmlGenericError(xmlGenericErrorContext,
                "HPP: lookup '%c%c' failed\n", first, next);
    else
        xmlGenericError(xmlGenericErrorContext,
                "HPP: lookup '%c%c%c' failed\n", first, next, third);
#endif
    return(-1);
}

/**
 * htmlParseTryOrFinish:
 * @ctxt:  an HTML parser context
 * @terminate:  last chunk indicator
 *
 * Try to progress on parsing
 *
 * Runs the push-parser state machine on ctxt->instate until the data
 * available in the current input is not sufficient to make a decision
 * (or the EOF state is reached).  When @terminate is zero, most
 * constructs are only handed to the parsing routines once
 * htmlParseLookupSequence() has found their terminator in the buffer.
 * Before returning, a document that still has no internal subset gets
 * the HTML 4.0 Transitional one when @terminate is set or the state is
 * EOF or EPILOG.
 *
 * Returns zero if no parsing was possible
 * NOTE(review): the return variable is initialised to 0 and never
 * assigned afterwards, so the function as written always returns 0.
 */
static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
    int ret = 0;
    htmlParserInputPtr in;
    int avail = 0;          /* bytes left between in->cur and end of data */
    xmlChar cur, next;

#ifdef DEBUG_PUSH
    switch (ctxt->instate) {
        case XML_PARSER_EOF:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EOF\n"); break;
        case XML_PARSER_START:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START\n"); break;
        case XML_PARSER_MISC:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try MISC\n");break;
        case XML_PARSER_COMMENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try COMMENT\n");break;
        case XML_PARSER_PROLOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PROLOG\n");break;
        case XML_PARSER_START_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START_TAG\n");break;
        case XML_PARSER_CONTENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CONTENT\n");break;
        case XML_PARSER_CDATA_SECTION:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CDATA_SECTION\n");break;
        case XML_PARSER_END_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try END_TAG\n");break;
        case XML_PARSER_ENTITY_DECL:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ENTITY_DECL\n");break;
        case XML_PARSER_ENTITY_VALUE:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ENTITY_VALUE\n");break;
        case XML_PARSER_ATTRIBUTE_VALUE:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ATTRIBUTE_VALUE\n");break;
        case XML_PARSER_DTD:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try DTD\n");break;
        case XML_PARSER_EPILOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EPILOG\n");break;
        case XML_PARSER_PI:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PI\n");break;
        case XML_PARSER_SYSTEM_LITERAL:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try SYSTEM_LITERAL\n");break;
    }
#endif

    while (1) {

        in = ctxt->input;
        if (in == NULL) break;
        if (in->buf == NULL)
            avail = in->length - (in->cur - in->base);
        else
            avail = in->buf->buffer->use - (in->cur - in->base);
        /* Input exhausted on the last chunk: close what is still open. */
        if ((avail == 0) && (terminate)) {
            htmlAutoCloseOnEnd(ctxt);
            if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
                /*
                 * SAX: end of the document processing.
                 */
                ctxt->instate = XML_PARSER_EOF;
                if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                    ctxt->sax->endDocument(ctxt->userData);
            }
        }
        if (avail < 1)
            goto done;
        cur = in->cur[0];
        /* A zero byte in the data is stepped over with SKIP(1). */
        if (cur == 0) {
            SKIP(1);
            continue;
        }

        switch (ctxt->instate) {
            case XML_PARSER_EOF:
                /*
                 * Document parsing is done !
                 */
                goto done;
            case XML_PARSER_START:
                /*
                 * Very first chars read from the document flow.
                 */
                cur = in->cur[0];
                if (IS_BLANK_CH(cur)) {
                    SKIP_BLANKS;
                    if (in->buf == NULL)
                        avail = in->length - (in->cur - in->base);
                    else
                        avail = in->buf->buffer->use - (in->cur - in->base);
                }
                if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
                    ctxt->sax->setDocumentLocator(ctxt->userData,
                                                  &xmlDefaultSAXLocator);
                if ((ctxt->sax) && (ctxt->sax->startDocument) &&
                    (!ctxt->disableSAX))
                    ctxt->sax->startDocument(ctxt->userData);

                cur = in->cur[0];
                next = in->cur[1];
                /* "<!DOCTYPE", compared through the UPP() macro */
                if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing internal subset\n");
#endif
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering PROLOG\n");
#endif
                } else {
                    ctxt->instate = XML_PARSER_MISC;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering MISC\n");
#endif
                }
                break;
            case XML_PARSER_MISC:
                /* Before the DOCTYPE: comments, PIs, or the DOCTYPE itself. */
                SKIP_BLANKS;
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_MISC;
                } else if ((cur == '<') && (next == '?')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing PI\n");
#endif
                    htmlParsePI(ctxt);
                    ctxt->instate = XML_PARSER_MISC;
                } else if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing internal subset\n");
#endif
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering PROLOG\n");
#endif
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 9)) {
                    /* "<!" seen but too few bytes to tell what follows */
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_START_TAG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering START_TAG\n");
#endif
                }
                break;
            case XML_PARSER_PROLOG:
                /* After the DOCTYPE: comments and PIs until the first tag. */
                SKIP_BLANKS;
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
                } else if ((cur == '<') && (next == '?')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing PI\n");
#endif
                    htmlParsePI(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_START_TAG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering START_TAG\n");
#endif
                }
                break;
            case XML_PARSER_EPILOG:
                /* Entered from END_TAG once the name stack is empty. */
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 1)
                    goto done;
                cur = in->cur[0];
                if (IS_BLANK_CH(cur)) {
                    htmlParseCharData(ctxt);
                    goto done;
                }
                if (avail < 2)
                    goto done;
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_EPILOG;
                } else if ((cur == '<') && (next == '?')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing PI\n");
#endif
                    htmlParsePI(ctxt);
                    ctxt->instate = XML_PARSER_EPILOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    goto done;
                } else {
                    /* anything else after the end is flagged and ends parsing */
                    ctxt->errNo = XML_ERR_DOCUMENT_END;
                    ctxt->wellFormed = 0;
                    ctxt->instate = XML_PARSER_EOF;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering EOF\n");
#endif
                    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                        ctxt->sax->endDocument(ctxt->userData);
                    goto done;
                }
                break;
            case XML_PARSER_START_TAG: {
                const xmlChar *name;
                int failed;
                const htmlElemDesc * info;

                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                if (cur != '<') {
                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }
                if (in->cur[1] == '/') {
                    ctxt->instate = XML_PARSER_END_TAG;
                    ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering END_TAG\n");
#endif
                    break;
                }
                /* wait until the closing '>' of the tag is in the buffer */
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                    goto done;

                failed = htmlParseStartTag(ctxt);
                name = ctxt->name;
                if ((failed == -1) ||
                    (name == NULL)) {
                    if (CUR == '>')
                        NEXT;
                    break;
                }

                /*
                 * Lookup the info for that element.
                 */
                info = htmlTagLookup(name);
                if (info == NULL) {
                    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
                                 "Tag %s invalid\n", name, NULL);
                }

                /*
                 * Check for an Empty Element labeled the XML/SGML way
                 */
                if ((CUR == '/') && (NXT(1) == '>')) {
                    SKIP(2);
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    htmlnamePop(ctxt);
                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }

                if (CUR == '>') {
                    NEXT;
                } else {
                    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
                                 "Couldn't find end of Start Tag %s\n",
                                 name, NULL);

                    /*
                     * end of parsing of this node.
                     */
                    if (xmlStrEqual(name, ctxt->name)) {
                        nodePop(ctxt);
                        htmlnamePop(ctxt);
                    }

                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }

                /*
                 * Check for an Empty Element from DTD definition
                 */
                if ((info != NULL) && (info->empty)) {
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    htmlnamePop(ctxt);
                }
                ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            }
            case XML_PARSER_CONTENT: {
                long cons;      /* ctxt->nbChars snapshot, compared below */
                /*
                 * Handle preparsed entities and charRef
                 */
                if (ctxt->token != 0) {
                    xmlChar chr[2] = { 0 , 0 } ;

                    chr[0] = (xmlChar) ctxt->token;
                    htmlCheckParagraph(ctxt);
                    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
                        ctxt->sax->characters(ctxt->userData, chr, 1);
                    ctxt->token = 0;
                    ctxt->checkIndex = 0;
                }
                /*
                 * Exactly one byte left on the last chunk: unless it is
                 * '<' or '&', deliver it directly through the SAX callbacks.
                 */
                if ((avail == 1) && (terminate)) {
                    cur = in->cur[0];
                    if ((cur != '<') && (cur != '&')) {
                        if (ctxt->sax != NULL) {
                            if (IS_BLANK_CH(cur)) {
                                if (ctxt->sax->ignorableWhitespace != NULL)
                                    ctxt->sax->ignorableWhitespace(
                                            ctxt->userData, &cur, 1);
                            } else {
                                htmlCheckParagraph(ctxt);
                                if (ctxt->sax->characters != NULL)
                                    ctxt->sax->characters(
                                            ctxt->userData, &cur, 1);
                            }
                        }
                        ctxt->token = 0;
                        ctxt->checkIndex = 0;
                        in->cur++;
                        break;
                    }
                }
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                cons = ctxt->nbChars;
                if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
                    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
                    /*
                     * Handle SCRIPT/STYLE separately
                     */
                    if (!terminate) {
                        int idx;
                        xmlChar val;

                        idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
                        if (idx < 0)
                            goto done;
                        val = in->cur[idx + 2];
                        if (val == 0) /* bad cut of input */
                            goto done;
                    }
                    htmlParseScript(ctxt);
                    /* cur/next were sampled before htmlParseScript() ran */
                    if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering END_TAG\n");
#endif
                        break;
                    }
                } else {
                    /*
                     * Sometimes DOCTYPE arrives in the middle of the document
                     */
                    if ((cur == '<') && (next == '!') &&
                        (UPP(2) == 'D') && (UPP(3) == 'O') &&
                        (UPP(4) == 'C') && (UPP(5) == 'T') &&
                        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                        (UPP(8) == 'E')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                            goto done;
                        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                                     "Misplaced DOCTYPE declaration\n",
                                     BAD_CAST "DOCTYPE" , NULL);
                        htmlParseDocTypeDecl(ctxt);
                    } else if ((cur == '<') && (next == '!') &&
                        (in->cur[2] == '-') && (in->cur[3] == '-')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(
                                ctxt, '-', '-', '>', 1) < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing Comment\n");
#endif
                        htmlParseComment(ctxt);
                        ctxt->instate = XML_PARSER_CONTENT;
                    } else if ((cur == '<') && (next == '?')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing PI\n");
#endif
                        htmlParsePI(ctxt);
                        ctxt->instate = XML_PARSER_CONTENT;
                    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
                        goto done;
                    } else if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering END_TAG\n");
#endif
                        break;
                    } else if (cur == '<') {
                        ctxt->instate = XML_PARSER_START_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering START_TAG\n");
#endif
                        break;
                    } else if (cur == '&') {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing Reference\n");
#endif
                        /* TODO: check generation of subtrees if noent !!! */
                        htmlParseReference(ctxt);
                    } else {
                        /*
                         * check that the text sequence is complete
                         * before handing out the data to the parser
                         * to avoid problems with erroneous end of
                         * data detection.
                         */
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
                            goto done;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing char data\n");
#endif
                        htmlParseCharData(ctxt);
                    }
                }
                /*
                 * nbChars did not change during this pass: report it (when
                 * a node is open) and step one character with NEXT.
                 */
                if (cons == ctxt->nbChars) {
                    if (ctxt->node != NULL) {
                        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                                     "detected an error in element content\n",
                                     NULL, NULL);
                    }
                    NEXT;
                    break;
                }

                break;
            }
            case XML_PARSER_END_TAG:
                if (avail < 2)
                    goto done;
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                    goto done;
                htmlParseEndTag(ctxt);
                /* empty name stack: switch to EPILOG */
                if (ctxt->nameNr == 0) {
                    ctxt->instate = XML_PARSER_EPILOG;
                } else {
                    ctxt->instate = XML_PARSER_CONTENT;
                }
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                /* NOTE(review): printed even when the new state is EPILOG */
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            /*
             * The remaining states are reported as internal errors and
             * the machine is put back into CONTENT (START_TAG for
             * ATTRIBUTE_VALUE).
             */
            case XML_PARSER_CDATA_SECTION:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == CDATA\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_DTD:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == DTD\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_COMMENT:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == COMMENT\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_PI:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == PI\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_ENTITY_DECL:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == ENTITY_DECL\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_ENTITY_VALUE:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == ENTITY_VALUE\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                /* NOTE(review): trace says DTD, state set above is CONTENT */
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering DTD\n");
#endif
                break;
            case XML_PARSER_ATTRIBUTE_VALUE:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == ATTRIBUTE_VALUE\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_START_TAG;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering START_TAG\n");
#endif
                break;
            case XML_PARSER_SYSTEM_LITERAL:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_IGNORE:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == XML_PARSER_IGNORE\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_PUBLIC_LITERAL:
                /* NOTE(review): message names XML_PARSER_LITERAL, not PUBLIC_LITERAL */
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == XML_PARSER_LITERAL\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;

        }
    }
done:
    /* Same end-of-input handling as at the top of the loop. */
    if ((avail == 0) && (terminate)) {
        htmlAutoCloseOnEnd(ctxt);
        if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
            /*
             * SAX: end of the document processing.
             */
            ctxt->instate = XML_PARSER_EOF;
            if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                ctxt->sax->endDocument(ctxt->userData);
        }
    }
    /* Attach the default HTML 4.0 Transitional subset if none exists. */
    if ((ctxt->myDoc != NULL) &&
        ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
         (ctxt->instate == XML_PARSER_EPILOG))) {
        xmlDtdPtr dtd;
        dtd = xmlGetIntSubset(ctxt->myDoc);
        if (dtd == NULL)
            ctxt->myDoc->intSubset =
                xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
                    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
                    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
    }
#ifdef DEBUG_PUSH
    xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
#endif
    return(ret);
}

/**
 * htmlParseChunk:
 * @ctxt:  an HTML parser context
 * @chunk:  an char array
 * @size:  the size in byte of the chunk
 * @terminate:  last chunk indicator
 *
 * Parse a Chunk of memory
 *
 * Returns zero if no error, the xmlParserErrors otherwise.
5341 */ 5342 int 5343 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, 5344 int terminate) { 5345 if ((ctxt == NULL) || (ctxt->input == NULL)) { 5346 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5347 "htmlParseChunk: context error\n", NULL, NULL); 5348 return(XML_ERR_INTERNAL_ERROR); 5349 } 5350 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 5351 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) { 5352 int base = ctxt->input->base - ctxt->input->buf->buffer->content; 5353 int cur = ctxt->input->cur - ctxt->input->base; 5354 int res; 5355 5356 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 5357 if (res < 0) { 5358 ctxt->errNo = XML_PARSER_EOF; 5359 ctxt->disableSAX = 1; 5360 return (XML_PARSER_EOF); 5361 } 5362 ctxt->input->base = ctxt->input->buf->buffer->content + base; 5363 ctxt->input->cur = ctxt->input->base + cur; 5364 ctxt->input->end = 5365 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; 5366 #ifdef DEBUG_PUSH 5367 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 5368 #endif 5369 5370 #if 0 5371 if ((terminate) || (ctxt->input->buf->buffer->use > 80)) 5372 htmlParseTryOrFinish(ctxt, terminate); 5373 #endif 5374 } else if (ctxt->instate != XML_PARSER_EOF) { 5375 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) { 5376 xmlParserInputBufferPtr in = ctxt->input->buf; 5377 if ((in->encoder != NULL) && (in->buffer != NULL) && 5378 (in->raw != NULL)) { 5379 int nbchars; 5380 5381 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw); 5382 if (nbchars < 0) { 5383 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 5384 "encoder error\n", NULL, NULL); 5385 return(XML_ERR_INVALID_ENCODING); 5386 } 5387 } 5388 } 5389 } 5390 htmlParseTryOrFinish(ctxt, terminate); 5391 if (terminate) { 5392 if ((ctxt->instate != XML_PARSER_EOF) && 5393 (ctxt->instate != XML_PARSER_EPILOG) && 5394 (ctxt->instate != XML_PARSER_MISC)) { 5395 ctxt->errNo = XML_ERR_DOCUMENT_END; 5396 
ctxt->wellFormed = 0; 5397 } 5398 if (ctxt->instate != XML_PARSER_EOF) { 5399 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5400 ctxt->sax->endDocument(ctxt->userData); 5401 } 5402 ctxt->instate = XML_PARSER_EOF; 5403 } 5404 return((xmlParserErrors) ctxt->errNo); 5405 } 5406 5407 /************************************************************************ 5408 * * 5409 * User entry points * 5410 * * 5411 ************************************************************************/ 5412 5413 /** 5414 * htmlCreatePushParserCtxt: 5415 * @sax: a SAX handler 5416 * @user_data: The user data returned on SAX callbacks 5417 * @chunk: a pointer to an array of chars 5418 * @size: number of chars in the array 5419 * @filename: an optional file name or URI 5420 * @enc: an optional encoding 5421 * 5422 * Create a parser context for using the HTML parser in push mode 5423 * The value of @filename is used for fetching external entities 5424 * and error/warning reports. 5425 * 5426 * Returns the new parser context or NULL 5427 */ 5428 htmlParserCtxtPtr 5429 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, 5430 const char *chunk, int size, const char *filename, 5431 xmlCharEncoding enc) { 5432 htmlParserCtxtPtr ctxt; 5433 htmlParserInputPtr inputStream; 5434 xmlParserInputBufferPtr buf; 5435 5436 xmlInitParser(); 5437 5438 buf = xmlAllocParserInputBuffer(enc); 5439 if (buf == NULL) return(NULL); 5440 5441 ctxt = htmlNewParserCtxt(); 5442 if (ctxt == NULL) { 5443 xmlFreeParserInputBuffer(buf); 5444 return(NULL); 5445 } 5446 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder) 5447 ctxt->charset=XML_CHAR_ENCODING_UTF8; 5448 if (sax != NULL) { 5449 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler) 5450 xmlFree(ctxt->sax); 5451 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler)); 5452 if (ctxt->sax == NULL) { 5453 xmlFree(buf); 5454 xmlFree(ctxt); 5455 return(NULL); 5456 } 5457 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler)); 5458 if (user_data != 
NULL) 5459 ctxt->userData = user_data; 5460 } 5461 if (filename == NULL) { 5462 ctxt->directory = NULL; 5463 } else { 5464 ctxt->directory = xmlParserGetDirectory(filename); 5465 } 5466 5467 inputStream = htmlNewInputStream(ctxt); 5468 if (inputStream == NULL) { 5469 xmlFreeParserCtxt(ctxt); 5470 xmlFree(buf); 5471 return(NULL); 5472 } 5473 5474 if (filename == NULL) 5475 inputStream->filename = NULL; 5476 else 5477 inputStream->filename = (char *) 5478 xmlCanonicPath((const xmlChar *) filename); 5479 inputStream->buf = buf; 5480 inputStream->base = inputStream->buf->buffer->content; 5481 inputStream->cur = inputStream->buf->buffer->content; 5482 inputStream->end = 5483 &inputStream->buf->buffer->content[inputStream->buf->buffer->use]; 5484 5485 inputPush(ctxt, inputStream); 5486 5487 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 5488 (ctxt->input->buf != NULL)) { 5489 int base = ctxt->input->base - ctxt->input->buf->buffer->content; 5490 int cur = ctxt->input->cur - ctxt->input->base; 5491 5492 xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 5493 5494 ctxt->input->base = ctxt->input->buf->buffer->content + base; 5495 ctxt->input->cur = ctxt->input->base + cur; 5496 ctxt->input->end = 5497 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; 5498 #ifdef DEBUG_PUSH 5499 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 5500 #endif 5501 } 5502 ctxt->progressive = 1; 5503 5504 return(ctxt); 5505 } 5506 #endif /* LIBXML_PUSH_ENABLED */ 5507 5508 /** 5509 * htmlSAXParseDoc: 5510 * @cur: a pointer to an array of xmlChar 5511 * @encoding: a free form C string describing the HTML document encoding, or NULL 5512 * @sax: the SAX handler block 5513 * @userData: if using SAX, this pointer will be provided on callbacks. 5514 * 5515 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks 5516 * to handle parse events. If sax is NULL, fallback to the default DOM 5517 * behavior and return a tree. 
5518 * 5519 * Returns the resulting document tree unless SAX is NULL or the document is 5520 * not well formed. 5521 */ 5522 5523 htmlDocPtr 5524 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) { 5525 htmlDocPtr ret; 5526 htmlParserCtxtPtr ctxt; 5527 5528 xmlInitParser(); 5529 5530 if (cur == NULL) return(NULL); 5531 5532 5533 ctxt = htmlCreateDocParserCtxt(cur, encoding); 5534 if (ctxt == NULL) return(NULL); 5535 if (sax != NULL) { 5536 if (ctxt->sax != NULL) xmlFree (ctxt->sax); 5537 ctxt->sax = sax; 5538 ctxt->userData = userData; 5539 } 5540 5541 htmlParseDocument(ctxt); 5542 ret = ctxt->myDoc; 5543 if (sax != NULL) { 5544 ctxt->sax = NULL; 5545 ctxt->userData = NULL; 5546 } 5547 htmlFreeParserCtxt(ctxt); 5548 5549 return(ret); 5550 } 5551 5552 /** 5553 * htmlParseDoc: 5554 * @cur: a pointer to an array of xmlChar 5555 * @encoding: a free form C string describing the HTML document encoding, or NULL 5556 * 5557 * parse an HTML in-memory document and build a tree. 5558 * 5559 * Returns the resulting document tree 5560 */ 5561 5562 htmlDocPtr 5563 htmlParseDoc(xmlChar *cur, const char *encoding) { 5564 return(htmlSAXParseDoc(cur, encoding, NULL, NULL)); 5565 } 5566 5567 5568 /** 5569 * htmlCreateFileParserCtxt: 5570 * @filename: the filename 5571 * @encoding: a free form C string describing the HTML document encoding, or NULL 5572 * 5573 * Create a parser context for a file content. 5574 * Automatic support for ZLIB/Compress compressed document is provided 5575 * by default if found at compile-time. 
5576 * 5577 * Returns the new parser context or NULL 5578 */ 5579 htmlParserCtxtPtr 5580 htmlCreateFileParserCtxt(const char *filename, const char *encoding) 5581 { 5582 htmlParserCtxtPtr ctxt; 5583 htmlParserInputPtr inputStream; 5584 char *canonicFilename; 5585 /* htmlCharEncoding enc; */ 5586 xmlChar *content, *content_line = (xmlChar *) "charset="; 5587 5588 if (filename == NULL) 5589 return(NULL); 5590 5591 ctxt = htmlNewParserCtxt(); 5592 if (ctxt == NULL) { 5593 return(NULL); 5594 } 5595 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename); 5596 if (canonicFilename == NULL) { 5597 #ifdef LIBXML_SAX1_ENABLED 5598 if (xmlDefaultSAXHandler.error != NULL) { 5599 xmlDefaultSAXHandler.error(NULL, "out of memory\n"); 5600 } 5601 #endif 5602 xmlFreeParserCtxt(ctxt); 5603 return(NULL); 5604 } 5605 5606 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt); 5607 xmlFree(canonicFilename); 5608 if (inputStream == NULL) { 5609 xmlFreeParserCtxt(ctxt); 5610 return(NULL); 5611 } 5612 5613 inputPush(ctxt, inputStream); 5614 5615 /* set encoding */ 5616 if (encoding) { 5617 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1); 5618 if (content) { 5619 strcpy ((char *)content, (char *)content_line); 5620 strcat ((char *)content, (char *)encoding); 5621 htmlCheckEncoding (ctxt, content); 5622 xmlFree (content); 5623 } 5624 } 5625 5626 return(ctxt); 5627 } 5628 5629 /** 5630 * htmlSAXParseFile: 5631 * @filename: the filename 5632 * @encoding: a free form C string describing the HTML document encoding, or NULL 5633 * @sax: the SAX handler block 5634 * @userData: if using SAX, this pointer will be provided on callbacks. 5635 * 5636 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 5637 * compressed document is provided by default if found at compile-time. 5638 * It use the given SAX function block to handle the parsing callback. 5639 * If sax is NULL, fallback to the default DOM tree building routines. 
5640 * 5641 * Returns the resulting document tree unless SAX is NULL or the document is 5642 * not well formed. 5643 */ 5644 5645 htmlDocPtr 5646 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax, 5647 void *userData) { 5648 htmlDocPtr ret; 5649 htmlParserCtxtPtr ctxt; 5650 htmlSAXHandlerPtr oldsax = NULL; 5651 5652 xmlInitParser(); 5653 5654 ctxt = htmlCreateFileParserCtxt(filename, encoding); 5655 if (ctxt == NULL) return(NULL); 5656 if (sax != NULL) { 5657 oldsax = ctxt->sax; 5658 ctxt->sax = sax; 5659 ctxt->userData = userData; 5660 } 5661 5662 htmlParseDocument(ctxt); 5663 5664 ret = ctxt->myDoc; 5665 if (sax != NULL) { 5666 ctxt->sax = oldsax; 5667 ctxt->userData = NULL; 5668 } 5669 htmlFreeParserCtxt(ctxt); 5670 5671 return(ret); 5672 } 5673 5674 /** 5675 * htmlParseFile: 5676 * @filename: the filename 5677 * @encoding: a free form C string describing the HTML document encoding, or NULL 5678 * 5679 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 5680 * compressed document is provided by default if found at compile-time. 5681 * 5682 * Returns the resulting document tree 5683 */ 5684 5685 htmlDocPtr 5686 htmlParseFile(const char *filename, const char *encoding) { 5687 return(htmlSAXParseFile(filename, encoding, NULL, NULL)); 5688 } 5689 5690 /** 5691 * htmlHandleOmittedElem: 5692 * @val: int 0 or 1 5693 * 5694 * Set and return the previous value for handling HTML omitted tags. 5695 * 5696 * Returns the last value for 0 for no handling, 1 for auto insertion. 5697 */ 5698 5699 int 5700 htmlHandleOmittedElem(int val) { 5701 int old = htmlOmittedDefaultValue; 5702 5703 htmlOmittedDefaultValue = val; 5704 return(old); 5705 } 5706 5707 /** 5708 * htmlElementAllowedHere: 5709 * @parent: HTML parent element 5710 * @elt: HTML element 5711 * 5712 * Checks whether an HTML element may be a direct child of a parent element. 
5713 * Note - doesn't check for deprecated elements 5714 * 5715 * Returns 1 if allowed; 0 otherwise. 5716 */ 5717 int 5718 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) { 5719 const char** p ; 5720 5721 if ( ! elt || ! parent || ! parent->subelts ) 5722 return 0 ; 5723 5724 for ( p = parent->subelts; *p; ++p ) 5725 if ( !xmlStrcmp((const xmlChar *)*p, elt) ) 5726 return 1 ; 5727 5728 return 0 ; 5729 } 5730 /** 5731 * htmlElementStatusHere: 5732 * @parent: HTML parent element 5733 * @elt: HTML element 5734 * 5735 * Checks whether an HTML element may be a direct child of a parent element. 5736 * and if so whether it is valid or deprecated. 5737 * 5738 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID 5739 */ 5740 htmlStatus 5741 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) { 5742 if ( ! parent || ! elt ) 5743 return HTML_INVALID ; 5744 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) ) 5745 return HTML_INVALID ; 5746 5747 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ; 5748 } 5749 /** 5750 * htmlAttrAllowed: 5751 * @elt: HTML element 5752 * @attr: HTML attribute 5753 * @legacy: whether to allow deprecated attributes 5754 * 5755 * Checks whether an attribute is valid for an element 5756 * Has full knowledge of Required and Deprecated attributes 5757 * 5758 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID 5759 */ 5760 htmlStatus 5761 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) { 5762 const char** p ; 5763 5764 if ( !elt || ! 
attr ) 5765 return HTML_INVALID ; 5766 5767 if ( elt->attrs_req ) 5768 for ( p = elt->attrs_req; *p; ++p) 5769 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 5770 return HTML_REQUIRED ; 5771 5772 if ( elt->attrs_opt ) 5773 for ( p = elt->attrs_opt; *p; ++p) 5774 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 5775 return HTML_VALID ; 5776 5777 if ( legacy && elt->attrs_depr ) 5778 for ( p = elt->attrs_depr; *p; ++p) 5779 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 5780 return HTML_DEPRECATED ; 5781 5782 return HTML_INVALID ; 5783 } 5784 /** 5785 * htmlNodeStatus: 5786 * @node: an htmlNodePtr in a tree 5787 * @legacy: whether to allow deprecated elements (YES is faster here 5788 * for Element nodes) 5789 * 5790 * Checks whether the tree node is valid. Experimental (the author 5791 * only uses the HTML enhancements in a SAX parser) 5792 * 5793 * Return: for Element nodes, a return from htmlElementAllowedHere (if 5794 * legacy allowed) or htmlElementStatusHere (otherwise). 5795 * for Attribute nodes, a return from htmlAttrAllowed 5796 * for other nodes, HTML_NA (no checks performed) 5797 */ 5798 htmlStatus 5799 htmlNodeStatus(const htmlNodePtr node, int legacy) { 5800 if ( ! node ) 5801 return HTML_INVALID ; 5802 5803 switch ( node->type ) { 5804 case XML_ELEMENT_NODE: 5805 return legacy 5806 ? ( htmlElementAllowedHere ( 5807 htmlTagLookup(node->parent->name) , node->name 5808 ) ? 
HTML_VALID : HTML_INVALID ) 5809 : htmlElementStatusHere( 5810 htmlTagLookup(node->parent->name) , 5811 htmlTagLookup(node->name) ) 5812 ; 5813 case XML_ATTRIBUTE_NODE: 5814 return htmlAttrAllowed( 5815 htmlTagLookup(node->parent->name) , node->name, legacy) ; 5816 default: return HTML_NA ; 5817 } 5818 } 5819 /************************************************************************ 5820 * * 5821 * New set (2.6.0) of simpler and more flexible APIs * 5822 * * 5823 ************************************************************************/ 5824 /** 5825 * DICT_FREE: 5826 * @str: a string 5827 * 5828 * Free a string if it is not owned by the "dict" dictionnary in the 5829 * current scope 5830 */ 5831 #define DICT_FREE(str) \ 5832 if ((str) && ((!dict) || \ 5833 (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \ 5834 xmlFree((char *)(str)); 5835 5836 /** 5837 * htmlCtxtReset: 5838 * @ctxt: an HTML parser context 5839 * 5840 * Reset a parser context 5841 */ 5842 void 5843 htmlCtxtReset(htmlParserCtxtPtr ctxt) 5844 { 5845 xmlParserInputPtr input; 5846 xmlDictPtr dict; 5847 5848 if (ctxt == NULL) 5849 return; 5850 5851 xmlInitParser(); 5852 dict = ctxt->dict; 5853 5854 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */ 5855 xmlFreeInputStream(input); 5856 } 5857 ctxt->inputNr = 0; 5858 ctxt->input = NULL; 5859 5860 ctxt->spaceNr = 0; 5861 if (ctxt->spaceTab != NULL) { 5862 ctxt->spaceTab[0] = -1; 5863 ctxt->space = &ctxt->spaceTab[0]; 5864 } else { 5865 ctxt->space = NULL; 5866 } 5867 5868 5869 ctxt->nodeNr = 0; 5870 ctxt->node = NULL; 5871 5872 ctxt->nameNr = 0; 5873 ctxt->name = NULL; 5874 5875 DICT_FREE(ctxt->version); 5876 ctxt->version = NULL; 5877 DICT_FREE(ctxt->encoding); 5878 ctxt->encoding = NULL; 5879 DICT_FREE(ctxt->directory); 5880 ctxt->directory = NULL; 5881 DICT_FREE(ctxt->extSubURI); 5882 ctxt->extSubURI = NULL; 5883 DICT_FREE(ctxt->extSubSystem); 5884 ctxt->extSubSystem = NULL; 5885 if (ctxt->myDoc != NULL) 5886 xmlFreeDoc(ctxt->myDoc); 5887 
ctxt->myDoc = NULL; 5888 5889 ctxt->standalone = -1; 5890 ctxt->hasExternalSubset = 0; 5891 ctxt->hasPErefs = 0; 5892 ctxt->html = 1; 5893 ctxt->external = 0; 5894 ctxt->instate = XML_PARSER_START; 5895 ctxt->token = 0; 5896 5897 ctxt->wellFormed = 1; 5898 ctxt->nsWellFormed = 1; 5899 ctxt->valid = 1; 5900 ctxt->vctxt.userData = ctxt; 5901 ctxt->vctxt.error = xmlParserValidityError; 5902 ctxt->vctxt.warning = xmlParserValidityWarning; 5903 ctxt->record_info = 0; 5904 ctxt->nbChars = 0; 5905 ctxt->checkIndex = 0; 5906 ctxt->inSubset = 0; 5907 ctxt->errNo = XML_ERR_OK; 5908 ctxt->depth = 0; 5909 ctxt->charset = XML_CHAR_ENCODING_NONE; 5910 ctxt->catalogs = NULL; 5911 xmlInitNodeInfoSeq(&ctxt->node_seq); 5912 5913 if (ctxt->attsDefault != NULL) { 5914 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree); 5915 ctxt->attsDefault = NULL; 5916 } 5917 if (ctxt->attsSpecial != NULL) { 5918 xmlHashFree(ctxt->attsSpecial, NULL); 5919 ctxt->attsSpecial = NULL; 5920 } 5921 } 5922 5923 /** 5924 * htmlCtxtUseOptions: 5925 * @ctxt: an HTML parser context 5926 * @options: a combination of htmlParserOption(s) 5927 * 5928 * Applies the options to the parser context 5929 * 5930 * Returns 0 in case of success, the set of unknown or unimplemented options 5931 * in case of error. 
5932 */ 5933 int 5934 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) 5935 { 5936 if (ctxt == NULL) 5937 return(-1); 5938 5939 if (options & HTML_PARSE_NOWARNING) { 5940 ctxt->sax->warning = NULL; 5941 ctxt->vctxt.warning = NULL; 5942 options -= XML_PARSE_NOWARNING; 5943 ctxt->options |= XML_PARSE_NOWARNING; 5944 } 5945 if (options & HTML_PARSE_NOERROR) { 5946 ctxt->sax->error = NULL; 5947 ctxt->vctxt.error = NULL; 5948 ctxt->sax->fatalError = NULL; 5949 options -= XML_PARSE_NOERROR; 5950 ctxt->options |= XML_PARSE_NOERROR; 5951 } 5952 if (options & HTML_PARSE_PEDANTIC) { 5953 ctxt->pedantic = 1; 5954 options -= XML_PARSE_PEDANTIC; 5955 ctxt->options |= XML_PARSE_PEDANTIC; 5956 } else 5957 ctxt->pedantic = 0; 5958 if (options & XML_PARSE_NOBLANKS) { 5959 ctxt->keepBlanks = 0; 5960 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace; 5961 options -= XML_PARSE_NOBLANKS; 5962 ctxt->options |= XML_PARSE_NOBLANKS; 5963 } else 5964 ctxt->keepBlanks = 1; 5965 if (options & HTML_PARSE_RECOVER) { 5966 ctxt->recovery = 1; 5967 options -= HTML_PARSE_RECOVER; 5968 } else 5969 ctxt->recovery = 0; 5970 if (options & HTML_PARSE_COMPACT) { 5971 ctxt->options |= HTML_PARSE_COMPACT; 5972 options -= HTML_PARSE_COMPACT; 5973 } 5974 ctxt->dictNames = 0; 5975 return (options); 5976 } 5977 5978 /** 5979 * htmlDoRead: 5980 * @ctxt: an HTML parser context 5981 * @URL: the base URL to use for the document 5982 * @encoding: the document encoding, or NULL 5983 * @options: a combination of htmlParserOption(s) 5984 * @reuse: keep the context for reuse 5985 * 5986 * Common front-end for the htmlRead functions 5987 * 5988 * Returns the resulting document tree or NULL 5989 */ 5990 static htmlDocPtr 5991 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding, 5992 int options, int reuse) 5993 { 5994 htmlDocPtr ret; 5995 5996 htmlCtxtUseOptions(ctxt, options); 5997 ctxt->html = 1; 5998 if (encoding != NULL) { 5999 xmlCharEncodingHandlerPtr hdlr; 6000 6001 hdlr = 
xmlFindCharEncodingHandler(encoding); 6002 if (hdlr != NULL) { 6003 xmlSwitchToEncoding(ctxt, hdlr); 6004 if (ctxt->input->encoding != NULL) 6005 xmlFree((xmlChar *) ctxt->input->encoding); 6006 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding); 6007 } 6008 } 6009 if ((URL != NULL) && (ctxt->input != NULL) && 6010 (ctxt->input->filename == NULL)) 6011 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL); 6012 htmlParseDocument(ctxt); 6013 ret = ctxt->myDoc; 6014 ctxt->myDoc = NULL; 6015 if (!reuse) { 6016 if ((ctxt->dictNames) && 6017 (ret != NULL) && 6018 (ret->dict == ctxt->dict)) 6019 ctxt->dict = NULL; 6020 xmlFreeParserCtxt(ctxt); 6021 } 6022 return (ret); 6023 } 6024 6025 /** 6026 * htmlReadDoc: 6027 * @cur: a pointer to a zero terminated string 6028 * @URL: the base URL to use for the document 6029 * @encoding: the document encoding, or NULL 6030 * @options: a combination of htmlParserOption(s) 6031 * 6032 * parse an XML in-memory document and build a tree. 6033 * 6034 * Returns the resulting document tree 6035 */ 6036 htmlDocPtr 6037 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options) 6038 { 6039 htmlParserCtxtPtr ctxt; 6040 6041 if (cur == NULL) 6042 return (NULL); 6043 6044 xmlInitParser(); 6045 ctxt = htmlCreateDocParserCtxt(cur, NULL); 6046 if (ctxt == NULL) 6047 return (NULL); 6048 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6049 } 6050 6051 /** 6052 * htmlReadFile: 6053 * @filename: a file or URL 6054 * @encoding: the document encoding, or NULL 6055 * @options: a combination of htmlParserOption(s) 6056 * 6057 * parse an XML file from the filesystem or the network. 
6058 * 6059 * Returns the resulting document tree 6060 */ 6061 htmlDocPtr 6062 htmlReadFile(const char *filename, const char *encoding, int options) 6063 { 6064 htmlParserCtxtPtr ctxt; 6065 6066 xmlInitParser(); 6067 ctxt = htmlCreateFileParserCtxt(filename, encoding); 6068 if (ctxt == NULL) 6069 return (NULL); 6070 return (htmlDoRead(ctxt, NULL, NULL, options, 0)); 6071 } 6072 6073 /** 6074 * htmlReadMemory: 6075 * @buffer: a pointer to a char array 6076 * @size: the size of the array 6077 * @URL: the base URL to use for the document 6078 * @encoding: the document encoding, or NULL 6079 * @options: a combination of htmlParserOption(s) 6080 * 6081 * parse an XML in-memory document and build a tree. 6082 * 6083 * Returns the resulting document tree 6084 */ 6085 htmlDocPtr 6086 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options) 6087 { 6088 htmlParserCtxtPtr ctxt; 6089 6090 xmlInitParser(); 6091 ctxt = xmlCreateMemoryParserCtxt(buffer, size); 6092 if (ctxt == NULL) 6093 return (NULL); 6094 htmlDefaultSAXHandlerInit(); 6095 if (ctxt->sax != NULL) 6096 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 6097 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6098 } 6099 6100 /** 6101 * htmlReadFd: 6102 * @fd: an open file descriptor 6103 * @URL: the base URL to use for the document 6104 * @encoding: the document encoding, or NULL 6105 * @options: a combination of htmlParserOption(s) 6106 * 6107 * parse an XML from a file descriptor and build a tree. 
6108 * 6109 * Returns the resulting document tree 6110 */ 6111 htmlDocPtr 6112 htmlReadFd(int fd, const char *URL, const char *encoding, int options) 6113 { 6114 htmlParserCtxtPtr ctxt; 6115 xmlParserInputBufferPtr input; 6116 xmlParserInputPtr stream; 6117 6118 if (fd < 0) 6119 return (NULL); 6120 6121 xmlInitParser(); 6122 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 6123 if (input == NULL) 6124 return (NULL); 6125 ctxt = xmlNewParserCtxt(); 6126 if (ctxt == NULL) { 6127 xmlFreeParserInputBuffer(input); 6128 return (NULL); 6129 } 6130 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6131 if (stream == NULL) { 6132 xmlFreeParserInputBuffer(input); 6133 xmlFreeParserCtxt(ctxt); 6134 return (NULL); 6135 } 6136 inputPush(ctxt, stream); 6137 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6138 } 6139 6140 /** 6141 * htmlReadIO: 6142 * @ioread: an I/O read function 6143 * @ioclose: an I/O close function 6144 * @ioctx: an I/O handler 6145 * @URL: the base URL to use for the document 6146 * @encoding: the document encoding, or NULL 6147 * @options: a combination of htmlParserOption(s) 6148 * 6149 * parse an HTML document from I/O functions and source and build a tree. 
6150 * 6151 * Returns the resulting document tree 6152 */ 6153 htmlDocPtr 6154 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, 6155 void *ioctx, const char *URL, const char *encoding, int options) 6156 { 6157 htmlParserCtxtPtr ctxt; 6158 xmlParserInputBufferPtr input; 6159 xmlParserInputPtr stream; 6160 6161 if (ioread == NULL) 6162 return (NULL); 6163 xmlInitParser(); 6164 6165 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 6166 XML_CHAR_ENCODING_NONE); 6167 if (input == NULL) 6168 return (NULL); 6169 ctxt = htmlNewParserCtxt(); 6170 if (ctxt == NULL) { 6171 xmlFreeParserInputBuffer(input); 6172 return (NULL); 6173 } 6174 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6175 if (stream == NULL) { 6176 xmlFreeParserInputBuffer(input); 6177 xmlFreeParserCtxt(ctxt); 6178 return (NULL); 6179 } 6180 inputPush(ctxt, stream); 6181 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6182 } 6183 6184 /** 6185 * htmlCtxtReadDoc: 6186 * @ctxt: an HTML parser context 6187 * @cur: a pointer to a zero terminated string 6188 * @URL: the base URL to use for the document 6189 * @encoding: the document encoding, or NULL 6190 * @options: a combination of htmlParserOption(s) 6191 * 6192 * parse an XML in-memory document and build a tree. 
6193 * This reuses the existing @ctxt parser context 6194 * 6195 * Returns the resulting document tree 6196 */ 6197 htmlDocPtr 6198 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur, 6199 const char *URL, const char *encoding, int options) 6200 { 6201 xmlParserInputPtr stream; 6202 6203 if (cur == NULL) 6204 return (NULL); 6205 if (ctxt == NULL) 6206 return (NULL); 6207 6208 htmlCtxtReset(ctxt); 6209 6210 stream = xmlNewStringInputStream(ctxt, cur); 6211 if (stream == NULL) { 6212 return (NULL); 6213 } 6214 inputPush(ctxt, stream); 6215 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6216 } 6217 6218 /** 6219 * htmlCtxtReadFile: 6220 * @ctxt: an HTML parser context 6221 * @filename: a file or URL 6222 * @encoding: the document encoding, or NULL 6223 * @options: a combination of htmlParserOption(s) 6224 * 6225 * parse an XML file from the filesystem or the network. 6226 * This reuses the existing @ctxt parser context 6227 * 6228 * Returns the resulting document tree 6229 */ 6230 htmlDocPtr 6231 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename, 6232 const char *encoding, int options) 6233 { 6234 xmlParserInputPtr stream; 6235 6236 if (filename == NULL) 6237 return (NULL); 6238 if (ctxt == NULL) 6239 return (NULL); 6240 6241 htmlCtxtReset(ctxt); 6242 6243 stream = xmlLoadExternalEntity(filename, NULL, ctxt); 6244 if (stream == NULL) { 6245 return (NULL); 6246 } 6247 inputPush(ctxt, stream); 6248 return (htmlDoRead(ctxt, NULL, encoding, options, 1)); 6249 } 6250 6251 /** 6252 * htmlCtxtReadMemory: 6253 * @ctxt: an HTML parser context 6254 * @buffer: a pointer to a char array 6255 * @size: the size of the array 6256 * @URL: the base URL to use for the document 6257 * @encoding: the document encoding, or NULL 6258 * @options: a combination of htmlParserOption(s) 6259 * 6260 * parse an XML in-memory document and build a tree. 
6261 * This reuses the existing @ctxt parser context 6262 * 6263 * Returns the resulting document tree 6264 */ 6265 htmlDocPtr 6266 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size, 6267 const char *URL, const char *encoding, int options) 6268 { 6269 xmlParserInputBufferPtr input; 6270 xmlParserInputPtr stream; 6271 6272 if (ctxt == NULL) 6273 return (NULL); 6274 if (buffer == NULL) 6275 return (NULL); 6276 6277 htmlCtxtReset(ctxt); 6278 6279 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); 6280 if (input == NULL) { 6281 return(NULL); 6282 } 6283 6284 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6285 if (stream == NULL) { 6286 xmlFreeParserInputBuffer(input); 6287 return(NULL); 6288 } 6289 6290 inputPush(ctxt, stream); 6291 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6292 } 6293 6294 /** 6295 * htmlCtxtReadFd: 6296 * @ctxt: an HTML parser context 6297 * @fd: an open file descriptor 6298 * @URL: the base URL to use for the document 6299 * @encoding: the document encoding, or NULL 6300 * @options: a combination of htmlParserOption(s) 6301 * 6302 * parse an XML from a file descriptor and build a tree. 
6303 * This reuses the existing @ctxt parser context 6304 * 6305 * Returns the resulting document tree 6306 */ 6307 htmlDocPtr 6308 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd, 6309 const char *URL, const char *encoding, int options) 6310 { 6311 xmlParserInputBufferPtr input; 6312 xmlParserInputPtr stream; 6313 6314 if (fd < 0) 6315 return (NULL); 6316 if (ctxt == NULL) 6317 return (NULL); 6318 6319 htmlCtxtReset(ctxt); 6320 6321 6322 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 6323 if (input == NULL) 6324 return (NULL); 6325 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6326 if (stream == NULL) { 6327 xmlFreeParserInputBuffer(input); 6328 return (NULL); 6329 } 6330 inputPush(ctxt, stream); 6331 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6332 } 6333 6334 /** 6335 * htmlCtxtReadIO: 6336 * @ctxt: an HTML parser context 6337 * @ioread: an I/O read function 6338 * @ioclose: an I/O close function 6339 * @ioctx: an I/O handler 6340 * @URL: the base URL to use for the document 6341 * @encoding: the document encoding, or NULL 6342 * @options: a combination of htmlParserOption(s) 6343 * 6344 * parse an HTML document from I/O functions and source and build a tree. 
6345 * This reuses the existing @ctxt parser context 6346 * 6347 * Returns the resulting document tree 6348 */ 6349 htmlDocPtr 6350 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, 6351 xmlInputCloseCallback ioclose, void *ioctx, 6352 const char *URL, 6353 const char *encoding, int options) 6354 { 6355 xmlParserInputBufferPtr input; 6356 xmlParserInputPtr stream; 6357 6358 if (ioread == NULL) 6359 return (NULL); 6360 if (ctxt == NULL) 6361 return (NULL); 6362 6363 htmlCtxtReset(ctxt); 6364 6365 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 6366 XML_CHAR_ENCODING_NONE); 6367 if (input == NULL) 6368 return (NULL); 6369 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6370 if (stream == NULL) { 6371 xmlFreeParserInputBuffer(input); 6372 return (NULL); 6373 } 6374 inputPush(ctxt, stream); 6375 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6376 } 6377 6378 #define bottom_HTMLparser 6379 #include "elfgcchack.h" 6380 #endif /* LIBXML_HTML_ENABLED */ 6381