1 /* 2 * HTMLparser.c : an HTML 4.0 non-verifying parser 3 * 4 * See Copyright for the status of this software. 5 * 6 * daniel (at) veillard.com 7 */ 8 9 #define IN_LIBXML 10 #include "libxml.h" 11 #ifdef LIBXML_HTML_ENABLED 12 13 #include <string.h> 14 #ifdef HAVE_CTYPE_H 15 #include <ctype.h> 16 #endif 17 #ifdef HAVE_STDLIB_H 18 #include <stdlib.h> 19 #endif 20 #ifdef HAVE_SYS_STAT_H 21 #include <sys/stat.h> 22 #endif 23 #ifdef HAVE_FCNTL_H 24 #include <fcntl.h> 25 #endif 26 #ifdef HAVE_UNISTD_H 27 #include <unistd.h> 28 #endif 29 #ifdef HAVE_ZLIB_H 30 #include <zlib.h> 31 #endif 32 33 #include <libxml/xmlmemory.h> 34 #include <libxml/tree.h> 35 #include <libxml/parser.h> 36 #include <libxml/parserInternals.h> 37 #include <libxml/xmlerror.h> 38 #include <libxml/HTMLparser.h> 39 #include <libxml/HTMLtree.h> 40 #include <libxml/entities.h> 41 #include <libxml/encoding.h> 42 #include <libxml/valid.h> 43 #include <libxml/xmlIO.h> 44 #include <libxml/globals.h> 45 #include <libxml/uri.h> 46 47 #define HTML_MAX_NAMELEN 1000 48 #define HTML_PARSER_BIG_BUFFER_SIZE 1000 49 #define HTML_PARSER_BUFFER_SIZE 100 50 51 /* #define DEBUG */ 52 /* #define DEBUG_PUSH */ 53 54 static int htmlOmittedDefaultValue = 1; 55 56 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, 57 xmlChar end, xmlChar end2, xmlChar end3); 58 static void htmlParseComment(htmlParserCtxtPtr ctxt); 59 60 /************************************************************************ 61 * * 62 * Some factorized error routines * 63 * * 64 ************************************************************************/ 65 66 /** 67 * htmlErrMemory: 68 * @ctxt: an HTML parser context 69 * @extra: extra informations 70 * 71 * Handle a redefinition of attribute error 72 */ 73 static void 74 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra) 75 { 76 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 77 (ctxt->instate == XML_PARSER_EOF)) 78 return; 79 if (ctxt != NULL) { 80 ctxt->errNo = XML_ERR_NO_MEMORY; 81 
ctxt->instate = XML_PARSER_EOF; 82 ctxt->disableSAX = 1; 83 } 84 if (extra) 85 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 86 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 87 NULL, NULL, 0, 0, 88 "Memory allocation failed : %s\n", extra); 89 else 90 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 91 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL, 92 NULL, NULL, 0, 0, "Memory allocation failed\n"); 93 } 94 95 /** 96 * htmlParseErr: 97 * @ctxt: an HTML parser context 98 * @error: the error number 99 * @msg: the error message 100 * @str1: string infor 101 * @str2: string infor 102 * 103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints 104 */ 105 static void 106 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, 107 const char *msg, const xmlChar *str1, const xmlChar *str2) 108 { 109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 110 (ctxt->instate == XML_PARSER_EOF)) 111 return; 112 if (ctxt != NULL) 113 ctxt->errNo = error; 114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 115 XML_ERR_ERROR, NULL, 0, 116 (const char *) str1, (const char *) str2, 117 NULL, 0, 0, 118 msg, str1, str2); 119 if (ctxt != NULL) 120 ctxt->wellFormed = 0; 121 } 122 123 /** 124 * htmlParseErrInt: 125 * @ctxt: an HTML parser context 126 * @error: the error number 127 * @msg: the error message 128 * @val: integer info 129 * 130 * Handle a fatal parser error, i.e. 
violating Well-Formedness constraints 131 */ 132 static void 133 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, 134 const char *msg, int val) 135 { 136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 137 (ctxt->instate == XML_PARSER_EOF)) 138 return; 139 if (ctxt != NULL) 140 ctxt->errNo = error; 141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 142 XML_ERR_ERROR, NULL, 0, NULL, NULL, 143 NULL, val, 0, msg, val); 144 if (ctxt != NULL) 145 ctxt->wellFormed = 0; 146 } 147 148 /************************************************************************ 149 * * 150 * Parser stacks related functions and macros * 151 * * 152 ************************************************************************/ 153 154 /** 155 * htmlnamePush: 156 * @ctxt: an HTML parser context 157 * @value: the element name 158 * 159 * Pushes a new element name on top of the name stack 160 * 161 * Returns 0 in case of error, the index in the stack otherwise 162 */ 163 static int 164 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value) 165 { 166 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head"))) 167 ctxt->html = 3; 168 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body"))) 169 ctxt->html = 10; 170 if (ctxt->nameNr >= ctxt->nameMax) { 171 ctxt->nameMax *= 2; 172 ctxt->nameTab = (const xmlChar * *) 173 xmlRealloc((xmlChar * *)ctxt->nameTab, 174 ctxt->nameMax * 175 sizeof(ctxt->nameTab[0])); 176 if (ctxt->nameTab == NULL) { 177 htmlErrMemory(ctxt, NULL); 178 return (0); 179 } 180 } 181 ctxt->nameTab[ctxt->nameNr] = value; 182 ctxt->name = value; 183 return (ctxt->nameNr++); 184 } 185 /** 186 * htmlnamePop: 187 * @ctxt: an HTML parser context 188 * 189 * Pops the top element name from the name stack 190 * 191 * Returns the name just removed 192 */ 193 static const xmlChar * 194 htmlnamePop(htmlParserCtxtPtr ctxt) 195 { 196 const xmlChar *ret; 197 198 if (ctxt->nameNr <= 0) 199 return (NULL); 200 ctxt->nameNr--; 201 if (ctxt->nameNr < 0) 202 
return (NULL); 203 if (ctxt->nameNr > 0) 204 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1]; 205 else 206 ctxt->name = NULL; 207 ret = ctxt->nameTab[ctxt->nameNr]; 208 ctxt->nameTab[ctxt->nameNr] = NULL; 209 return (ret); 210 } 211 212 /** 213 * htmlNodeInfoPush: 214 * @ctxt: an HTML parser context 215 * @value: the node info 216 * 217 * Pushes a new element name on top of the node info stack 218 * 219 * Returns 0 in case of error, the index in the stack otherwise 220 */ 221 static int 222 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value) 223 { 224 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) { 225 if (ctxt->nodeInfoMax == 0) 226 ctxt->nodeInfoMax = 5; 227 ctxt->nodeInfoMax *= 2; 228 ctxt->nodeInfoTab = (htmlParserNodeInfo *) 229 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab, 230 ctxt->nodeInfoMax * 231 sizeof(ctxt->nodeInfoTab[0])); 232 if (ctxt->nodeInfoTab == NULL) { 233 htmlErrMemory(ctxt, NULL); 234 return (0); 235 } 236 } 237 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value; 238 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; 239 return (ctxt->nodeInfoNr++); 240 } 241 242 /** 243 * htmlNodeInfoPop: 244 * @ctxt: an HTML parser context 245 * 246 * Pops the top element name from the node info stack 247 * 248 * Returns 0 in case of error, the pointer to NodeInfo otherwise 249 */ 250 static htmlParserNodeInfo * 251 htmlNodeInfoPop(htmlParserCtxtPtr ctxt) 252 { 253 if (ctxt->nodeInfoNr <= 0) 254 return (NULL); 255 ctxt->nodeInfoNr--; 256 if (ctxt->nodeInfoNr < 0) 257 return (NULL); 258 if (ctxt->nodeInfoNr > 0) 259 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1]; 260 else 261 ctxt->nodeInfo = NULL; 262 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; 263 } 264 265 /* 266 * Macros for accessing the content. Those should be used only by the parser, 267 * and not exported. 268 * 269 * Dirty macros, i.e. one need to make assumption on the context to use them 270 * 271 * CUR_PTR return the current pointer to the xmlChar to be parsed. 
 *   CUR     returns the current xmlChar value, i.e. an 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
291 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly 292 */ 293 294 #define UPPER (toupper(*ctxt->input->cur)) 295 296 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val) 297 298 #define NXT(val) ctxt->input->cur[(val)] 299 300 #define UPP(val) (toupper(ctxt->input->cur[(val)])) 301 302 #define CUR_PTR ctxt->input->cur 303 304 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \ 305 (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \ 306 xmlParserInputShrink(ctxt->input) 307 308 #define GROW if ((ctxt->progressive == 0) && \ 309 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \ 310 xmlParserInputGrow(ctxt->input, INPUT_CHUNK) 311 312 #define CURRENT ((int) (*ctxt->input->cur)) 313 314 #define SKIP_BLANKS htmlSkipBlankChars(ctxt) 315 316 /* Inported from XML */ 317 318 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */ 319 #define CUR ((int) (*ctxt->input->cur)) 320 #define NEXT xmlNextChar(ctxt) 321 322 #define RAW (ctxt->token ? -1 : (*ctxt->input->cur)) 323 324 325 #define NEXTL(l) do { \ 326 if (*(ctxt->input->cur) == '\n') { \ 327 ctxt->input->line++; ctxt->input->col = 1; \ 328 } else ctxt->input->col++; \ 329 ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \ 330 } while (0) 331 332 /************ 333 \ 334 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \ 335 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt); 336 ************/ 337 338 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l) 339 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l) 340 341 #define COPY_BUF(l,b,i,v) \ 342 if (l == 1) b[i++] = (xmlChar) v; \ 343 else i += xmlCopyChar(l,&b[i],v) 344 345 /** 346 * htmlFindEncoding: 347 * @the HTML parser context 348 * 349 * Ty to find and encoding in the current data available in the input 350 * buffer this is needed to try to switch to the proper encoding when 351 * one face a character error. 
352 * That's an heuristic, since it's operating outside of parsing it could 353 * try to use a meta which had been commented out, that's the reason it 354 * should only be used in case of error, not as a default. 355 * 356 * Returns an encoding string or NULL if not found, the string need to 357 * be freed 358 */ 359 static xmlChar * 360 htmlFindEncoding(xmlParserCtxtPtr ctxt) { 361 const xmlChar *start, *cur, *end; 362 363 if ((ctxt == NULL) || (ctxt->input == NULL) || 364 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) || 365 (ctxt->input->buf->encoder != NULL)) 366 return(NULL); 367 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL)) 368 return(NULL); 369 370 start = ctxt->input->cur; 371 end = ctxt->input->end; 372 /* we also expect the input buffer to be zero terminated */ 373 if (*end != 0) 374 return(NULL); 375 376 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV"); 377 if (cur == NULL) 378 return(NULL); 379 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT"); 380 if (cur == NULL) 381 return(NULL); 382 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET="); 383 if (cur == NULL) 384 return(NULL); 385 cur += 8; 386 start = cur; 387 while (((*cur >= 'A') && (*cur <= 'Z')) || 388 ((*cur >= 'a') && (*cur <= 'z')) || 389 ((*cur >= '0') && (*cur <= '9')) || 390 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/')) 391 cur++; 392 if (cur == start) 393 return(NULL); 394 return(xmlStrndup(start, cur - start)); 395 } 396 397 /** 398 * htmlCurrentChar: 399 * @ctxt: the HTML parser context 400 * @len: pointer to the length of the char read 401 * 402 * The current char value, if using UTF-8 this may actually span multiple 403 * bytes in the input buffer. Implement the end of line normalization: 404 * 2.11 End-of-Line Handling 405 * If the encoding is unspecified, in the case we find an ISO-Latin-1 406 * char, then the encoding converter is plugged in automatically. 
407 * 408 * Returns the current char value and its length 409 */ 410 411 static int 412 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { 413 if (ctxt->instate == XML_PARSER_EOF) 414 return(0); 415 416 if (ctxt->token != 0) { 417 *len = 0; 418 return(ctxt->token); 419 } 420 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { 421 /* 422 * We are supposed to handle UTF8, check it's valid 423 * From rfc2044: encoding of the Unicode values on UTF-8: 424 * 425 * UCS-4 range (hex.) UTF-8 octet sequence (binary) 426 * 0000 0000-0000 007F 0xxxxxxx 427 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx 428 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 429 * 430 * Check for the 0x110000 limit too 431 */ 432 const unsigned char *cur = ctxt->input->cur; 433 unsigned char c; 434 unsigned int val; 435 436 c = *cur; 437 if (c & 0x80) { 438 if (cur[1] == 0) { 439 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 440 cur = ctxt->input->cur; 441 } 442 if ((cur[1] & 0xc0) != 0x80) 443 goto encoding_error; 444 if ((c & 0xe0) == 0xe0) { 445 446 if (cur[2] == 0) { 447 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 448 cur = ctxt->input->cur; 449 } 450 if ((cur[2] & 0xc0) != 0x80) 451 goto encoding_error; 452 if ((c & 0xf0) == 0xf0) { 453 if (cur[3] == 0) { 454 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 455 cur = ctxt->input->cur; 456 } 457 if (((c & 0xf8) != 0xf0) || 458 ((cur[3] & 0xc0) != 0x80)) 459 goto encoding_error; 460 /* 4-byte code */ 461 *len = 4; 462 val = (cur[0] & 0x7) << 18; 463 val |= (cur[1] & 0x3f) << 12; 464 val |= (cur[2] & 0x3f) << 6; 465 val |= cur[3] & 0x3f; 466 } else { 467 /* 3-byte code */ 468 *len = 3; 469 val = (cur[0] & 0xf) << 12; 470 val |= (cur[1] & 0x3f) << 6; 471 val |= cur[2] & 0x3f; 472 } 473 } else { 474 /* 2-byte code */ 475 *len = 2; 476 val = (cur[0] & 0x1f) << 6; 477 val |= cur[1] & 0x3f; 478 } 479 if (!IS_CHAR(val)) { 480 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 481 "Char 0x%X out of allowed range\n", val); 482 } 483 return(val); 484 } else { 485 if 
((*ctxt->input->cur == 0) && 486 (ctxt->input->cur < ctxt->input->end)) { 487 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 488 "Char 0x%X out of allowed range\n", 0); 489 *len = 1; 490 return(' '); 491 } 492 /* 1-byte code */ 493 *len = 1; 494 return((int) *ctxt->input->cur); 495 } 496 } 497 /* 498 * Assume it's a fixed length encoding (1) with 499 * a compatible encoding for the ASCII set, since 500 * XML constructs only use < 128 chars 501 */ 502 *len = 1; 503 if ((int) *ctxt->input->cur < 0x80) 504 return((int) *ctxt->input->cur); 505 506 /* 507 * Humm this is bad, do an automatic flow conversion 508 */ 509 { 510 xmlChar * guess; 511 xmlCharEncodingHandlerPtr handler; 512 513 guess = htmlFindEncoding(ctxt); 514 if (guess == NULL) { 515 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); 516 } else { 517 if (ctxt->input->encoding != NULL) 518 xmlFree((xmlChar *) ctxt->input->encoding); 519 ctxt->input->encoding = guess; 520 handler = xmlFindCharEncodingHandler((const char *) guess); 521 if (handler != NULL) { 522 xmlSwitchToEncoding(ctxt, handler); 523 } else { 524 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 525 "Unsupported encoding %s", guess, NULL); 526 } 527 } 528 ctxt->charset = XML_CHAR_ENCODING_UTF8; 529 } 530 531 return(xmlCurrentChar(ctxt, len)); 532 533 encoding_error: 534 /* 535 * If we detect an UTF8 error that probably mean that the 536 * input encoding didn't get properly advertized in the 537 * declaration header. Report the error and switch the encoding 538 * to ISO-Latin-1 (if you don't like this policy, just declare the 539 * encoding !) 
540 */ 541 { 542 char buffer[150]; 543 544 if (ctxt->input->end - ctxt->input->cur >= 4) { 545 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", 546 ctxt->input->cur[0], ctxt->input->cur[1], 547 ctxt->input->cur[2], ctxt->input->cur[3]); 548 } else { 549 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]); 550 } 551 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 552 "Input is not proper UTF-8, indicate encoding !\n", 553 BAD_CAST buffer, NULL); 554 } 555 556 ctxt->charset = XML_CHAR_ENCODING_8859_1; 557 *len = 1; 558 return((int) *ctxt->input->cur); 559 } 560 561 /** 562 * htmlSkipBlankChars: 563 * @ctxt: the HTML parser context 564 * 565 * skip all blanks character found at that point in the input streams. 566 * 567 * Returns the number of space chars skipped 568 */ 569 570 static int 571 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) { 572 int res = 0; 573 574 while (IS_BLANK_CH(*(ctxt->input->cur))) { 575 if ((*ctxt->input->cur == 0) && 576 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) { 577 xmlPopInput(ctxt); 578 } else { 579 if (*(ctxt->input->cur) == '\n') { 580 ctxt->input->line++; ctxt->input->col = 1; 581 } else ctxt->input->col++; 582 ctxt->input->cur++; 583 ctxt->nbChars++; 584 if (*ctxt->input->cur == 0) 585 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 586 } 587 res++; 588 } 589 return(res); 590 } 591 592 593 594 /************************************************************************ 595 * * 596 * The list of HTML elements and their properties * 597 * * 598 ************************************************************************/ 599 600 /* 601 * Start Tag: 1 means the start tag can be ommited 602 * End Tag: 1 means the end tag can be ommited 603 * 2 means it's forbidden (empty elements) 604 * 3 means the tag is stylistic and should be closed easily 605 * Depr: this element is deprecated 606 * DTD: 1 means that this element is valid only in the Loose DTD 607 * 2 means that this element is valid only in the Frameset DTD 608 * 609 
* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description 610 , subElements , impliedsubelt , Attributes, userdata 611 */ 612 613 /* Definitions and a couple of vars for HTML Elements */ 614 615 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small" 616 #define NB_FONTSTYLE 8 617 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym" 618 #define NB_PHRASE 10 619 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe" 620 #define NB_SPECIAL 16 621 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL 622 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL 623 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address" 624 #define NB_BLOCK NB_HEADING + NB_LIST + 14 625 #define FORMCTRL "input", "select", "textarea", "label", "button" 626 #define NB_FORMCTRL 5 627 #define PCDATA 628 #define NB_PCDATA 0 629 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6" 630 #define NB_HEADING 6 631 #define LIST "ul", "ol", "dir", "menu" 632 #define NB_LIST 4 633 #define MODIFIER 634 #define NB_MODIFIER 0 635 #define FLOW BLOCK,INLINE 636 #define NB_FLOW NB_BLOCK + NB_INLINE 637 #define EMPTY NULL 638 639 640 static const char* const html_flow[] = { FLOW, NULL } ; 641 static const char* const html_inline[] = { INLINE, NULL } ; 642 643 /* placeholders: elts with content but no subelements */ 644 static const char* const html_pcdata[] = { NULL } ; 645 #define html_cdata html_pcdata 646 647 648 /* ... 
and for HTML Attributes */ 649 650 #define COREATTRS "id", "class", "style", "title" 651 #define NB_COREATTRS 4 652 #define I18N "lang", "dir" 653 #define NB_I18N 2 654 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup" 655 #define NB_EVENTS 9 656 #define ATTRS COREATTRS,I18N,EVENTS 657 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS 658 #define CELLHALIGN "align", "char", "charoff" 659 #define NB_CELLHALIGN 3 660 #define CELLVALIGN "valign" 661 #define NB_CELLVALIGN 1 662 663 static const char* const html_attrs[] = { ATTRS, NULL } ; 664 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ; 665 static const char* const core_attrs[] = { COREATTRS, NULL } ; 666 static const char* const i18n_attrs[] = { I18N, NULL } ; 667 668 669 /* Other declarations that should go inline ... */ 670 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name", 671 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords", 672 "tabindex", "onfocus", "onblur", NULL } ; 673 static const char* const target_attr[] = { "target", NULL } ; 674 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ; 675 static const char* const alt_attr[] = { "alt", NULL } ; 676 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ; 677 static const char* const href_attrs[] = { "href", NULL } ; 678 static const char* const clear_attrs[] = { "clear", NULL } ; 679 static const char* const inline_p[] = { INLINE, "p", NULL } ; 680 681 static const char* const flow_param[] = { FLOW, "param", NULL } ; 682 static const char* const applet_attrs[] = { COREATTRS , "codebase", 683 "archive", "alt", "name", "height", "width", "align", 684 "hspace", "vspace", NULL } ; 685 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref", 686 "tabindex", "accesskey", "onfocus", "onblur", NULL } ; 687 static const char* const basefont_attrs[] = 688 { "id", 
"size", "color", "face", NULL } ; 689 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ; 690 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ; 691 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ; 692 static const char* const body_depr[] = { "background", "bgcolor", "text", 693 "link", "vlink", "alink", NULL } ; 694 static const char* const button_attrs[] = { ATTRS, "name", "value", "type", 695 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ; 696 697 698 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ; 699 static const char* const col_elt[] = { "col", NULL } ; 700 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ; 701 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ; 702 static const char* const dl_contents[] = { "dt", "dd", NULL } ; 703 static const char* const compact_attr[] = { "compact", NULL } ; 704 static const char* const label_attr[] = { "label", NULL } ; 705 static const char* const fieldset_contents[] = { FLOW, "legend" } ; 706 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ; 707 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ; 708 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ; 709 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ; 710 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ; 711 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ; 712 static const char* const head_attrs[] = { I18N, 
"profile", NULL } ; 713 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ; 714 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ; 715 static const char* const version_attr[] = { "version", NULL } ; 716 static const char* const html_content[] = { "head", "body", "frameset", NULL } ; 717 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ; 718 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ; 719 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ; 720 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ; 721 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ; 722 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ; 723 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ; 724 static const char* const align_attr[] = { "align", NULL } ; 725 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ; 726 static const char* const map_contents[] = { BLOCK, "area", NULL } ; 727 static const char* const name_attr[] = { "name", NULL } ; 728 static const char* const action_attr[] = { "action", NULL } ; 729 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ; 730 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", 
"scheme", NULL } ; 731 static const char* const content_attr[] = { "content", NULL } ; 732 static const char* const type_attr[] = { "type", NULL } ; 733 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ; 734 static const char* const object_contents[] = { FLOW, "param", NULL } ; 735 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ; 736 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ; 737 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ; 738 static const char* const option_elt[] = { "option", NULL } ; 739 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ; 740 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ; 741 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ; 742 static const char* const width_attr[] = { "width", NULL } ; 743 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ; 744 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ; 745 static const char* const language_attr[] = { "language", NULL } ; 746 static const char* const select_content[] = { "optgroup", "option", NULL } ; 747 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ; 748 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ; 749 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ; 750 static const char* const table_depr[] = { "align", "bgcolor", NULL } ; 751 static const char* const 
table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ; 752 static const char* const tr_elt[] = { "tr", NULL } ; 753 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ; 754 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ; 755 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ; 756 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ; 757 static const char* const tr_contents[] = { "th", "td", NULL } ; 758 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ; 759 static const char* const li_elt[] = { "li", NULL } ; 760 static const char* const ul_depr[] = { "type", "compact", NULL} ; 761 static const char* const dir_attr[] = { "dir", NULL} ; 762 763 #define DECL (const char**) 764 765 static const htmlElemDesc 766 html40ElementTable[] = { 767 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ", 768 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL 769 }, 770 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form", 771 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 772 }, 773 { "acronym", 0, 0, 0, 0, 0, 0, 1, "", 774 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 775 }, 776 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ", 777 DECL inline_p , NULL , DECL html_attrs, NULL, NULL 778 }, 779 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ", 780 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL 781 }, 782 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ", 783 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr 784 }, 785 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style", 786 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 787 }, 788 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ", 789 EMPTY , NULL , NULL , 
DECL target_attr, DECL href_attrs 790 }, 791 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " , 792 EMPTY , NULL , NULL, DECL basefont_attrs, NULL 793 }, 794 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ", 795 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr 796 }, 797 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style", 798 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 799 }, 800 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ", 801 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL 802 }, 803 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ", 804 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL 805 }, 806 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ", 807 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL 808 }, 809 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ", 810 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL 811 }, 812 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ", 813 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 814 }, 815 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ", 816 DECL html_flow , NULL , NULL, DECL html_attrs, NULL 817 }, 818 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation", 819 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 820 }, 821 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment", 822 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 823 }, 824 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ", 825 EMPTY , NULL , DECL col_attrs , NULL, NULL 826 }, 827 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ", 828 DECL col_elt , "col" , DECL col_attrs , NULL, NULL 829 }, 830 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ", 831 DECL html_flow , NULL , DECL html_attrs, NULL, NULL 832 }, 833 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ", 834 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL 835 }, 836 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition", 837 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 838 
}, 839 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list", 840 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL 841 }, 842 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container", 843 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL 844 }, 845 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ", 846 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL 847 }, 848 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ", 849 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 850 }, 851 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis", 852 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 853 }, 854 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ", 855 EMPTY, NULL, DECL embed_attrs, NULL, NULL 856 }, 857 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ", 858 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL 859 }, 860 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ", 861 DECL html_inline, NULL, NULL, DECL font_attrs, NULL 862 }, 863 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ", 864 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr 865 }, 866 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " , 867 EMPTY, NULL, NULL, DECL frame_attrs, NULL 868 }, 869 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" , 870 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL 871 }, 872 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ", 873 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 874 }, 875 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ", 876 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 877 }, 878 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ", 879 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 880 }, 881 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ", 882 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 883 }, 884 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ", 885 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 886 }, 887 { "h6", 0, 
0, 0, 0, 0, 0, 0, "heading ", 888 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 889 }, 890 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ", 891 DECL head_contents, NULL, DECL head_attrs, NULL, NULL 892 }, 893 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " , 894 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL 895 }, 896 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ", 897 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL 898 }, 899 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style", 900 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 901 }, 902 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ", 903 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL 904 }, 905 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ", 906 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs 907 }, 908 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ", 909 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL 910 }, 911 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text", 912 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL 913 }, 914 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ", 915 EMPTY, NULL, NULL, DECL prompt_attrs, NULL 916 }, 917 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user", 918 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 919 }, 920 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ", 921 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL 922 }, 923 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ", 924 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL 925 }, 926 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ", 927 DECL html_flow, NULL, DECL html_attrs, NULL, NULL 928 }, 929 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ", 930 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL 931 }, 932 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ", 933 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr 934 }, 935 { "menu", 0, 0, 0, 0, 1, 1, 0, 
"menu list ", 936 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL 937 }, 938 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ", 939 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr 940 }, 941 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ", 942 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL 943 }, 944 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ", 945 DECL html_flow, "div", DECL html_attrs, NULL, NULL 946 }, 947 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ", 948 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL 949 }, 950 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ", 951 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL 952 }, 953 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ", 954 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr 955 }, 956 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " , 957 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL 958 }, 959 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ", 960 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 961 }, 962 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ", 963 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr 964 }, 965 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ", 966 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL 967 }, 968 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ", 969 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL 970 }, 971 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style", 972 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 973 }, 974 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.", 975 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 976 }, 977 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ", 978 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr 979 }, 980 { "select", 
0, 0, 0, 0, 0, 0, 1, "option selector ", 981 DECL select_content, NULL, DECL select_attrs, NULL, NULL 982 }, 983 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style", 984 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 985 }, 986 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ", 987 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 988 }, 989 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text", 990 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 991 }, 992 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis", 993 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 994 }, 995 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ", 996 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr 997 }, 998 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript", 999 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1000 }, 1001 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ", 1002 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1003 }, 1004 { "table", 0, 0, 0, 0, 0, 0, 0, "", 1005 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL 1006 }, 1007 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ", 1008 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1009 }, 1010 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell", 1011 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 1012 }, 1013 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ", 1014 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr 1015 }, 1016 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ", 1017 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1018 }, 1019 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell", 1020 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 1021 }, 1022 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ", 1023 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1024 }, 1025 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ", 1026 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL 1027 }, 1028 { "tr", 0, 0, 0, 0, 0, 0, 0, "table 
row ", 1029 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL 1030 }, 1031 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style", 1032 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1033 }, 1034 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style", 1035 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 1036 }, 1037 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ", 1038 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL 1039 }, 1040 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument", 1041 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1042 } 1043 }; 1044 1045 /* 1046 * start tags that imply the end of current element 1047 */ 1048 static const char * const htmlStartClose[] = { 1049 "form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", 1050 "dl", "ul", "ol", "menu", "dir", "address", "pre", 1051 "listing", "xmp", "head", NULL, 1052 "head", "p", NULL, 1053 "title", "p", NULL, 1054 "body", "head", "style", "link", "title", "p", NULL, 1055 "frameset", "head", "style", "link", "title", "p", NULL, 1056 "li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", 1057 "pre", "listing", "xmp", "head", "li", NULL, 1058 "hr", "p", "head", NULL, 1059 "h1", "p", "head", NULL, 1060 "h2", "p", "head", NULL, 1061 "h3", "p", "head", NULL, 1062 "h4", "p", "head", NULL, 1063 "h5", "p", "head", NULL, 1064 "h6", "p", "head", NULL, 1065 "dir", "p", "head", NULL, 1066 "address", "p", "head", "ul", NULL, 1067 "pre", "p", "head", "ul", NULL, 1068 "listing", "p", "head", NULL, 1069 "xmp", "p", "head", NULL, 1070 "blockquote", "p", "head", NULL, 1071 "dl", "p", "dt", "menu", "dir", "address", "pre", "listing", 1072 "xmp", "head", NULL, 1073 "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp", 1074 "head", "dd", NULL, 1075 "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp", 1076 "head", "dt", NULL, 1077 "ul", "p", "head", "ol", "menu", "dir", "address", "pre", 1078 "listing", "xmp", NULL, 1079 
"ol", "p", "head", "ul", NULL, 1080 "menu", "p", "head", "ul", NULL, 1081 "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL, 1082 "div", "p", "head", NULL, 1083 "noscript", "p", "head", NULL, 1084 "center", "font", "b", "i", "p", "head", NULL, 1085 "a", "a", NULL, 1086 "caption", "p", NULL, 1087 "colgroup", "caption", "colgroup", "col", "p", NULL, 1088 "col", "caption", "col", "p", NULL, 1089 "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", 1090 "listing", "xmp", "a", NULL, 1091 "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 1092 "td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 1093 "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL, 1094 "thead", "caption", "col", "colgroup", NULL, 1095 "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead", 1096 "tbody", "p", NULL, 1097 "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead", 1098 "tfoot", "tbody", "p", NULL, 1099 "optgroup", "option", NULL, 1100 "option", "option", NULL, 1101 "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", 1102 "pre", "listing", "xmp", "a", NULL, 1103 NULL 1104 }; 1105 1106 /* 1107 * The list of HTML elements which are supposed not to have 1108 * CDATA content and where a p element will be implied 1109 * 1110 * TODO: extend that list by reading the HTML SGML DTD on 1111 * implied paragraph 1112 */ 1113 static const char *const htmlNoContentElements[] = { 1114 "html", 1115 "head", 1116 NULL 1117 }; 1118 1119 /* 1120 * The list of HTML attributes which are of content %Script; 1121 * NOTE: when adding ones, check htmlIsScriptAttribute() since 1122 * it assumes the name starts with 'on' 1123 */ 1124 static const char *const htmlScriptAttributes[] = { 1125 "onclick", 1126 "ondblclick", 1127 "onmousedown", 1128 "onmouseup", 1129 "onmouseover", 1130 "onmousemove", 1131 "onmouseout", 1132 "onkeypress", 1133 "onkeydown", 1134 "onkeyup", 1135 "onload", 1136 "onunload", 1137 
"onfocus", 1138 "onblur", 1139 "onsubmit", 1140 "onrest", 1141 "onchange", 1142 "onselect" 1143 }; 1144 1145 /* 1146 * This table is used by the htmlparser to know what to do with 1147 * broken html pages. By assigning different priorities to different 1148 * elements the parser can decide how to handle extra endtags. 1149 * Endtags are only allowed to close elements with lower or equal 1150 * priority. 1151 */ 1152 1153 typedef struct { 1154 const char *name; 1155 int priority; 1156 } elementPriority; 1157 1158 static const elementPriority htmlEndPriority[] = { 1159 {"div", 150}, 1160 {"td", 160}, 1161 {"th", 160}, 1162 {"tr", 170}, 1163 {"thead", 180}, 1164 {"tbody", 180}, 1165 {"tfoot", 180}, 1166 {"table", 190}, 1167 {"head", 200}, 1168 {"body", 200}, 1169 {"html", 220}, 1170 {NULL, 100} /* Default priority */ 1171 }; 1172 1173 static const char** htmlStartCloseIndex[100]; 1174 static int htmlStartCloseIndexinitialized = 0; 1175 1176 /************************************************************************ 1177 * * 1178 * functions to handle HTML specific data * 1179 * * 1180 ************************************************************************/ 1181 1182 /** 1183 * htmlInitAutoClose: 1184 * 1185 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1186 * This is not reentrant. Call xmlInitParser() once before processing in 1187 * case of use in multithreaded programs. 
1188 */ 1189 void 1190 htmlInitAutoClose(void) { 1191 int indx, i = 0; 1192 1193 if (htmlStartCloseIndexinitialized) return; 1194 1195 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL; 1196 indx = 0; 1197 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) { 1198 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i]; 1199 while (htmlStartClose[i] != NULL) i++; 1200 i++; 1201 } 1202 htmlStartCloseIndexinitialized = 1; 1203 } 1204 1205 /** 1206 * htmlTagLookup: 1207 * @tag: The tag name in lowercase 1208 * 1209 * Lookup the HTML tag in the ElementTable 1210 * 1211 * Returns the related htmlElemDescPtr or NULL if not found. 1212 */ 1213 const htmlElemDesc * 1214 htmlTagLookup(const xmlChar *tag) { 1215 unsigned int i; 1216 1217 for (i = 0; i < (sizeof(html40ElementTable) / 1218 sizeof(html40ElementTable[0]));i++) { 1219 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name)) 1220 return((htmlElemDescPtr) &html40ElementTable[i]); 1221 } 1222 return(NULL); 1223 } 1224 1225 /** 1226 * htmlGetEndPriority: 1227 * @name: The name of the element to look up the priority for. 1228 * 1229 * Return value: The "endtag" priority. 1230 **/ 1231 static int 1232 htmlGetEndPriority (const xmlChar *name) { 1233 int i = 0; 1234 1235 while ((htmlEndPriority[i].name != NULL) && 1236 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name))) 1237 i++; 1238 1239 return(htmlEndPriority[i].priority); 1240 } 1241 1242 1243 /** 1244 * htmlCheckAutoClose: 1245 * @newtag: The new tag name 1246 * @oldtag: The old tag name 1247 * 1248 * Checks whether the new tag is one of the registered valid tags for 1249 * closing old. 1250 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1251 * 1252 * Returns 0 if no, 1 if yes. 
1253 */ 1254 static int 1255 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag) 1256 { 1257 int i, indx; 1258 const char **closed = NULL; 1259 1260 if (htmlStartCloseIndexinitialized == 0) 1261 htmlInitAutoClose(); 1262 1263 /* inefficient, but not a big deal */ 1264 for (indx = 0; indx < 100; indx++) { 1265 closed = htmlStartCloseIndex[indx]; 1266 if (closed == NULL) 1267 return (0); 1268 if (xmlStrEqual(BAD_CAST * closed, newtag)) 1269 break; 1270 } 1271 1272 i = closed - htmlStartClose; 1273 i++; 1274 while (htmlStartClose[i] != NULL) { 1275 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) { 1276 return (1); 1277 } 1278 i++; 1279 } 1280 return (0); 1281 } 1282 1283 /** 1284 * htmlAutoCloseOnClose: 1285 * @ctxt: an HTML parser context 1286 * @newtag: The new tag name 1287 * @force: force the tag closure 1288 * 1289 * The HTML DTD allows an ending tag to implicitly close other tags. 1290 */ 1291 static void 1292 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1293 { 1294 const htmlElemDesc *info; 1295 int i, priority; 1296 1297 priority = htmlGetEndPriority(newtag); 1298 1299 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1300 1301 if (xmlStrEqual(newtag, ctxt->nameTab[i])) 1302 break; 1303 /* 1304 * A missplaced endtag can only close elements with lower 1305 * or equal priority, so if we find an element with higher 1306 * priority before we find an element with 1307 * matching name, we just ignore this endtag 1308 */ 1309 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority) 1310 return; 1311 } 1312 if (i < 0) 1313 return; 1314 1315 while (!xmlStrEqual(newtag, ctxt->name)) { 1316 info = htmlTagLookup(ctxt->name); 1317 if ((info != NULL) && (info->endTag == 3)) { 1318 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 1319 "Opening and ending tag mismatch: %s and %s\n", 1320 newtag, ctxt->name); 1321 } 1322 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1323 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1324 
htmlnamePop(ctxt); 1325 } 1326 } 1327 1328 /** 1329 * htmlAutoCloseOnEnd: 1330 * @ctxt: an HTML parser context 1331 * 1332 * Close all remaining tags at the end of the stream 1333 */ 1334 static void 1335 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) 1336 { 1337 int i; 1338 1339 if (ctxt->nameNr == 0) 1340 return; 1341 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1342 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1343 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1344 htmlnamePop(ctxt); 1345 } 1346 } 1347 1348 /** 1349 * htmlAutoClose: 1350 * @ctxt: an HTML parser context 1351 * @newtag: The new tag name or NULL 1352 * 1353 * The HTML DTD allows a tag to implicitly close other tags. 1354 * The list is kept in htmlStartClose array. This function is 1355 * called when a new tag has been detected and generates the 1356 * appropriates closes if possible/needed. 1357 * If newtag is NULL this mean we are at the end of the resource 1358 * and we should check 1359 */ 1360 static void 1361 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1362 { 1363 while ((newtag != NULL) && (ctxt->name != NULL) && 1364 (htmlCheckAutoClose(newtag, ctxt->name))) { 1365 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1366 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1367 htmlnamePop(ctxt); 1368 } 1369 if (newtag == NULL) { 1370 htmlAutoCloseOnEnd(ctxt); 1371 return; 1372 } 1373 while ((newtag == NULL) && (ctxt->name != NULL) && 1374 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) || 1375 (xmlStrEqual(ctxt->name, BAD_CAST "body")) || 1376 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) { 1377 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1378 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1379 htmlnamePop(ctxt); 1380 } 1381 } 1382 1383 /** 1384 * htmlAutoCloseTag: 1385 * @doc: the HTML document 1386 * @name: The tag name 1387 * @elem: the HTML element 1388 * 1389 * The HTML DTD allows a tag to implicitly close other tags. 
1390 * The list is kept in htmlStartClose array. This function checks 1391 * if the element or one of it's children would autoclose the 1392 * given tag. 1393 * 1394 * Returns 1 if autoclose, 0 otherwise 1395 */ 1396 int 1397 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) { 1398 htmlNodePtr child; 1399 1400 if (elem == NULL) return(1); 1401 if (xmlStrEqual(name, elem->name)) return(0); 1402 if (htmlCheckAutoClose(elem->name, name)) return(1); 1403 child = elem->children; 1404 while (child != NULL) { 1405 if (htmlAutoCloseTag(doc, name, child)) return(1); 1406 child = child->next; 1407 } 1408 return(0); 1409 } 1410 1411 /** 1412 * htmlIsAutoClosed: 1413 * @doc: the HTML document 1414 * @elem: the HTML element 1415 * 1416 * The HTML DTD allows a tag to implicitly close other tags. 1417 * The list is kept in htmlStartClose array. This function checks 1418 * if a tag is autoclosed by one of it's child 1419 * 1420 * Returns 1 if autoclosed, 0 otherwise 1421 */ 1422 int 1423 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) { 1424 htmlNodePtr child; 1425 1426 if (elem == NULL) return(1); 1427 child = elem->children; 1428 while (child != NULL) { 1429 if (htmlAutoCloseTag(doc, elem->name, child)) return(1); 1430 child = child->next; 1431 } 1432 return(0); 1433 } 1434 1435 /** 1436 * htmlCheckImplied: 1437 * @ctxt: an HTML parser context 1438 * @newtag: The new tag name 1439 * 1440 * The HTML DTD allows a tag to exists only implicitly 1441 * called when a new tag has been detected and generates the 1442 * appropriates implicit tags if missing 1443 */ 1444 static void 1445 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { 1446 int i; 1447 1448 if (ctxt->options & HTML_PARSE_NOIMPLIED) 1449 return; 1450 if (!htmlOmittedDefaultValue) 1451 return; 1452 if (xmlStrEqual(newtag, BAD_CAST"html")) 1453 return; 1454 if (ctxt->nameNr <= 0) { 1455 htmlnamePush(ctxt, BAD_CAST"html"); 1456 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != 
NULL)) 1457 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL); 1458 } 1459 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head"))) 1460 return; 1461 if ((ctxt->nameNr <= 1) && 1462 ((xmlStrEqual(newtag, BAD_CAST"script")) || 1463 (xmlStrEqual(newtag, BAD_CAST"style")) || 1464 (xmlStrEqual(newtag, BAD_CAST"meta")) || 1465 (xmlStrEqual(newtag, BAD_CAST"link")) || 1466 (xmlStrEqual(newtag, BAD_CAST"title")) || 1467 (xmlStrEqual(newtag, BAD_CAST"base")))) { 1468 if (ctxt->html >= 3) { 1469 /* we already saw or generated an <head> before */ 1470 return; 1471 } 1472 /* 1473 * dropped OBJECT ... i you put it first BODY will be 1474 * assumed ! 1475 */ 1476 htmlnamePush(ctxt, BAD_CAST"head"); 1477 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1478 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL); 1479 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) && 1480 (!xmlStrEqual(newtag, BAD_CAST"frame")) && 1481 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) { 1482 if (ctxt->html >= 10) { 1483 /* we already saw or generated a <body> before */ 1484 return; 1485 } 1486 for (i = 0;i < ctxt->nameNr;i++) { 1487 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) { 1488 return; 1489 } 1490 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) { 1491 return; 1492 } 1493 } 1494 1495 htmlnamePush(ctxt, BAD_CAST"body"); 1496 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1497 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL); 1498 } 1499 } 1500 1501 /** 1502 * htmlCheckParagraph 1503 * @ctxt: an HTML parser context 1504 * 1505 * Check whether a p element need to be implied before inserting 1506 * characters in the current element. 1507 * 1508 * Returns 1 if a paragraph has been inserted, 0 if not and -1 1509 * in case of error. 
1510 */ 1511 1512 static int 1513 htmlCheckParagraph(htmlParserCtxtPtr ctxt) { 1514 const xmlChar *tag; 1515 int i; 1516 1517 if (ctxt == NULL) 1518 return(-1); 1519 tag = ctxt->name; 1520 if (tag == NULL) { 1521 htmlAutoClose(ctxt, BAD_CAST"p"); 1522 htmlCheckImplied(ctxt, BAD_CAST"p"); 1523 htmlnamePush(ctxt, BAD_CAST"p"); 1524 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1525 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1526 return(1); 1527 } 1528 if (!htmlOmittedDefaultValue) 1529 return(0); 1530 for (i = 0; htmlNoContentElements[i] != NULL; i++) { 1531 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) { 1532 htmlAutoClose(ctxt, BAD_CAST"p"); 1533 htmlCheckImplied(ctxt, BAD_CAST"p"); 1534 htmlnamePush(ctxt, BAD_CAST"p"); 1535 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1536 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1537 return(1); 1538 } 1539 } 1540 return(0); 1541 } 1542 1543 /** 1544 * htmlIsScriptAttribute: 1545 * @name: an attribute name 1546 * 1547 * Check if an attribute is of content type Script 1548 * 1549 * Returns 1 is the attribute is a script 0 otherwise 1550 */ 1551 int 1552 htmlIsScriptAttribute(const xmlChar *name) { 1553 unsigned int i; 1554 1555 if (name == NULL) 1556 return(0); 1557 /* 1558 * all script attributes start with 'on' 1559 */ 1560 if ((name[0] != 'o') || (name[1] != 'n')) 1561 return(0); 1562 for (i = 0; 1563 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]); 1564 i++) { 1565 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i])) 1566 return(1); 1567 } 1568 return(0); 1569 } 1570 1571 /************************************************************************ 1572 * * 1573 * The list of HTML predefined entities * 1574 * * 1575 ************************************************************************/ 1576 1577 1578 static const htmlEntityDesc html40EntitiesTable[] = { 1579 /* 1580 * the 4 absolute ones, plus apostrophe. 
1581 */ 1582 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" }, 1583 { 38, "amp", "ampersand, U+0026 ISOnum" }, 1584 { 39, "apos", "single quote" }, 1585 { 60, "lt", "less-than sign, U+003C ISOnum" }, 1586 { 62, "gt", "greater-than sign, U+003E ISOnum" }, 1587 1588 /* 1589 * A bunch still in the 128-255 range 1590 * Replacing them depend really on the charset used. 1591 */ 1592 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" }, 1593 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" }, 1594 { 162, "cent", "cent sign, U+00A2 ISOnum" }, 1595 { 163, "pound","pound sign, U+00A3 ISOnum" }, 1596 { 164, "curren","currency sign, U+00A4 ISOnum" }, 1597 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" }, 1598 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" }, 1599 { 167, "sect", "section sign, U+00A7 ISOnum" }, 1600 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" }, 1601 { 169, "copy", "copyright sign, U+00A9 ISOnum" }, 1602 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" }, 1603 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" }, 1604 { 172, "not", "not sign, U+00AC ISOnum" }, 1605 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" }, 1606 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" }, 1607 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" }, 1608 { 176, "deg", "degree sign, U+00B0 ISOnum" }, 1609 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" }, 1610 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" }, 1611 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" }, 1612 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" }, 1613 { 181, "micro","micro sign, U+00B5 ISOnum" }, 1614 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" }, 1615 { 183, "middot","middle dot 
= Georgian comma Greek middle dot, U+00B7 ISOnum" }, 1616 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" }, 1617 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" }, 1618 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" }, 1619 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" }, 1620 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" }, 1621 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" }, 1622 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" }, 1623 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" }, 1624 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" }, 1625 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" }, 1626 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" }, 1627 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" }, 1628 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" }, 1629 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" }, 1630 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" }, 1631 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" }, 1632 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" }, 1633 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" }, 1634 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" }, 1635 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" }, 1636 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" }, 1637 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" }, 1638 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" }, 1639 { 207, "Iuml", 
"latin capital letter I with diaeresis, U+00CF ISOlat1" }, 1640 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" }, 1641 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" }, 1642 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" }, 1643 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" }, 1644 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" }, 1645 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" }, 1646 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" }, 1647 { 215, "times","multiplication sign, U+00D7 ISOnum" }, 1648 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" }, 1649 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" }, 1650 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" }, 1651 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" }, 1652 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" }, 1653 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" }, 1654 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" }, 1655 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" }, 1656 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" }, 1657 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" }, 1658 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" }, 1659 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" }, 1660 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" }, 1661 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" }, 1662 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" }, 1663 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" }, 1664 { 232, "egrave","latin small 
letter e with grave, U+00E8 ISOlat1" }, 1665 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" }, 1666 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" }, 1667 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" }, 1668 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" }, 1669 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" }, 1670 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" }, 1671 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" }, 1672 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" }, 1673 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" }, 1674 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" }, 1675 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" }, 1676 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" }, 1677 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" }, 1678 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" }, 1679 { 247, "divide","division sign, U+00F7 ISOnum" }, 1680 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" }, 1681 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" }, 1682 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" }, 1683 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" }, 1684 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" }, 1685 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" }, 1686 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" }, 1687 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" }, 1688 1689 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" }, 1690 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" }, 1691 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" }, 1692 { 353, 
"scaron","latin small letter s with caron, U+0161 ISOlat2" }, 1693 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" }, 1694 1695 /* 1696 * Anything below should really be kept as entities references 1697 */ 1698 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" }, 1699 1700 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" }, 1701 { 732, "tilde","small tilde, U+02DC ISOdia" }, 1702 1703 { 913, "Alpha","greek capital letter alpha, U+0391" }, 1704 { 914, "Beta", "greek capital letter beta, U+0392" }, 1705 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" }, 1706 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" }, 1707 { 917, "Epsilon","greek capital letter epsilon, U+0395" }, 1708 { 918, "Zeta", "greek capital letter zeta, U+0396" }, 1709 { 919, "Eta", "greek capital letter eta, U+0397" }, 1710 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" }, 1711 { 921, "Iota", "greek capital letter iota, U+0399" }, 1712 { 922, "Kappa","greek capital letter kappa, U+039A" }, 1713 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" }, 1714 { 924, "Mu", "greek capital letter mu, U+039C" }, 1715 { 925, "Nu", "greek capital letter nu, U+039D" }, 1716 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" }, 1717 { 927, "Omicron","greek capital letter omicron, U+039F" }, 1718 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" }, 1719 { 929, "Rho", "greek capital letter rho, U+03A1" }, 1720 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" }, 1721 { 932, "Tau", "greek capital letter tau, U+03A4" }, 1722 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" }, 1723 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" }, 1724 { 935, "Chi", "greek capital letter chi, U+03A7" }, 1725 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" }, 1726 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" }, 1727 1728 { 945, "alpha","greek small letter 
alpha, U+03B1 ISOgrk3" }, 1729 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" }, 1730 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" }, 1731 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" }, 1732 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" }, 1733 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" }, 1734 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" }, 1735 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" }, 1736 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" }, 1737 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" }, 1738 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" }, 1739 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" }, 1740 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" }, 1741 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" }, 1742 { 959, "omicron","greek small letter omicron, U+03BF NEW" }, 1743 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" }, 1744 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" }, 1745 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" }, 1746 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" }, 1747 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" }, 1748 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" }, 1749 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" }, 1750 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" }, 1751 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" }, 1752 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" }, 1753 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" }, 1754 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" }, 1755 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" }, 1756 1757 { 8194, "ensp", "en space, U+2002 ISOpub" }, 1758 { 8195, "emsp", "em space, U+2003 ISOpub" }, 1759 { 8201, "thinsp","thin space, U+2009 ISOpub" }, 1760 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 
2070" }, 1761 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" }, 1762 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" }, 1763 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" }, 1764 { 8211, "ndash","en dash, U+2013 ISOpub" }, 1765 { 8212, "mdash","em dash, U+2014 ISOpub" }, 1766 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" }, 1767 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" }, 1768 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" }, 1769 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" }, 1770 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" }, 1771 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" }, 1772 { 8224, "dagger","dagger, U+2020 ISOpub" }, 1773 { 8225, "Dagger","double dagger, U+2021 ISOpub" }, 1774 1775 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" }, 1776 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" }, 1777 1778 { 8240, "permil","per mille sign, U+2030 ISOtech" }, 1779 1780 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" }, 1781 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" }, 1782 1783 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" }, 1784 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" }, 1785 1786 { 8254, "oline","overline = spacing overscore, U+203E NEW" }, 1787 { 8260, "frasl","fraction slash, U+2044 NEW" }, 1788 1789 { 8364, "euro", "euro sign, U+20AC NEW" }, 1790 1791 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" }, 1792 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" }, 1793 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" }, 1794 { 8482, "trade","trade mark sign, U+2122 ISOnum" }, 1795 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" }, 1796 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" }, 1797 { 
8593, "uarr", "upwards arrow, U+2191 ISOnum" }, 1798 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" }, 1799 { 8595, "darr", "downwards arrow, U+2193 ISOnum" }, 1800 { 8596, "harr", "left right arrow, U+2194 ISOamsa" }, 1801 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" }, 1802 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" }, 1803 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" }, 1804 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" }, 1805 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" }, 1806 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" }, 1807 1808 { 8704, "forall","for all, U+2200 ISOtech" }, 1809 { 8706, "part", "partial differential, U+2202 ISOtech" }, 1810 { 8707, "exist","there exists, U+2203 ISOtech" }, 1811 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" }, 1812 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" }, 1813 { 8712, "isin", "element of, U+2208 ISOtech" }, 1814 { 8713, "notin","not an element of, U+2209 ISOtech" }, 1815 { 8715, "ni", "contains as member, U+220B ISOtech" }, 1816 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" }, 1817 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" }, 1818 { 8722, "minus","minus sign, U+2212 ISOtech" }, 1819 { 8727, "lowast","asterisk operator, U+2217 ISOtech" }, 1820 { 8730, "radic","square root = radical sign, U+221A ISOtech" }, 1821 { 8733, "prop", "proportional to, U+221D ISOtech" }, 1822 { 8734, "infin","infinity, U+221E ISOtech" }, 1823 { 8736, "ang", "angle, U+2220 ISOamso" }, 1824 { 8743, "and", "logical and = wedge, U+2227 ISOtech" }, 1825 { 8744, "or", "logical or = vee, U+2228 ISOtech" }, 1826 { 8745, "cap", "intersection = cap, U+2229 ISOtech" }, 1827 { 8746, "cup", "union = cup, U+222A ISOtech" }, 1828 { 8747, "int", "integral, U+222B ISOtech" }, 1829 { 8756, "there4","therefore, U+2234 ISOtech" }, 1830 { 8764, "sim", "tilde operator = varies with = similar to, 
U+223C ISOtech" }, 1831 { 8773, "cong", "approximately equal to, U+2245 ISOtech" }, 1832 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" }, 1833 { 8800, "ne", "not equal to, U+2260 ISOtech" }, 1834 { 8801, "equiv","identical to, U+2261 ISOtech" }, 1835 { 8804, "le", "less-than or equal to, U+2264 ISOtech" }, 1836 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" }, 1837 { 8834, "sub", "subset of, U+2282 ISOtech" }, 1838 { 8835, "sup", "superset of, U+2283 ISOtech" }, 1839 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" }, 1840 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" }, 1841 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" }, 1842 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" }, 1843 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" }, 1844 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" }, 1845 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" }, 1846 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" }, 1847 { 8969, "rceil","right ceiling, U+2309 ISOamsc" }, 1848 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" }, 1849 { 8971, "rfloor","right floor, U+230B ISOamsc" }, 1850 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" }, 1851 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" }, 1852 { 9674, "loz", "lozenge, U+25CA ISOpub" }, 1853 1854 { 9824, "spades","black spade suit, U+2660 ISOpub" }, 1855 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" }, 1856 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" }, 1857 { 9830, "diams","black diamond suit, U+2666 ISOpub" }, 1858 1859 }; 1860 1861 /************************************************************************ 1862 * * 1863 * Commodity functions to handle entities * 1864 * * 1865 ************************************************************************/ 1866 1867 /* 1868 * Macro used to grow the current buffer. 
1869 */ 1870 #define growBuffer(buffer) { \ 1871 xmlChar *tmp; \ 1872 buffer##_size *= 2; \ 1873 tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \ 1874 if (tmp == NULL) { \ 1875 htmlErrMemory(ctxt, "growing buffer\n"); \ 1876 xmlFree(buffer); \ 1877 return(NULL); \ 1878 } \ 1879 buffer = tmp; \ 1880 } 1881 1882 /** 1883 * htmlEntityLookup: 1884 * @name: the entity name 1885 * 1886 * Lookup the given entity in EntitiesTable 1887 * 1888 * TODO: the linear scan is really ugly, an hash table is really needed. 1889 * 1890 * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 1891 */ 1892 const htmlEntityDesc * 1893 htmlEntityLookup(const xmlChar *name) { 1894 unsigned int i; 1895 1896 for (i = 0;i < (sizeof(html40EntitiesTable)/ 1897 sizeof(html40EntitiesTable[0]));i++) { 1898 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) { 1899 return((htmlEntityDescPtr) &html40EntitiesTable[i]); 1900 } 1901 } 1902 return(NULL); 1903 } 1904 1905 /** 1906 * htmlEntityValueLookup: 1907 * @value: the entity's unicode value 1908 * 1909 * Lookup the given entity in EntitiesTable 1910 * 1911 * TODO: the linear scan is really ugly, an hash table is really needed. 1912 * 1913 * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 
1914 */ 1915 const htmlEntityDesc * 1916 htmlEntityValueLookup(unsigned int value) { 1917 unsigned int i; 1918 1919 for (i = 0;i < (sizeof(html40EntitiesTable)/ 1920 sizeof(html40EntitiesTable[0]));i++) { 1921 if (html40EntitiesTable[i].value >= value) { 1922 if (html40EntitiesTable[i].value > value) 1923 break; 1924 return((htmlEntityDescPtr) &html40EntitiesTable[i]); 1925 } 1926 } 1927 return(NULL); 1928 } 1929 1930 /** 1931 * UTF8ToHtml: 1932 * @out: a pointer to an array of bytes to store the result 1933 * @outlen: the length of @out 1934 * @in: a pointer to an array of UTF-8 chars 1935 * @inlen: the length of @in 1936 * 1937 * Take a block of UTF-8 chars in and try to convert it to an ASCII 1938 * plus HTML entities block of chars out. 1939 * 1940 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 1941 * The value of @inlen after return is the number of octets consumed 1942 * as the return value is positive, else unpredictable. 1943 * The value of @outlen after return is the number of octets consumed. 
1944 */ 1945 int 1946 UTF8ToHtml(unsigned char* out, int *outlen, 1947 const unsigned char* in, int *inlen) { 1948 const unsigned char* processed = in; 1949 const unsigned char* outend; 1950 const unsigned char* outstart = out; 1951 const unsigned char* instart = in; 1952 const unsigned char* inend; 1953 unsigned int c, d; 1954 int trailing; 1955 1956 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1); 1957 if (in == NULL) { 1958 /* 1959 * initialization nothing to do 1960 */ 1961 *outlen = 0; 1962 *inlen = 0; 1963 return(0); 1964 } 1965 inend = in + (*inlen); 1966 outend = out + (*outlen); 1967 while (in < inend) { 1968 d = *in++; 1969 if (d < 0x80) { c= d; trailing= 0; } 1970 else if (d < 0xC0) { 1971 /* trailing byte in leading position */ 1972 *outlen = out - outstart; 1973 *inlen = processed - instart; 1974 return(-2); 1975 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 1976 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 1977 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 1978 else { 1979 /* no chance for this in Ascii */ 1980 *outlen = out - outstart; 1981 *inlen = processed - instart; 1982 return(-2); 1983 } 1984 1985 if (inend - in < trailing) { 1986 break; 1987 } 1988 1989 for ( ; trailing; trailing--) { 1990 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) 1991 break; 1992 c <<= 6; 1993 c |= d & 0x3F; 1994 } 1995 1996 /* assertion: c is a single UTF-4 value */ 1997 if (c < 0x80) { 1998 if (out + 1 >= outend) 1999 break; 2000 *out++ = c; 2001 } else { 2002 int len; 2003 const htmlEntityDesc * ent; 2004 const char *cp; 2005 char nbuf[16]; 2006 2007 /* 2008 * Try to lookup a predefined HTML entity for it 2009 */ 2010 2011 ent = htmlEntityValueLookup(c); 2012 if (ent == NULL) { 2013 snprintf(nbuf, sizeof(nbuf), "#%u", c); 2014 cp = nbuf; 2015 } 2016 else 2017 cp = ent->name; 2018 len = strlen(cp); 2019 if (out + 2 + len >= outend) 2020 break; 2021 *out++ = '&'; 2022 memcpy(out, cp, len); 2023 out += len; 2024 *out++ = ';'; 2025 } 
2026 processed = in; 2027 } 2028 *outlen = out - outstart; 2029 *inlen = processed - instart; 2030 return(0); 2031 } 2032 2033 /** 2034 * htmlEncodeEntities: 2035 * @out: a pointer to an array of bytes to store the result 2036 * @outlen: the length of @out 2037 * @in: a pointer to an array of UTF-8 chars 2038 * @inlen: the length of @in 2039 * @quoteChar: the quote character to escape (' or ") or zero. 2040 * 2041 * Take a block of UTF-8 chars in and try to convert it to an ASCII 2042 * plus HTML entities block of chars out. 2043 * 2044 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 2045 * The value of @inlen after return is the number of octets consumed 2046 * as the return value is positive, else unpredictable. 2047 * The value of @outlen after return is the number of octets consumed. 2048 */ 2049 int 2050 htmlEncodeEntities(unsigned char* out, int *outlen, 2051 const unsigned char* in, int *inlen, int quoteChar) { 2052 const unsigned char* processed = in; 2053 const unsigned char* outend; 2054 const unsigned char* outstart = out; 2055 const unsigned char* instart = in; 2056 const unsigned char* inend; 2057 unsigned int c, d; 2058 int trailing; 2059 2060 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) 2061 return(-1); 2062 outend = out + (*outlen); 2063 inend = in + (*inlen); 2064 while (in < inend) { 2065 d = *in++; 2066 if (d < 0x80) { c= d; trailing= 0; } 2067 else if (d < 0xC0) { 2068 /* trailing byte in leading position */ 2069 *outlen = out - outstart; 2070 *inlen = processed - instart; 2071 return(-2); 2072 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 2073 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 2074 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 2075 else { 2076 /* no chance for this in Ascii */ 2077 *outlen = out - outstart; 2078 *inlen = processed - instart; 2079 return(-2); 2080 } 2081 2082 if (inend - in < trailing) 2083 break; 2084 2085 while (trailing--) { 2086 if (((d= *in++) & 0xC0) 
!= 0x80) { 2087 *outlen = out - outstart; 2088 *inlen = processed - instart; 2089 return(-2); 2090 } 2091 c <<= 6; 2092 c |= d & 0x3F; 2093 } 2094 2095 /* assertion: c is a single UTF-4 value */ 2096 if ((c < 0x80) && (c != (unsigned int) quoteChar) && 2097 (c != '&') && (c != '<') && (c != '>')) { 2098 if (out >= outend) 2099 break; 2100 *out++ = c; 2101 } else { 2102 const htmlEntityDesc * ent; 2103 const char *cp; 2104 char nbuf[16]; 2105 int len; 2106 2107 /* 2108 * Try to lookup a predefined HTML entity for it 2109 */ 2110 ent = htmlEntityValueLookup(c); 2111 if (ent == NULL) { 2112 snprintf(nbuf, sizeof(nbuf), "#%u", c); 2113 cp = nbuf; 2114 } 2115 else 2116 cp = ent->name; 2117 len = strlen(cp); 2118 if (out + 2 + len > outend) 2119 break; 2120 *out++ = '&'; 2121 memcpy(out, cp, len); 2122 out += len; 2123 *out++ = ';'; 2124 } 2125 processed = in; 2126 } 2127 *outlen = out - outstart; 2128 *inlen = processed - instart; 2129 return(0); 2130 } 2131 2132 /************************************************************************ 2133 * * 2134 * Commodity functions to handle streams * 2135 * * 2136 ************************************************************************/ 2137 2138 /** 2139 * htmlNewInputStream: 2140 * @ctxt: an HTML parser context 2141 * 2142 * Create a new input stream structure 2143 * Returns the new input stream or NULL 2144 */ 2145 static htmlParserInputPtr 2146 htmlNewInputStream(htmlParserCtxtPtr ctxt) { 2147 htmlParserInputPtr input; 2148 2149 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput)); 2150 if (input == NULL) { 2151 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n"); 2152 return(NULL); 2153 } 2154 memset(input, 0, sizeof(htmlParserInput)); 2155 input->filename = NULL; 2156 input->directory = NULL; 2157 input->base = NULL; 2158 input->cur = NULL; 2159 input->buf = NULL; 2160 input->line = 1; 2161 input->col = 1; 2162 input->buf = NULL; 2163 input->free = NULL; 2164 input->version = NULL; 2165 input->consumed = 
0; 2166 input->length = 0; 2167 return(input); 2168 } 2169 2170 2171 /************************************************************************ 2172 * * 2173 * Commodity functions, cleanup needed ? * 2174 * * 2175 ************************************************************************/ 2176 /* 2177 * all tags allowing pc data from the html 4.01 loose dtd 2178 * NOTE: it might be more apropriate to integrate this information 2179 * into the html40ElementTable array but I don't want to risk any 2180 * binary incomptibility 2181 */ 2182 static const char *allowPCData[] = { 2183 "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big", 2184 "blockquote", "body", "button", "caption", "center", "cite", "code", 2185 "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2", 2186 "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend", 2187 "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp", 2188 "small", "span", "strike", "strong", "td", "th", "tt", "u", "var" 2189 }; 2190 2191 /** 2192 * areBlanks: 2193 * @ctxt: an HTML parser context 2194 * @str: a xmlChar * 2195 * @len: the size of @str 2196 * 2197 * Is this a sequence of blank chars that one can ignore ? 2198 * 2199 * Returns 1 if ignorable 0 otherwise. 
2200 */ 2201 2202 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) { 2203 unsigned int i; 2204 int j; 2205 xmlNodePtr lastChild; 2206 xmlDtdPtr dtd; 2207 2208 for (j = 0;j < len;j++) 2209 if (!(IS_BLANK_CH(str[j]))) return(0); 2210 2211 if (CUR == 0) return(1); 2212 if (CUR != '<') return(0); 2213 if (ctxt->name == NULL) 2214 return(1); 2215 if (xmlStrEqual(ctxt->name, BAD_CAST"html")) 2216 return(1); 2217 if (xmlStrEqual(ctxt->name, BAD_CAST"head")) 2218 return(1); 2219 2220 /* Only strip CDATA children of the body tag for strict HTML DTDs */ 2221 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) { 2222 dtd = xmlGetIntSubset(ctxt->myDoc); 2223 if (dtd != NULL && dtd->ExternalID != NULL) { 2224 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") || 2225 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN")) 2226 return(1); 2227 } 2228 } 2229 2230 if (ctxt->node == NULL) return(0); 2231 lastChild = xmlGetLastChild(ctxt->node); 2232 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE)) 2233 lastChild = lastChild->prev; 2234 if (lastChild == NULL) { 2235 if ((ctxt->node->type != XML_ELEMENT_NODE) && 2236 (ctxt->node->content != NULL)) return(0); 2237 /* keep ws in constructs like ...<b> </b>... 
2238 for all tags "b" allowing PCDATA */ 2239 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2240 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) { 2241 return(0); 2242 } 2243 } 2244 } else if (xmlNodeIsText(lastChild)) { 2245 return(0); 2246 } else { 2247 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p> 2248 for all tags "p" allowing PCDATA */ 2249 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2250 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) { 2251 return(0); 2252 } 2253 } 2254 } 2255 return(1); 2256 } 2257 2258 /** 2259 * htmlNewDocNoDtD: 2260 * @URI: URI for the dtd, or NULL 2261 * @ExternalID: the external ID of the DTD, or NULL 2262 * 2263 * Creates a new HTML document without a DTD node if @URI and @ExternalID 2264 * are NULL 2265 * 2266 * Returns a new document, do not initialize the DTD if not provided 2267 */ 2268 htmlDocPtr 2269 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) { 2270 xmlDocPtr cur; 2271 2272 /* 2273 * Allocate a new document and fill the fields. 
2274 */ 2275 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc)); 2276 if (cur == NULL) { 2277 htmlErrMemory(NULL, "HTML document creation failed\n"); 2278 return(NULL); 2279 } 2280 memset(cur, 0, sizeof(xmlDoc)); 2281 2282 cur->type = XML_HTML_DOCUMENT_NODE; 2283 cur->version = NULL; 2284 cur->intSubset = NULL; 2285 cur->doc = cur; 2286 cur->name = NULL; 2287 cur->children = NULL; 2288 cur->extSubset = NULL; 2289 cur->oldNs = NULL; 2290 cur->encoding = NULL; 2291 cur->standalone = 1; 2292 cur->compression = 0; 2293 cur->ids = NULL; 2294 cur->refs = NULL; 2295 cur->_private = NULL; 2296 cur->charset = XML_CHAR_ENCODING_UTF8; 2297 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT; 2298 if ((ExternalID != NULL) || 2299 (URI != NULL)) 2300 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI); 2301 return(cur); 2302 } 2303 2304 /** 2305 * htmlNewDoc: 2306 * @URI: URI for the dtd, or NULL 2307 * @ExternalID: the external ID of the DTD, or NULL 2308 * 2309 * Creates a new HTML document 2310 * 2311 * Returns a new document 2312 */ 2313 htmlDocPtr 2314 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { 2315 if ((URI == NULL) && (ExternalID == NULL)) 2316 return(htmlNewDocNoDtD( 2317 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd", 2318 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN")); 2319 2320 return(htmlNewDocNoDtD(URI, ExternalID)); 2321 } 2322 2323 2324 /************************************************************************ 2325 * * 2326 * The parser itself * 2327 * Relates to http://www.w3.org/TR/html40 * 2328 * * 2329 ************************************************************************/ 2330 2331 /************************************************************************ 2332 * * 2333 * The parser itself * 2334 * * 2335 ************************************************************************/ 2336 2337 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt); 2338 2339 /** 2340 * htmlParseHTMLName: 2341 * @ctxt: an HTML parser context 2342 
* 2343 * parse an HTML tag or attribute name, note that we convert it to lowercase 2344 * since HTML names are not case-sensitive. 2345 * 2346 * Returns the Tag Name parsed or NULL 2347 */ 2348 2349 static const xmlChar * 2350 htmlParseHTMLName(htmlParserCtxtPtr ctxt) { 2351 int i = 0; 2352 xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2353 2354 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') && 2355 (CUR != ':') && (CUR != '.')) return(NULL); 2356 2357 while ((i < HTML_PARSER_BUFFER_SIZE) && 2358 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) || 2359 (CUR == ':') || (CUR == '-') || (CUR == '_') || 2360 (CUR == '.'))) { 2361 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20; 2362 else loc[i] = CUR; 2363 i++; 2364 2365 NEXT; 2366 } 2367 2368 return(xmlDictLookup(ctxt->dict, loc, i)); 2369 } 2370 2371 2372 /** 2373 * htmlParseHTMLName_nonInvasive: 2374 * @ctxt: an HTML parser context 2375 * 2376 * parse an HTML tag or attribute name, note that we convert it to lowercase 2377 * since HTML names are not case-sensitive, this doesn't consume the data 2378 * from the stream, it's a look-ahead 2379 * 2380 * Returns the Tag Name parsed or NULL 2381 */ 2382 2383 static const xmlChar * 2384 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) { 2385 int i = 0; 2386 xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2387 2388 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') && 2389 (NXT(1) != ':')) return(NULL); 2390 2391 while ((i < HTML_PARSER_BUFFER_SIZE) && 2392 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) || 2393 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) { 2394 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20; 2395 else loc[i] = NXT(1+i); 2396 i++; 2397 } 2398 2399 return(xmlDictLookup(ctxt->dict, loc, i)); 2400 } 2401 2402 2403 /** 2404 * htmlParseName: 2405 * @ctxt: an HTML parser context 2406 * 2407 * parse an HTML name, this routine is case sensitive. 
2408 * 2409 * Returns the Name parsed or NULL 2410 */ 2411 2412 static const xmlChar * 2413 htmlParseName(htmlParserCtxtPtr ctxt) { 2414 const xmlChar *in; 2415 const xmlChar *ret; 2416 int count = 0; 2417 2418 GROW; 2419 2420 /* 2421 * Accelerator for simple ASCII names 2422 */ 2423 in = ctxt->input->cur; 2424 if (((*in >= 0x61) && (*in <= 0x7A)) || 2425 ((*in >= 0x41) && (*in <= 0x5A)) || 2426 (*in == '_') || (*in == ':')) { 2427 in++; 2428 while (((*in >= 0x61) && (*in <= 0x7A)) || 2429 ((*in >= 0x41) && (*in <= 0x5A)) || 2430 ((*in >= 0x30) && (*in <= 0x39)) || 2431 (*in == '_') || (*in == '-') || 2432 (*in == ':') || (*in == '.')) 2433 in++; 2434 if ((*in > 0) && (*in < 0x80)) { 2435 count = in - ctxt->input->cur; 2436 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count); 2437 ctxt->input->cur = in; 2438 ctxt->nbChars += count; 2439 ctxt->input->col += count; 2440 return(ret); 2441 } 2442 } 2443 return(htmlParseNameComplex(ctxt)); 2444 } 2445 2446 static const xmlChar * 2447 htmlParseNameComplex(xmlParserCtxtPtr ctxt) { 2448 int len = 0, l; 2449 int c; 2450 int count = 0; 2451 2452 /* 2453 * Handler for more complex cases 2454 */ 2455 GROW; 2456 c = CUR_CHAR(l); 2457 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */ 2458 (!IS_LETTER(c) && (c != '_') && 2459 (c != ':'))) { 2460 return(NULL); 2461 } 2462 2463 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */ 2464 ((IS_LETTER(c)) || (IS_DIGIT(c)) || 2465 (c == '.') || (c == '-') || 2466 (c == '_') || (c == ':') || 2467 (IS_COMBINING(c)) || 2468 (IS_EXTENDER(c)))) { 2469 if (count++ > 100) { 2470 count = 0; 2471 GROW; 2472 } 2473 len += l; 2474 NEXTL(l); 2475 c = CUR_CHAR(l); 2476 } 2477 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len)); 2478 } 2479 2480 2481 /** 2482 * htmlParseHTMLAttribute: 2483 * @ctxt: an HTML parser context 2484 * @stop: a char stop value 2485 * 2486 * parse an HTML attribute value till the stop (quote), if 2487 * stop is 0 then it stops 
at the first space 2488 * 2489 * Returns the attribute parsed or NULL 2490 */ 2491 2492 static xmlChar * 2493 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) { 2494 xmlChar *buffer = NULL; 2495 int buffer_size = 0; 2496 xmlChar *out = NULL; 2497 const xmlChar *name = NULL; 2498 const xmlChar *cur = NULL; 2499 const htmlEntityDesc * ent; 2500 2501 /* 2502 * allocate a translation buffer. 2503 */ 2504 buffer_size = HTML_PARSER_BUFFER_SIZE; 2505 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar)); 2506 if (buffer == NULL) { 2507 htmlErrMemory(ctxt, "buffer allocation failed\n"); 2508 return(NULL); 2509 } 2510 out = buffer; 2511 2512 /* 2513 * Ok loop until we reach one of the ending chars 2514 */ 2515 while ((CUR != 0) && (CUR != stop)) { 2516 if ((stop == 0) && (CUR == '>')) break; 2517 if ((stop == 0) && (IS_BLANK_CH(CUR))) break; 2518 if (CUR == '&') { 2519 if (NXT(1) == '#') { 2520 unsigned int c; 2521 int bits; 2522 2523 c = htmlParseCharRef(ctxt); 2524 if (c < 0x80) 2525 { *out++ = c; bits= -6; } 2526 else if (c < 0x800) 2527 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2528 else if (c < 0x10000) 2529 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2530 else 2531 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2532 2533 for ( ; bits >= 0; bits-= 6) { 2534 *out++ = ((c >> bits) & 0x3F) | 0x80; 2535 } 2536 2537 if (out - buffer > buffer_size - 100) { 2538 int indx = out - buffer; 2539 2540 growBuffer(buffer); 2541 out = &buffer[indx]; 2542 } 2543 } else { 2544 ent = htmlParseEntityRef(ctxt, &name); 2545 if (name == NULL) { 2546 *out++ = '&'; 2547 if (out - buffer > buffer_size - 100) { 2548 int indx = out - buffer; 2549 2550 growBuffer(buffer); 2551 out = &buffer[indx]; 2552 } 2553 } else if (ent == NULL) { 2554 *out++ = '&'; 2555 cur = name; 2556 while (*cur != 0) { 2557 if (out - buffer > buffer_size - 100) { 2558 int indx = out - buffer; 2559 2560 growBuffer(buffer); 2561 out = &buffer[indx]; 2562 } 2563 *out++ = *cur++; 
2564 } 2565 } else { 2566 unsigned int c; 2567 int bits; 2568 2569 if (out - buffer > buffer_size - 100) { 2570 int indx = out - buffer; 2571 2572 growBuffer(buffer); 2573 out = &buffer[indx]; 2574 } 2575 c = ent->value; 2576 if (c < 0x80) 2577 { *out++ = c; bits= -6; } 2578 else if (c < 0x800) 2579 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2580 else if (c < 0x10000) 2581 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2582 else 2583 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2584 2585 for ( ; bits >= 0; bits-= 6) { 2586 *out++ = ((c >> bits) & 0x3F) | 0x80; 2587 } 2588 } 2589 } 2590 } else { 2591 unsigned int c; 2592 int bits, l; 2593 2594 if (out - buffer > buffer_size - 100) { 2595 int indx = out - buffer; 2596 2597 growBuffer(buffer); 2598 out = &buffer[indx]; 2599 } 2600 c = CUR_CHAR(l); 2601 if (c < 0x80) 2602 { *out++ = c; bits= -6; } 2603 else if (c < 0x800) 2604 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2605 else if (c < 0x10000) 2606 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2607 else 2608 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2609 2610 for ( ; bits >= 0; bits-= 6) { 2611 *out++ = ((c >> bits) & 0x3F) | 0x80; 2612 } 2613 NEXT; 2614 } 2615 } 2616 *out = 0; 2617 return(buffer); 2618 } 2619 2620 /** 2621 * htmlParseEntityRef: 2622 * @ctxt: an HTML parser context 2623 * @str: location to store the entity name 2624 * 2625 * parse an HTML ENTITY references 2626 * 2627 * [68] EntityRef ::= '&' Name ';' 2628 * 2629 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise, 2630 * if non-NULL *str will have to be freed by the caller. 
2631 */ 2632 const htmlEntityDesc * 2633 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) { 2634 const xmlChar *name; 2635 const htmlEntityDesc * ent = NULL; 2636 2637 if (str != NULL) *str = NULL; 2638 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL); 2639 2640 if (CUR == '&') { 2641 NEXT; 2642 name = htmlParseName(ctxt); 2643 if (name == NULL) { 2644 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 2645 "htmlParseEntityRef: no name\n", NULL, NULL); 2646 } else { 2647 GROW; 2648 if (CUR == ';') { 2649 if (str != NULL) 2650 *str = name; 2651 2652 /* 2653 * Lookup the entity in the table. 2654 */ 2655 ent = htmlEntityLookup(name); 2656 if (ent != NULL) /* OK that's ugly !!! */ 2657 NEXT; 2658 } else { 2659 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING, 2660 "htmlParseEntityRef: expecting ';'\n", 2661 NULL, NULL); 2662 if (str != NULL) 2663 *str = name; 2664 } 2665 } 2666 } 2667 return(ent); 2668 } 2669 2670 /** 2671 * htmlParseAttValue: 2672 * @ctxt: an HTML parser context 2673 * 2674 * parse a value for an attribute 2675 * Note: the parser won't do substitution of entities here, this 2676 * will be handled later in xmlStringGetNodeList, unless it was 2677 * asked for ctxt->replaceEntities != 0 2678 * 2679 * Returns the AttValue parsed or NULL. 
2680 */ 2681 2682 static xmlChar * 2683 htmlParseAttValue(htmlParserCtxtPtr ctxt) { 2684 xmlChar *ret = NULL; 2685 2686 if (CUR == '"') { 2687 NEXT; 2688 ret = htmlParseHTMLAttribute(ctxt, '"'); 2689 if (CUR != '"') { 2690 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2691 "AttValue: \" expected\n", NULL, NULL); 2692 } else 2693 NEXT; 2694 } else if (CUR == '\'') { 2695 NEXT; 2696 ret = htmlParseHTMLAttribute(ctxt, '\''); 2697 if (CUR != '\'') { 2698 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2699 "AttValue: ' expected\n", NULL, NULL); 2700 } else 2701 NEXT; 2702 } else { 2703 /* 2704 * That's an HTMLism, the attribute value may not be quoted 2705 */ 2706 ret = htmlParseHTMLAttribute(ctxt, 0); 2707 if (ret == NULL) { 2708 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE, 2709 "AttValue: no value found\n", NULL, NULL); 2710 } 2711 } 2712 return(ret); 2713 } 2714 2715 /** 2716 * htmlParseSystemLiteral: 2717 * @ctxt: an HTML parser context 2718 * 2719 * parse an HTML Literal 2720 * 2721 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") 2722 * 2723 * Returns the SystemLiteral parsed or NULL 2724 */ 2725 2726 static xmlChar * 2727 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { 2728 const xmlChar *q; 2729 xmlChar *ret = NULL; 2730 2731 if (CUR == '"') { 2732 NEXT; 2733 q = CUR_PTR; 2734 while ((IS_CHAR_CH(CUR)) && (CUR != '"')) 2735 NEXT; 2736 if (!IS_CHAR_CH(CUR)) { 2737 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2738 "Unfinished SystemLiteral\n", NULL, NULL); 2739 } else { 2740 ret = xmlStrndup(q, CUR_PTR - q); 2741 NEXT; 2742 } 2743 } else if (CUR == '\'') { 2744 NEXT; 2745 q = CUR_PTR; 2746 while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) 2747 NEXT; 2748 if (!IS_CHAR_CH(CUR)) { 2749 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2750 "Unfinished SystemLiteral\n", NULL, NULL); 2751 } else { 2752 ret = xmlStrndup(q, CUR_PTR - q); 2753 NEXT; 2754 } 2755 } else { 2756 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2757 " or ' expected\n", 
NULL, NULL); 2758 } 2759 2760 return(ret); 2761 } 2762 2763 /** 2764 * htmlParsePubidLiteral: 2765 * @ctxt: an HTML parser context 2766 * 2767 * parse an HTML public literal 2768 * 2769 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" 2770 * 2771 * Returns the PubidLiteral parsed or NULL. 2772 */ 2773 2774 static xmlChar * 2775 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { 2776 const xmlChar *q; 2777 xmlChar *ret = NULL; 2778 /* 2779 * Name ::= (Letter | '_') (NameChar)* 2780 */ 2781 if (CUR == '"') { 2782 NEXT; 2783 q = CUR_PTR; 2784 while (IS_PUBIDCHAR_CH(CUR)) NEXT; 2785 if (CUR != '"') { 2786 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2787 "Unfinished PubidLiteral\n", NULL, NULL); 2788 } else { 2789 ret = xmlStrndup(q, CUR_PTR - q); 2790 NEXT; 2791 } 2792 } else if (CUR == '\'') { 2793 NEXT; 2794 q = CUR_PTR; 2795 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')) 2796 NEXT; 2797 if (CUR != '\'') { 2798 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2799 "Unfinished PubidLiteral\n", NULL, NULL); 2800 } else { 2801 ret = xmlStrndup(q, CUR_PTR - q); 2802 NEXT; 2803 } 2804 } else { 2805 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2806 "PubidLiteral \" or ' expected\n", NULL, NULL); 2807 } 2808 2809 return(ret); 2810 } 2811 2812 /** 2813 * htmlParseScript: 2814 * @ctxt: an HTML parser context 2815 * 2816 * parse the content of an HTML SCRIPT or STYLE element 2817 * http://www.w3.org/TR/html4/sgml/dtd.html#Script 2818 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet 2819 * http://www.w3.org/TR/html4/types.html#type-script 2820 * http://www.w3.org/TR/html4/types.html#h-6.15 2821 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1 2822 * 2823 * Script data ( %Script; in the DTD) can be the content of the SCRIPT 2824 * element and the value of intrinsic event attributes. User agents must 2825 * not evaluate script data as HTML markup but instead must pass it on as 2826 * data to a script engine. 
2827 * NOTES: 2828 * - The content is passed like CDATA 2829 * - the attributes for style and scripting "onXXX" are also described 2830 * as CDATA but SGML allows entities references in attributes so their 2831 * processing is identical as other attributes 2832 */ 2833 static void 2834 htmlParseScript(htmlParserCtxtPtr ctxt) { 2835 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 2836 int nbchar = 0; 2837 int cur,l; 2838 2839 SHRINK; 2840 cur = CUR_CHAR(l); 2841 while (IS_CHAR_CH(cur)) { 2842 if ((cur == '<') && (NXT(1) == '/')) { 2843 /* 2844 * One should break here, the specification is clear: 2845 * Authors should therefore escape "</" within the content. 2846 * Escape mechanisms are specific to each scripting or 2847 * style sheet language. 2848 * 2849 * In recovery mode, only break if end tag match the 2850 * current tag, effectively ignoring all tags inside the 2851 * script/style block and treating the entire block as 2852 * CDATA. 2853 */ 2854 if (ctxt->recovery) { 2855 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2, 2856 xmlStrlen(ctxt->name)) == 0) 2857 { 2858 break; /* while */ 2859 } else { 2860 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 2861 "Element %s embeds close tag\n", 2862 ctxt->name, NULL); 2863 } 2864 } else { 2865 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) || 2866 ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) 2867 { 2868 break; /* while */ 2869 } 2870 } 2871 } 2872 COPY_BUF(l,buf,nbchar,cur); 2873 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 2874 if (ctxt->sax->cdataBlock!= NULL) { 2875 /* 2876 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2877 */ 2878 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2879 } else if (ctxt->sax->characters != NULL) { 2880 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2881 } 2882 nbchar = 0; 2883 } 2884 GROW; 2885 NEXTL(l); 2886 cur = CUR_CHAR(l); 2887 } 2888 2889 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) { 2890 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 2891 "Invalid 
char in CDATA 0x%X\n", cur); 2892 NEXT; 2893 } 2894 2895 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2896 if (ctxt->sax->cdataBlock!= NULL) { 2897 /* 2898 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2899 */ 2900 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2901 } else if (ctxt->sax->characters != NULL) { 2902 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2903 } 2904 } 2905 } 2906 2907 2908 /** 2909 * htmlParseCharData: 2910 * @ctxt: an HTML parser context 2911 * 2912 * parse a CharData section. 2913 * if we are within a CDATA section ']]>' marks an end of section. 2914 * 2915 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) 2916 */ 2917 2918 static void 2919 htmlParseCharData(htmlParserCtxtPtr ctxt) { 2920 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 2921 int nbchar = 0; 2922 int cur, l; 2923 int chunk = 0; 2924 2925 SHRINK; 2926 cur = CUR_CHAR(l); 2927 while (((cur != '<') || (ctxt->token == '<')) && 2928 ((cur != '&') || (ctxt->token == '&')) && 2929 (cur != 0)) { 2930 if (!(IS_CHAR(cur))) { 2931 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 2932 "Invalid char in CDATA 0x%X\n", cur); 2933 } else { 2934 COPY_BUF(l,buf,nbchar,cur); 2935 } 2936 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 2937 /* 2938 * Ok the segment is to be consumed as chars. 
2939 */ 2940 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2941 if (areBlanks(ctxt, buf, nbchar)) { 2942 if (ctxt->sax->ignorableWhitespace != NULL) 2943 ctxt->sax->ignorableWhitespace(ctxt->userData, 2944 buf, nbchar); 2945 } else { 2946 htmlCheckParagraph(ctxt); 2947 if (ctxt->sax->characters != NULL) 2948 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2949 } 2950 } 2951 nbchar = 0; 2952 } 2953 NEXTL(l); 2954 chunk++; 2955 if (chunk > HTML_PARSER_BUFFER_SIZE) { 2956 chunk = 0; 2957 SHRINK; 2958 GROW; 2959 } 2960 cur = CUR_CHAR(l); 2961 if (cur == 0) { 2962 SHRINK; 2963 GROW; 2964 cur = CUR_CHAR(l); 2965 } 2966 } 2967 if (nbchar != 0) { 2968 buf[nbchar] = 0; 2969 2970 /* 2971 * Ok the segment is to be consumed as chars. 2972 */ 2973 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2974 if (areBlanks(ctxt, buf, nbchar)) { 2975 if (ctxt->sax->ignorableWhitespace != NULL) 2976 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar); 2977 } else { 2978 htmlCheckParagraph(ctxt); 2979 if (ctxt->sax->characters != NULL) 2980 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2981 } 2982 } 2983 } else { 2984 /* 2985 * Loop detection 2986 */ 2987 if (cur == 0) 2988 ctxt->instate = XML_PARSER_EOF; 2989 } 2990 } 2991 2992 /** 2993 * htmlParseExternalID: 2994 * @ctxt: an HTML parser context 2995 * @publicID: a xmlChar** receiving PubidLiteral 2996 * 2997 * Parse an External ID or a Public ID 2998 * 2999 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral 3000 * | 'PUBLIC' S PubidLiteral S SystemLiteral 3001 * 3002 * [83] PublicID ::= 'PUBLIC' S PubidLiteral 3003 * 3004 * Returns the function returns SystemLiteral and in the second 3005 * case publicID receives PubidLiteral, is strict is off 3006 * it is possible to return NULL and have publicID set. 
3007 */ 3008 3009 static xmlChar * 3010 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) { 3011 xmlChar *URI = NULL; 3012 3013 if ((UPPER == 'S') && (UPP(1) == 'Y') && 3014 (UPP(2) == 'S') && (UPP(3) == 'T') && 3015 (UPP(4) == 'E') && (UPP(5) == 'M')) { 3016 SKIP(6); 3017 if (!IS_BLANK_CH(CUR)) { 3018 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3019 "Space required after 'SYSTEM'\n", NULL, NULL); 3020 } 3021 SKIP_BLANKS; 3022 URI = htmlParseSystemLiteral(ctxt); 3023 if (URI == NULL) { 3024 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED, 3025 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL); 3026 } 3027 } else if ((UPPER == 'P') && (UPP(1) == 'U') && 3028 (UPP(2) == 'B') && (UPP(3) == 'L') && 3029 (UPP(4) == 'I') && (UPP(5) == 'C')) { 3030 SKIP(6); 3031 if (!IS_BLANK_CH(CUR)) { 3032 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3033 "Space required after 'PUBLIC'\n", NULL, NULL); 3034 } 3035 SKIP_BLANKS; 3036 *publicID = htmlParsePubidLiteral(ctxt); 3037 if (*publicID == NULL) { 3038 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED, 3039 "htmlParseExternalID: PUBLIC, no Public Identifier\n", 3040 NULL, NULL); 3041 } 3042 SKIP_BLANKS; 3043 if ((CUR == '"') || (CUR == '\'')) { 3044 URI = htmlParseSystemLiteral(ctxt); 3045 } 3046 } 3047 return(URI); 3048 } 3049 3050 /** 3051 * xmlParsePI: 3052 * @ctxt: an XML parser context 3053 * 3054 * parse an XML Processing Instruction. 3055 * 3056 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' 3057 */ 3058 static void 3059 htmlParsePI(htmlParserCtxtPtr ctxt) { 3060 xmlChar *buf = NULL; 3061 int len = 0; 3062 int size = HTML_PARSER_BUFFER_SIZE; 3063 int cur, l; 3064 const xmlChar *target; 3065 xmlParserInputState state; 3066 int count = 0; 3067 3068 if ((RAW == '<') && (NXT(1) == '?')) { 3069 state = ctxt->instate; 3070 ctxt->instate = XML_PARSER_PI; 3071 /* 3072 * this is a Processing Instruction. 
3073 */ 3074 SKIP(2); 3075 SHRINK; 3076 3077 /* 3078 * Parse the target name and check for special support like 3079 * namespace. 3080 */ 3081 target = htmlParseName(ctxt); 3082 if (target != NULL) { 3083 if (RAW == '>') { 3084 SKIP(1); 3085 3086 /* 3087 * SAX: PI detected. 3088 */ 3089 if ((ctxt->sax) && (!ctxt->disableSAX) && 3090 (ctxt->sax->processingInstruction != NULL)) 3091 ctxt->sax->processingInstruction(ctxt->userData, 3092 target, NULL); 3093 ctxt->instate = state; 3094 return; 3095 } 3096 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 3097 if (buf == NULL) { 3098 htmlErrMemory(ctxt, NULL); 3099 ctxt->instate = state; 3100 return; 3101 } 3102 cur = CUR; 3103 if (!IS_BLANK(cur)) { 3104 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3105 "ParsePI: PI %s space expected\n", target, NULL); 3106 } 3107 SKIP_BLANKS; 3108 cur = CUR_CHAR(l); 3109 while (IS_CHAR(cur) && (cur != '>')) { 3110 if (len + 5 >= size) { 3111 xmlChar *tmp; 3112 3113 size *= 2; 3114 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 3115 if (tmp == NULL) { 3116 htmlErrMemory(ctxt, NULL); 3117 xmlFree(buf); 3118 ctxt->instate = state; 3119 return; 3120 } 3121 buf = tmp; 3122 } 3123 count++; 3124 if (count > 50) { 3125 GROW; 3126 count = 0; 3127 } 3128 COPY_BUF(l,buf,len,cur); 3129 NEXTL(l); 3130 cur = CUR_CHAR(l); 3131 if (cur == 0) { 3132 SHRINK; 3133 GROW; 3134 cur = CUR_CHAR(l); 3135 } 3136 } 3137 buf[len] = 0; 3138 if (cur != '>') { 3139 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED, 3140 "ParsePI: PI %s never end ...\n", target, NULL); 3141 } else { 3142 SKIP(1); 3143 3144 /* 3145 * SAX: PI detected. 
3146 */ 3147 if ((ctxt->sax) && (!ctxt->disableSAX) && 3148 (ctxt->sax->processingInstruction != NULL)) 3149 ctxt->sax->processingInstruction(ctxt->userData, 3150 target, buf); 3151 } 3152 xmlFree(buf); 3153 } else { 3154 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED, 3155 "PI is not started correctly", NULL, NULL); 3156 } 3157 ctxt->instate = state; 3158 } 3159 } 3160 3161 /** 3162 * htmlParseComment: 3163 * @ctxt: an HTML parser context 3164 * 3165 * Parse an XML (SGML) comment <!-- .... --> 3166 * 3167 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' 3168 */ 3169 static void 3170 htmlParseComment(htmlParserCtxtPtr ctxt) { 3171 xmlChar *buf = NULL; 3172 int len; 3173 int size = HTML_PARSER_BUFFER_SIZE; 3174 int q, ql; 3175 int r, rl; 3176 int cur, l; 3177 xmlParserInputState state; 3178 3179 /* 3180 * Check that there is a comment right here. 3181 */ 3182 if ((RAW != '<') || (NXT(1) != '!') || 3183 (NXT(2) != '-') || (NXT(3) != '-')) return; 3184 3185 state = ctxt->instate; 3186 ctxt->instate = XML_PARSER_COMMENT; 3187 SHRINK; 3188 SKIP(4); 3189 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 3190 if (buf == NULL) { 3191 htmlErrMemory(ctxt, "buffer allocation failed\n"); 3192 ctxt->instate = state; 3193 return; 3194 } 3195 q = CUR_CHAR(ql); 3196 NEXTL(ql); 3197 r = CUR_CHAR(rl); 3198 NEXTL(rl); 3199 cur = CUR_CHAR(l); 3200 len = 0; 3201 while (IS_CHAR(cur) && 3202 ((cur != '>') || 3203 (r != '-') || (q != '-'))) { 3204 if (len + 5 >= size) { 3205 xmlChar *tmp; 3206 3207 size *= 2; 3208 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 3209 if (tmp == NULL) { 3210 xmlFree(buf); 3211 htmlErrMemory(ctxt, "growing buffer failed\n"); 3212 ctxt->instate = state; 3213 return; 3214 } 3215 buf = tmp; 3216 } 3217 COPY_BUF(ql,buf,len,q); 3218 q = r; 3219 ql = rl; 3220 r = cur; 3221 rl = l; 3222 NEXTL(l); 3223 cur = CUR_CHAR(l); 3224 if (cur == 0) { 3225 SHRINK; 3226 GROW; 3227 cur = CUR_CHAR(l); 3228 } 3229 } 3230 buf[len] = 0; 3231 if 
(!IS_CHAR(cur)) { 3232 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, 3233 "Comment not terminated \n<!--%.50s\n", buf, NULL); 3234 xmlFree(buf); 3235 } else { 3236 NEXT; 3237 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) && 3238 (!ctxt->disableSAX)) 3239 ctxt->sax->comment(ctxt->userData, buf); 3240 xmlFree(buf); 3241 } 3242 ctxt->instate = state; 3243 } 3244 3245 /** 3246 * htmlParseCharRef: 3247 * @ctxt: an HTML parser context 3248 * 3249 * parse Reference declarations 3250 * 3251 * [66] CharRef ::= '&#' [0-9]+ ';' | 3252 * '&#x' [0-9a-fA-F]+ ';' 3253 * 3254 * Returns the value parsed (as an int) 3255 */ 3256 int 3257 htmlParseCharRef(htmlParserCtxtPtr ctxt) { 3258 int val = 0; 3259 3260 if ((ctxt == NULL) || (ctxt->input == NULL)) { 3261 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3262 "htmlParseCharRef: context error\n", 3263 NULL, NULL); 3264 return(0); 3265 } 3266 if ((CUR == '&') && (NXT(1) == '#') && 3267 ((NXT(2) == 'x') || NXT(2) == 'X')) { 3268 SKIP(3); 3269 while (CUR != ';') { 3270 if ((CUR >= '0') && (CUR <= '9')) 3271 val = val * 16 + (CUR - '0'); 3272 else if ((CUR >= 'a') && (CUR <= 'f')) 3273 val = val * 16 + (CUR - 'a') + 10; 3274 else if ((CUR >= 'A') && (CUR <= 'F')) 3275 val = val * 16 + (CUR - 'A') + 10; 3276 else { 3277 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF, 3278 "htmlParseCharRef: missing semicolumn\n", 3279 NULL, NULL); 3280 break; 3281 } 3282 NEXT; 3283 } 3284 if (CUR == ';') 3285 NEXT; 3286 } else if ((CUR == '&') && (NXT(1) == '#')) { 3287 SKIP(2); 3288 while (CUR != ';') { 3289 if ((CUR >= '0') && (CUR <= '9')) 3290 val = val * 10 + (CUR - '0'); 3291 else { 3292 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF, 3293 "htmlParseCharRef: missing semicolumn\n", 3294 NULL, NULL); 3295 break; 3296 } 3297 NEXT; 3298 } 3299 if (CUR == ';') 3300 NEXT; 3301 } else { 3302 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF, 3303 "htmlParseCharRef: invalid value\n", NULL, NULL); 3304 } 3305 /* 3306 * Check the value IS_CHAR ... 
3307 */ 3308 if (IS_CHAR(val)) { 3309 return(val); 3310 } else { 3311 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 3312 "htmlParseCharRef: invalid xmlChar value %d\n", 3313 val); 3314 } 3315 return(0); 3316 } 3317 3318 3319 /** 3320 * htmlParseDocTypeDecl: 3321 * @ctxt: an HTML parser context 3322 * 3323 * parse a DOCTYPE declaration 3324 * 3325 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? 3326 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>' 3327 */ 3328 3329 static void 3330 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { 3331 const xmlChar *name; 3332 xmlChar *ExternalID = NULL; 3333 xmlChar *URI = NULL; 3334 3335 /* 3336 * We know that '<!DOCTYPE' has been detected. 3337 */ 3338 SKIP(9); 3339 3340 SKIP_BLANKS; 3341 3342 /* 3343 * Parse the DOCTYPE name. 3344 */ 3345 name = htmlParseName(ctxt); 3346 if (name == NULL) { 3347 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3348 "htmlParseDocTypeDecl : no DOCTYPE name !\n", 3349 NULL, NULL); 3350 } 3351 /* 3352 * Check that upper(name) == "HTML" !!!!!!!!!!!!! 3353 */ 3354 3355 SKIP_BLANKS; 3356 3357 /* 3358 * Check for SystemID and ExternalID 3359 */ 3360 URI = htmlParseExternalID(ctxt, &ExternalID); 3361 SKIP_BLANKS; 3362 3363 /* 3364 * We should be at the end of the DOCTYPE declaration. 3365 */ 3366 if (CUR != '>') { 3367 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED, 3368 "DOCTYPE improperly terminated\n", NULL, NULL); 3369 /* We shouldn't try to resynchronize ... 
*/ 3370 } 3371 NEXT; 3372 3373 /* 3374 * Create or update the document accordingly to the DOCTYPE 3375 */ 3376 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) && 3377 (!ctxt->disableSAX)) 3378 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI); 3379 3380 /* 3381 * Cleanup, since we don't use all those identifiers 3382 */ 3383 if (URI != NULL) xmlFree(URI); 3384 if (ExternalID != NULL) xmlFree(ExternalID); 3385 } 3386 3387 /** 3388 * htmlParseAttribute: 3389 * @ctxt: an HTML parser context 3390 * @value: a xmlChar ** used to store the value of the attribute 3391 * 3392 * parse an attribute 3393 * 3394 * [41] Attribute ::= Name Eq AttValue 3395 * 3396 * [25] Eq ::= S? '=' S? 3397 * 3398 * With namespace: 3399 * 3400 * [NS 11] Attribute ::= QName Eq AttValue 3401 * 3402 * Also the case QName == xmlns:??? is handled independently as a namespace 3403 * definition. 3404 * 3405 * Returns the attribute name, and the value in *value. 3406 */ 3407 3408 static const xmlChar * 3409 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { 3410 const xmlChar *name; 3411 xmlChar *val = NULL; 3412 3413 *value = NULL; 3414 name = htmlParseHTMLName(ctxt); 3415 if (name == NULL) { 3416 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3417 "error parsing attribute name\n", NULL, NULL); 3418 return(NULL); 3419 } 3420 3421 /* 3422 * read the value 3423 */ 3424 SKIP_BLANKS; 3425 if (CUR == '=') { 3426 NEXT; 3427 SKIP_BLANKS; 3428 val = htmlParseAttValue(ctxt); 3429 } 3430 3431 *value = val; 3432 return(name); 3433 } 3434 3435 /** 3436 * htmlCheckEncoding: 3437 * @ctxt: an HTML parser context 3438 * @attvalue: the attribute value 3439 * 3440 * Checks an http-equiv attribute from a Meta tag to detect 3441 * the encoding 3442 * If a new encoding is detected the parser is switched to decode 3443 * it and pass UTF8 3444 */ 3445 static void 3446 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { 3447 const xmlChar *encoding; 3448 3449 if ((ctxt == 
NULL) || (attvalue == NULL)) 3450 return; 3451 3452 /* do not change encoding */ 3453 if (ctxt->input->encoding != NULL) 3454 return; 3455 3456 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset="); 3457 if (encoding != NULL) { 3458 encoding += 8; 3459 } else { 3460 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset ="); 3461 if (encoding != NULL) 3462 encoding += 9; 3463 } 3464 if (encoding != NULL) { 3465 xmlCharEncoding enc; 3466 xmlCharEncodingHandlerPtr handler; 3467 3468 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; 3469 3470 if (ctxt->input->encoding != NULL) 3471 xmlFree((xmlChar *) ctxt->input->encoding); 3472 ctxt->input->encoding = xmlStrdup(encoding); 3473 3474 enc = xmlParseCharEncoding((const char *) encoding); 3475 /* 3476 * registered set of known encodings 3477 */ 3478 if (enc != XML_CHAR_ENCODING_ERROR) { 3479 if (((enc == XML_CHAR_ENCODING_UTF16LE) || 3480 (enc == XML_CHAR_ENCODING_UTF16BE) || 3481 (enc == XML_CHAR_ENCODING_UCS4LE) || 3482 (enc == XML_CHAR_ENCODING_UCS4BE)) && 3483 (ctxt->input->buf != NULL) && 3484 (ctxt->input->buf->encoder == NULL)) { 3485 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 3486 "htmlCheckEncoding: wrong encoding meta\n", 3487 NULL, NULL); 3488 } else { 3489 xmlSwitchEncoding(ctxt, enc); 3490 } 3491 ctxt->charset = XML_CHAR_ENCODING_UTF8; 3492 } else { 3493 /* 3494 * fallback for unknown encodings 3495 */ 3496 handler = xmlFindCharEncodingHandler((const char *) encoding); 3497 if (handler != NULL) { 3498 xmlSwitchToEncoding(ctxt, handler); 3499 ctxt->charset = XML_CHAR_ENCODING_UTF8; 3500 } else { 3501 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; 3502 } 3503 } 3504 3505 if ((ctxt->input->buf != NULL) && 3506 (ctxt->input->buf->encoder != NULL) && 3507 (ctxt->input->buf->raw != NULL) && 3508 (ctxt->input->buf->buffer != NULL)) { 3509 int nbchars; 3510 int processed; 3511 3512 /* 3513 * convert as much as possible to the parser reading buffer. 
3514 */ 3515 processed = ctxt->input->cur - ctxt->input->base; 3516 xmlBufferShrink(ctxt->input->buf->buffer, processed); 3517 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder, 3518 ctxt->input->buf->buffer, 3519 ctxt->input->buf->raw); 3520 if (nbchars < 0) { 3521 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 3522 "htmlCheckEncoding: encoder error\n", 3523 NULL, NULL); 3524 } 3525 ctxt->input->base = 3526 ctxt->input->cur = ctxt->input->buf->buffer->content; 3527 ctxt->input->end = 3528 &ctxt->input->base[ctxt->input->buf->buffer->use]; 3529 } 3530 } 3531 } 3532 3533 /** 3534 * htmlCheckMeta: 3535 * @ctxt: an HTML parser context 3536 * @atts: the attributes values 3537 * 3538 * Checks an attributes from a Meta tag 3539 */ 3540 static void 3541 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) { 3542 int i; 3543 const xmlChar *att, *value; 3544 int http = 0; 3545 const xmlChar *content = NULL; 3546 3547 if ((ctxt == NULL) || (atts == NULL)) 3548 return; 3549 3550 i = 0; 3551 att = atts[i++]; 3552 while (att != NULL) { 3553 value = atts[i++]; 3554 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv")) 3555 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 3556 http = 1; 3557 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content"))) 3558 content = value; 3559 att = atts[i++]; 3560 } 3561 if ((http) && (content != NULL)) 3562 htmlCheckEncoding(ctxt, content); 3563 3564 } 3565 3566 /** 3567 * htmlParseStartTag: 3568 * @ctxt: an HTML parser context 3569 * 3570 * parse a start of tag either for rule element or 3571 * EmptyElement. In both case we don't parse the tag closing chars. 3572 * 3573 * [40] STag ::= '<' Name (S Attribute)* S? '>' 3574 * 3575 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' 3576 * 3577 * With namespace: 3578 * 3579 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>' 3580 * 3581 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? 
'/>' 3582 * 3583 * Returns 0 in case of success, -1 in case of error and 1 if discarded 3584 */ 3585 3586 static int 3587 htmlParseStartTag(htmlParserCtxtPtr ctxt) { 3588 const xmlChar *name; 3589 const xmlChar *attname; 3590 xmlChar *attvalue; 3591 const xmlChar **atts; 3592 int nbatts = 0; 3593 int maxatts; 3594 int meta = 0; 3595 int i; 3596 int discardtag = 0; 3597 3598 if (ctxt->instate == XML_PARSER_EOF) 3599 return(-1); 3600 if ((ctxt == NULL) || (ctxt->input == NULL)) { 3601 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3602 "htmlParseStartTag: context error\n", NULL, NULL); 3603 return -1; 3604 } 3605 if (CUR != '<') return -1; 3606 NEXT; 3607 3608 atts = ctxt->atts; 3609 maxatts = ctxt->maxatts; 3610 3611 GROW; 3612 name = htmlParseHTMLName(ctxt); 3613 if (name == NULL) { 3614 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3615 "htmlParseStartTag: invalid element name\n", 3616 NULL, NULL); 3617 /* Dump the bogus tag like browsers do */ 3618 while ((IS_CHAR_CH(CUR)) && (CUR != '>') && 3619 (ctxt->instate != XML_PARSER_EOF)) 3620 NEXT; 3621 return -1; 3622 } 3623 if (xmlStrEqual(name, BAD_CAST"meta")) 3624 meta = 1; 3625 3626 /* 3627 * Check for auto-closure of HTML elements. 3628 */ 3629 htmlAutoClose(ctxt, name); 3630 3631 /* 3632 * Check for implied HTML elements. 
3633 */ 3634 htmlCheckImplied(ctxt, name); 3635 3636 /* 3637 * Avoid html at any level > 0, head at any level != 1 3638 * or any attempt to recurse body 3639 */ 3640 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) { 3641 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3642 "htmlParseStartTag: misplaced <html> tag\n", 3643 name, NULL); 3644 discardtag = 1; 3645 ctxt->depth++; 3646 } 3647 if ((ctxt->nameNr != 1) && 3648 (xmlStrEqual(name, BAD_CAST"head"))) { 3649 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3650 "htmlParseStartTag: misplaced <head> tag\n", 3651 name, NULL); 3652 discardtag = 1; 3653 ctxt->depth++; 3654 } 3655 if (xmlStrEqual(name, BAD_CAST"body")) { 3656 int indx; 3657 for (indx = 0;indx < ctxt->nameNr;indx++) { 3658 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) { 3659 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3660 "htmlParseStartTag: misplaced <body> tag\n", 3661 name, NULL); 3662 discardtag = 1; 3663 ctxt->depth++; 3664 } 3665 } 3666 } 3667 3668 /* 3669 * Now parse the attributes, it ends up with the ending 3670 * 3671 * (S Attribute)* S? 
3672 */ 3673 SKIP_BLANKS; 3674 while ((IS_CHAR_CH(CUR)) && 3675 (CUR != '>') && 3676 ((CUR != '/') || (NXT(1) != '>'))) { 3677 long cons = ctxt->nbChars; 3678 3679 GROW; 3680 attname = htmlParseAttribute(ctxt, &attvalue); 3681 if (attname != NULL) { 3682 3683 /* 3684 * Well formedness requires at most one declaration of an attribute 3685 */ 3686 for (i = 0; i < nbatts;i += 2) { 3687 if (xmlStrEqual(atts[i], attname)) { 3688 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED, 3689 "Attribute %s redefined\n", attname, NULL); 3690 if (attvalue != NULL) 3691 xmlFree(attvalue); 3692 goto failed; 3693 } 3694 } 3695 3696 /* 3697 * Add the pair to atts 3698 */ 3699 if (atts == NULL) { 3700 maxatts = 22; /* allow for 10 attrs by default */ 3701 atts = (const xmlChar **) 3702 xmlMalloc(maxatts * sizeof(xmlChar *)); 3703 if (atts == NULL) { 3704 htmlErrMemory(ctxt, NULL); 3705 if (attvalue != NULL) 3706 xmlFree(attvalue); 3707 goto failed; 3708 } 3709 ctxt->atts = atts; 3710 ctxt->maxatts = maxatts; 3711 } else if (nbatts + 4 > maxatts) { 3712 const xmlChar **n; 3713 3714 maxatts *= 2; 3715 n = (const xmlChar **) xmlRealloc((void *) atts, 3716 maxatts * sizeof(const xmlChar *)); 3717 if (n == NULL) { 3718 htmlErrMemory(ctxt, NULL); 3719 if (attvalue != NULL) 3720 xmlFree(attvalue); 3721 goto failed; 3722 } 3723 atts = n; 3724 ctxt->atts = atts; 3725 ctxt->maxatts = maxatts; 3726 } 3727 atts[nbatts++] = attname; 3728 atts[nbatts++] = attvalue; 3729 atts[nbatts] = NULL; 3730 atts[nbatts + 1] = NULL; 3731 } 3732 else { 3733 if (attvalue != NULL) 3734 xmlFree(attvalue); 3735 /* Dump the bogus attribute string up to the next blank or 3736 * the end of the tag. 
*/ 3737 while ((IS_CHAR_CH(CUR)) && 3738 !(IS_BLANK_CH(CUR)) && (CUR != '>') && 3739 ((CUR != '/') || (NXT(1) != '>'))) 3740 NEXT; 3741 } 3742 3743 failed: 3744 SKIP_BLANKS; 3745 if (cons == ctxt->nbChars) { 3746 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3747 "htmlParseStartTag: problem parsing attributes\n", 3748 NULL, NULL); 3749 break; 3750 } 3751 } 3752 3753 /* 3754 * Handle specific association to the META tag 3755 */ 3756 if (meta && (nbatts != 0)) 3757 htmlCheckMeta(ctxt, atts); 3758 3759 /* 3760 * SAX: Start of Element ! 3761 */ 3762 if (!discardtag) { 3763 htmlnamePush(ctxt, name); 3764 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) { 3765 if (nbatts != 0) 3766 ctxt->sax->startElement(ctxt->userData, name, atts); 3767 else 3768 ctxt->sax->startElement(ctxt->userData, name, NULL); 3769 } 3770 } 3771 3772 if (atts != NULL) { 3773 for (i = 1;i < nbatts;i += 2) { 3774 if (atts[i] != NULL) 3775 xmlFree((xmlChar *) atts[i]); 3776 } 3777 } 3778 3779 return(discardtag); 3780 } 3781 3782 /** 3783 * htmlParseEndTag: 3784 * @ctxt: an HTML parser context 3785 * 3786 * parse an end of tag 3787 * 3788 * [42] ETag ::= '</' Name S? '>' 3789 * 3790 * With namespace 3791 * 3792 * [NS 9] ETag ::= '</' QName S? '>' 3793 * 3794 * Returns 1 if the current level should be closed. 3795 */ 3796 3797 static int 3798 htmlParseEndTag(htmlParserCtxtPtr ctxt) 3799 { 3800 const xmlChar *name; 3801 const xmlChar *oldname; 3802 int i, ret; 3803 3804 if ((CUR != '<') || (NXT(1) != '/')) { 3805 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED, 3806 "htmlParseEndTag: '</' not found\n", NULL, NULL); 3807 return (0); 3808 } 3809 SKIP(2); 3810 3811 name = htmlParseHTMLName(ctxt); 3812 if (name == NULL) 3813 return (0); 3814 /* 3815 * We should definitely be at the ending "S? 
'>'" part 3816 */ 3817 SKIP_BLANKS; 3818 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) { 3819 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 3820 "End tag : expected '>'\n", NULL, NULL); 3821 if (ctxt->recovery) { 3822 /* 3823 * We're not at the ending > !! 3824 * Error, unless in recover mode where we search forwards 3825 * until we find a > 3826 */ 3827 while (CUR != '\0' && CUR != '>') NEXT; 3828 NEXT; 3829 } 3830 } else 3831 NEXT; 3832 3833 /* 3834 * if we ignored misplaced tags in htmlParseStartTag don't pop them 3835 * out now. 3836 */ 3837 if ((ctxt->depth > 0) && 3838 (xmlStrEqual(name, BAD_CAST "html") || 3839 xmlStrEqual(name, BAD_CAST "body") || 3840 xmlStrEqual(name, BAD_CAST "head"))) { 3841 ctxt->depth--; 3842 return (0); 3843 } 3844 3845 /* 3846 * If the name read is not one of the element in the parsing stack 3847 * then return, it's just an error. 3848 */ 3849 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 3850 if (xmlStrEqual(name, ctxt->nameTab[i])) 3851 break; 3852 } 3853 if (i < 0) { 3854 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 3855 "Unexpected end tag : %s\n", name, NULL); 3856 return (0); 3857 } 3858 3859 3860 /* 3861 * Check for auto-closure of HTML elements. 3862 */ 3863 3864 htmlAutoCloseOnClose(ctxt, name); 3865 3866 /* 3867 * Well formedness constraints, opening and closing must match. 3868 * With the exception that the autoclose may have popped stuff out 3869 * of the stack. 
3870 */ 3871 if (!xmlStrEqual(name, ctxt->name)) { 3872 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) { 3873 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 3874 "Opening and ending tag mismatch: %s and %s\n", 3875 name, ctxt->name); 3876 } 3877 } 3878 3879 /* 3880 * SAX: End of Tag 3881 */ 3882 oldname = ctxt->name; 3883 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) { 3884 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 3885 ctxt->sax->endElement(ctxt->userData, name); 3886 htmlnamePop(ctxt); 3887 ret = 1; 3888 } else { 3889 ret = 0; 3890 } 3891 3892 return (ret); 3893 } 3894 3895 3896 /** 3897 * htmlParseReference: 3898 * @ctxt: an HTML parser context 3899 * 3900 * parse and handle entity references in content, 3901 * this will end-up in a call to character() since this is either a 3902 * CharRef, or a predefined entity. 3903 */ 3904 static void 3905 htmlParseReference(htmlParserCtxtPtr ctxt) { 3906 const htmlEntityDesc * ent; 3907 xmlChar out[6]; 3908 const xmlChar *name; 3909 if (CUR != '&') return; 3910 3911 if (NXT(1) == '#') { 3912 unsigned int c; 3913 int bits, i = 0; 3914 3915 c = htmlParseCharRef(ctxt); 3916 if (c == 0) 3917 return; 3918 3919 if (c < 0x80) { out[i++]= c; bits= -6; } 3920 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } 3921 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } 3922 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } 3923 3924 for ( ; bits >= 0; bits-= 6) { 3925 out[i++]= ((c >> bits) & 0x3F) | 0x80; 3926 } 3927 out[i] = 0; 3928 3929 htmlCheckParagraph(ctxt); 3930 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 3931 ctxt->sax->characters(ctxt->userData, out, i); 3932 } else { 3933 ent = htmlParseEntityRef(ctxt, &name); 3934 if (name == NULL) { 3935 htmlCheckParagraph(ctxt); 3936 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 3937 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); 3938 return; 3939 } 3940 if ((ent == 
NULL) || !(ent->value > 0)) { 3941 htmlCheckParagraph(ctxt); 3942 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) { 3943 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); 3944 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name)); 3945 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */ 3946 } 3947 } else { 3948 unsigned int c; 3949 int bits, i = 0; 3950 3951 c = ent->value; 3952 if (c < 0x80) 3953 { out[i++]= c; bits= -6; } 3954 else if (c < 0x800) 3955 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } 3956 else if (c < 0x10000) 3957 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } 3958 else 3959 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } 3960 3961 for ( ; bits >= 0; bits-= 6) { 3962 out[i++]= ((c >> bits) & 0x3F) | 0x80; 3963 } 3964 out[i] = 0; 3965 3966 htmlCheckParagraph(ctxt); 3967 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 3968 ctxt->sax->characters(ctxt->userData, out, i); 3969 } 3970 } 3971 } 3972 3973 /** 3974 * htmlParseContent: 3975 * @ctxt: an HTML parser context 3976 * 3977 * Parse a content: comment, sub-element, reference or text. 3978 * Kept for compatibility with old code 3979 */ 3980 3981 static void 3982 htmlParseContent(htmlParserCtxtPtr ctxt) { 3983 xmlChar *currentNode; 3984 int depth; 3985 const xmlChar *name; 3986 3987 currentNode = xmlStrdup(ctxt->name); 3988 depth = ctxt->nameNr; 3989 while (1) { 3990 long cons = ctxt->nbChars; 3991 3992 GROW; 3993 3994 if (ctxt->instate == XML_PARSER_EOF) 3995 break; 3996 3997 /* 3998 * Our tag or one of it's parent or children is ending. 
3999 */ 4000 if ((CUR == '<') && (NXT(1) == '/')) { 4001 if (htmlParseEndTag(ctxt) && 4002 ((currentNode != NULL) || (ctxt->nameNr == 0))) { 4003 if (currentNode != NULL) 4004 xmlFree(currentNode); 4005 return; 4006 } 4007 continue; /* while */ 4008 } 4009 4010 else if ((CUR == '<') && 4011 ((IS_ASCII_LETTER(NXT(1))) || 4012 (NXT(1) == '_') || (NXT(1) == ':'))) { 4013 name = htmlParseHTMLName_nonInvasive(ctxt); 4014 if (name == NULL) { 4015 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 4016 "htmlParseStartTag: invalid element name\n", 4017 NULL, NULL); 4018 /* Dump the bogus tag like browsers do */ 4019 while ((IS_CHAR_CH(CUR)) && (CUR != '>')) 4020 NEXT; 4021 4022 if (currentNode != NULL) 4023 xmlFree(currentNode); 4024 return; 4025 } 4026 4027 if (ctxt->name != NULL) { 4028 if (htmlCheckAutoClose(name, ctxt->name) == 1) { 4029 htmlAutoClose(ctxt, name); 4030 continue; 4031 } 4032 } 4033 } 4034 4035 /* 4036 * Has this node been popped out during parsing of 4037 * the next element 4038 */ 4039 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) && 4040 (!xmlStrEqual(currentNode, ctxt->name))) 4041 { 4042 if (currentNode != NULL) xmlFree(currentNode); 4043 return; 4044 } 4045 4046 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) || 4047 (xmlStrEqual(currentNode, BAD_CAST"style")))) { 4048 /* 4049 * Handle SCRIPT/STYLE separately 4050 */ 4051 htmlParseScript(ctxt); 4052 } else { 4053 /* 4054 * Sometimes DOCTYPE arrives in the middle of the document 4055 */ 4056 if ((CUR == '<') && (NXT(1) == '!') && 4057 (UPP(2) == 'D') && (UPP(3) == 'O') && 4058 (UPP(4) == 'C') && (UPP(5) == 'T') && 4059 (UPP(6) == 'Y') && (UPP(7) == 'P') && 4060 (UPP(8) == 'E')) { 4061 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 4062 "Misplaced DOCTYPE declaration\n", 4063 BAD_CAST "DOCTYPE" , NULL); 4064 htmlParseDocTypeDecl(ctxt); 4065 } 4066 4067 /* 4068 * First case : a comment 4069 */ 4070 if ((CUR == '<') && (NXT(1) == '!') && 4071 (NXT(2) == '-') && (NXT(3) == '-')) { 4072 
htmlParseComment(ctxt); 4073 } 4074 4075 /* 4076 * Second case : a Processing Instruction. 4077 */ 4078 else if ((CUR == '<') && (NXT(1) == '?')) { 4079 htmlParsePI(ctxt); 4080 } 4081 4082 /* 4083 * Third case : a sub-element. 4084 */ 4085 else if (CUR == '<') { 4086 htmlParseElement(ctxt); 4087 } 4088 4089 /* 4090 * Fourth case : a reference. If if has not been resolved, 4091 * parsing returns it's Name, create the node 4092 */ 4093 else if (CUR == '&') { 4094 htmlParseReference(ctxt); 4095 } 4096 4097 /* 4098 * Fifth case : end of the resource 4099 */ 4100 else if (CUR == 0) { 4101 htmlAutoCloseOnEnd(ctxt); 4102 break; 4103 } 4104 4105 /* 4106 * Last case, text. Note that References are handled directly. 4107 */ 4108 else { 4109 htmlParseCharData(ctxt); 4110 } 4111 4112 if (cons == ctxt->nbChars) { 4113 if (ctxt->node != NULL) { 4114 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4115 "detected an error in element content\n", 4116 NULL, NULL); 4117 } 4118 break; 4119 } 4120 } 4121 GROW; 4122 } 4123 if (currentNode != NULL) xmlFree(currentNode); 4124 } 4125 4126 /** 4127 * htmlParseElement: 4128 * @ctxt: an HTML parser context 4129 * 4130 * parse an HTML element, this is highly recursive 4131 * this is kept for compatibility with previous code versions 4132 * 4133 * [39] element ::= EmptyElemTag | STag content ETag 4134 * 4135 * [41] Attribute ::= Name Eq AttValue 4136 */ 4137 4138 void 4139 htmlParseElement(htmlParserCtxtPtr ctxt) { 4140 const xmlChar *name; 4141 xmlChar *currentNode = NULL; 4142 const htmlElemDesc * info; 4143 htmlParserNodeInfo node_info; 4144 int failed; 4145 int depth; 4146 const xmlChar *oldptr; 4147 4148 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4149 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4150 "htmlParseElement: context error\n", NULL, NULL); 4151 return; 4152 } 4153 4154 if (ctxt->instate == XML_PARSER_EOF) 4155 return; 4156 4157 /* Capture start position */ 4158 if (ctxt->record_info) { 4159 node_info.begin_pos = 
ctxt->input->consumed + 4160 (CUR_PTR - ctxt->input->base); 4161 node_info.begin_line = ctxt->input->line; 4162 } 4163 4164 failed = htmlParseStartTag(ctxt); 4165 name = ctxt->name; 4166 if ((failed == -1) || (name == NULL)) { 4167 if (CUR == '>') 4168 NEXT; 4169 return; 4170 } 4171 4172 /* 4173 * Lookup the info for that element. 4174 */ 4175 info = htmlTagLookup(name); 4176 if (info == NULL) { 4177 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 4178 "Tag %s invalid\n", name, NULL); 4179 } 4180 4181 /* 4182 * Check for an Empty Element labeled the XML/SGML way 4183 */ 4184 if ((CUR == '/') && (NXT(1) == '>')) { 4185 SKIP(2); 4186 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4187 ctxt->sax->endElement(ctxt->userData, name); 4188 htmlnamePop(ctxt); 4189 return; 4190 } 4191 4192 if (CUR == '>') { 4193 NEXT; 4194 } else { 4195 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 4196 "Couldn't find end of Start Tag %s\n", name, NULL); 4197 4198 /* 4199 * end of parsing of this node. 4200 */ 4201 if (xmlStrEqual(name, ctxt->name)) { 4202 nodePop(ctxt); 4203 htmlnamePop(ctxt); 4204 } 4205 4206 /* 4207 * Capture end position and add node 4208 */ 4209 if (ctxt->record_info) { 4210 node_info.end_pos = ctxt->input->consumed + 4211 (CUR_PTR - ctxt->input->base); 4212 node_info.end_line = ctxt->input->line; 4213 node_info.node = ctxt->node; 4214 xmlParserAddNodeInfo(ctxt, &node_info); 4215 } 4216 return; 4217 } 4218 4219 /* 4220 * Check for an Empty Element from DTD definition 4221 */ 4222 if ((info != NULL) && (info->empty)) { 4223 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4224 ctxt->sax->endElement(ctxt->userData, name); 4225 htmlnamePop(ctxt); 4226 return; 4227 } 4228 4229 /* 4230 * Parse the content of the element: 4231 */ 4232 currentNode = xmlStrdup(ctxt->name); 4233 depth = ctxt->nameNr; 4234 while (IS_CHAR_CH(CUR)) { 4235 oldptr = ctxt->input->cur; 4236 htmlParseContent(ctxt); 4237 if (oldptr==ctxt->input->cur) break; 4238 if (ctxt->nameNr < depth) 
break; 4239 } 4240 4241 /* 4242 * Capture end position and add node 4243 */ 4244 if ( currentNode != NULL && ctxt->record_info ) { 4245 node_info.end_pos = ctxt->input->consumed + 4246 (CUR_PTR - ctxt->input->base); 4247 node_info.end_line = ctxt->input->line; 4248 node_info.node = ctxt->node; 4249 xmlParserAddNodeInfo(ctxt, &node_info); 4250 } 4251 if (!IS_CHAR_CH(CUR)) { 4252 htmlAutoCloseOnEnd(ctxt); 4253 } 4254 4255 if (currentNode != NULL) 4256 xmlFree(currentNode); 4257 } 4258 4259 static void 4260 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) { 4261 /* 4262 * Capture end position and add node 4263 */ 4264 if ( ctxt->node != NULL && ctxt->record_info ) { 4265 ctxt->nodeInfo->end_pos = ctxt->input->consumed + 4266 (CUR_PTR - ctxt->input->base); 4267 ctxt->nodeInfo->end_line = ctxt->input->line; 4268 ctxt->nodeInfo->node = ctxt->node; 4269 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo); 4270 htmlNodeInfoPop(ctxt); 4271 } 4272 if (!IS_CHAR_CH(CUR)) { 4273 htmlAutoCloseOnEnd(ctxt); 4274 } 4275 } 4276 4277 /** 4278 * htmlParseElementInternal: 4279 * @ctxt: an HTML parser context 4280 * 4281 * parse an HTML element, new version, non recursive 4282 * 4283 * [39] element ::= EmptyElemTag | STag content ETag 4284 * 4285 * [41] Attribute ::= Name Eq AttValue 4286 */ 4287 4288 static void 4289 htmlParseElementInternal(htmlParserCtxtPtr ctxt) { 4290 const xmlChar *name; 4291 const htmlElemDesc * info; 4292 htmlParserNodeInfo node_info; 4293 int failed; 4294 4295 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4296 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4297 "htmlParseElementInternal: context error\n", NULL, NULL); 4298 return; 4299 } 4300 4301 if (ctxt->instate == XML_PARSER_EOF) 4302 return; 4303 4304 /* Capture start position */ 4305 if (ctxt->record_info) { 4306 node_info.begin_pos = ctxt->input->consumed + 4307 (CUR_PTR - ctxt->input->base); 4308 node_info.begin_line = ctxt->input->line; 4309 } 4310 4311 failed = htmlParseStartTag(ctxt); 4312 name = 
ctxt->name; 4313 if ((failed == -1) || (name == NULL)) { 4314 if (CUR == '>') 4315 NEXT; 4316 return; 4317 } 4318 4319 /* 4320 * Lookup the info for that element. 4321 */ 4322 info = htmlTagLookup(name); 4323 if (info == NULL) { 4324 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 4325 "Tag %s invalid\n", name, NULL); 4326 } 4327 4328 /* 4329 * Check for an Empty Element labeled the XML/SGML way 4330 */ 4331 if ((CUR == '/') && (NXT(1) == '>')) { 4332 SKIP(2); 4333 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4334 ctxt->sax->endElement(ctxt->userData, name); 4335 htmlnamePop(ctxt); 4336 return; 4337 } 4338 4339 if (CUR == '>') { 4340 NEXT; 4341 } else { 4342 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 4343 "Couldn't find end of Start Tag %s\n", name, NULL); 4344 4345 /* 4346 * end of parsing of this node. 4347 */ 4348 if (xmlStrEqual(name, ctxt->name)) { 4349 nodePop(ctxt); 4350 htmlnamePop(ctxt); 4351 } 4352 4353 if (ctxt->record_info) 4354 htmlNodeInfoPush(ctxt, &node_info); 4355 htmlParserFinishElementParsing(ctxt); 4356 return; 4357 } 4358 4359 /* 4360 * Check for an Empty Element from DTD definition 4361 */ 4362 if ((info != NULL) && (info->empty)) { 4363 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4364 ctxt->sax->endElement(ctxt->userData, name); 4365 htmlnamePop(ctxt); 4366 return; 4367 } 4368 4369 if (ctxt->record_info) 4370 htmlNodeInfoPush(ctxt, &node_info); 4371 } 4372 4373 /** 4374 * htmlParseContentInternal: 4375 * @ctxt: an HTML parser context 4376 * 4377 * Parse a content: comment, sub-element, reference or text. 
4378 * New version for non recursive htmlParseElementInternal 4379 */ 4380 4381 static void 4382 htmlParseContentInternal(htmlParserCtxtPtr ctxt) { 4383 xmlChar *currentNode; 4384 int depth; 4385 const xmlChar *name; 4386 4387 currentNode = xmlStrdup(ctxt->name); 4388 depth = ctxt->nameNr; 4389 while (1) { 4390 long cons = ctxt->nbChars; 4391 4392 GROW; 4393 4394 if (ctxt->instate == XML_PARSER_EOF) 4395 break; 4396 4397 /* 4398 * Our tag or one of it's parent or children is ending. 4399 */ 4400 if ((CUR == '<') && (NXT(1) == '/')) { 4401 if (htmlParseEndTag(ctxt) && 4402 ((currentNode != NULL) || (ctxt->nameNr == 0))) { 4403 if (currentNode != NULL) 4404 xmlFree(currentNode); 4405 4406 currentNode = xmlStrdup(ctxt->name); 4407 depth = ctxt->nameNr; 4408 } 4409 continue; /* while */ 4410 } 4411 4412 else if ((CUR == '<') && 4413 ((IS_ASCII_LETTER(NXT(1))) || 4414 (NXT(1) == '_') || (NXT(1) == ':'))) { 4415 name = htmlParseHTMLName_nonInvasive(ctxt); 4416 if (name == NULL) { 4417 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 4418 "htmlParseStartTag: invalid element name\n", 4419 NULL, NULL); 4420 /* Dump the bogus tag like browsers do */ 4421 while ((IS_CHAR_CH(CUR)) && (CUR != '>')) 4422 NEXT; 4423 4424 htmlParserFinishElementParsing(ctxt); 4425 if (currentNode != NULL) 4426 xmlFree(currentNode); 4427 4428 currentNode = xmlStrdup(ctxt->name); 4429 depth = ctxt->nameNr; 4430 continue; 4431 } 4432 4433 if (ctxt->name != NULL) { 4434 if (htmlCheckAutoClose(name, ctxt->name) == 1) { 4435 htmlAutoClose(ctxt, name); 4436 continue; 4437 } 4438 } 4439 } 4440 4441 /* 4442 * Has this node been popped out during parsing of 4443 * the next element 4444 */ 4445 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) && 4446 (!xmlStrEqual(currentNode, ctxt->name))) 4447 { 4448 htmlParserFinishElementParsing(ctxt); 4449 if (currentNode != NULL) xmlFree(currentNode); 4450 4451 currentNode = xmlStrdup(ctxt->name); 4452 depth = ctxt->nameNr; 4453 continue; 4454 } 4455 4456 if ((CUR != 0) && 
((xmlStrEqual(currentNode, BAD_CAST"script")) || 4457 (xmlStrEqual(currentNode, BAD_CAST"style")))) { 4458 /* 4459 * Handle SCRIPT/STYLE separately 4460 */ 4461 htmlParseScript(ctxt); 4462 } else { 4463 /* 4464 * Sometimes DOCTYPE arrives in the middle of the document 4465 */ 4466 if ((CUR == '<') && (NXT(1) == '!') && 4467 (UPP(2) == 'D') && (UPP(3) == 'O') && 4468 (UPP(4) == 'C') && (UPP(5) == 'T') && 4469 (UPP(6) == 'Y') && (UPP(7) == 'P') && 4470 (UPP(8) == 'E')) { 4471 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 4472 "Misplaced DOCTYPE declaration\n", 4473 BAD_CAST "DOCTYPE" , NULL); 4474 htmlParseDocTypeDecl(ctxt); 4475 } 4476 4477 /* 4478 * First case : a comment 4479 */ 4480 if ((CUR == '<') && (NXT(1) == '!') && 4481 (NXT(2) == '-') && (NXT(3) == '-')) { 4482 htmlParseComment(ctxt); 4483 } 4484 4485 /* 4486 * Second case : a Processing Instruction. 4487 */ 4488 else if ((CUR == '<') && (NXT(1) == '?')) { 4489 htmlParsePI(ctxt); 4490 } 4491 4492 /* 4493 * Third case : a sub-element. 4494 */ 4495 else if (CUR == '<') { 4496 htmlParseElementInternal(ctxt); 4497 if (currentNode != NULL) xmlFree(currentNode); 4498 4499 currentNode = xmlStrdup(ctxt->name); 4500 depth = ctxt->nameNr; 4501 } 4502 4503 /* 4504 * Fourth case : a reference. If if has not been resolved, 4505 * parsing returns it's Name, create the node 4506 */ 4507 else if (CUR == '&') { 4508 htmlParseReference(ctxt); 4509 } 4510 4511 /* 4512 * Fifth case : end of the resource 4513 */ 4514 else if (CUR == 0) { 4515 htmlAutoCloseOnEnd(ctxt); 4516 break; 4517 } 4518 4519 /* 4520 * Last case, text. Note that References are handled directly. 
4521 */ 4522 else { 4523 htmlParseCharData(ctxt); 4524 } 4525 4526 if (cons == ctxt->nbChars) { 4527 if (ctxt->node != NULL) { 4528 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4529 "detected an error in element content\n", 4530 NULL, NULL); 4531 } 4532 break; 4533 } 4534 } 4535 GROW; 4536 } 4537 if (currentNode != NULL) xmlFree(currentNode); 4538 } 4539 4540 /** 4541 * htmlParseContent: 4542 * @ctxt: an HTML parser context 4543 * 4544 * Parse a content: comment, sub-element, reference or text. 4545 * This is the entry point when called from parser.c 4546 */ 4547 4548 void 4549 __htmlParseContent(void *ctxt) { 4550 if (ctxt != NULL) 4551 htmlParseContentInternal((htmlParserCtxtPtr) ctxt); 4552 } 4553 4554 /** 4555 * htmlParseDocument: 4556 * @ctxt: an HTML parser context 4557 * 4558 * parse an HTML document (and build a tree if using the standard SAX 4559 * interface). 4560 * 4561 * Returns 0, -1 in case of error. the parser context is augmented 4562 * as a result of the parsing. 4563 */ 4564 4565 int 4566 htmlParseDocument(htmlParserCtxtPtr ctxt) { 4567 xmlChar start[4]; 4568 xmlCharEncoding enc; 4569 xmlDtdPtr dtd; 4570 4571 xmlInitParser(); 4572 4573 htmlDefaultSAXHandlerInit(); 4574 4575 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4576 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4577 "htmlParseDocument: context error\n", NULL, NULL); 4578 return(XML_ERR_INTERNAL_ERROR); 4579 } 4580 ctxt->html = 1; 4581 ctxt->linenumbers = 1; 4582 GROW; 4583 /* 4584 * SAX: beginning of the document processing. 4585 */ 4586 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) 4587 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator); 4588 4589 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) && 4590 ((ctxt->input->end - ctxt->input->cur) >= 4)) { 4591 /* 4592 * Get the 4 first bytes and decode the charset 4593 * if enc != XML_CHAR_ENCODING_NONE 4594 * plug some encoding conversion routines. 
4595 */ 4596 start[0] = RAW; 4597 start[1] = NXT(1); 4598 start[2] = NXT(2); 4599 start[3] = NXT(3); 4600 enc = xmlDetectCharEncoding(&start[0], 4); 4601 if (enc != XML_CHAR_ENCODING_NONE) { 4602 xmlSwitchEncoding(ctxt, enc); 4603 } 4604 } 4605 4606 /* 4607 * Wipe out everything which is before the first '<' 4608 */ 4609 SKIP_BLANKS; 4610 if (CUR == 0) { 4611 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY, 4612 "Document is empty\n", NULL, NULL); 4613 } 4614 4615 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX)) 4616 ctxt->sax->startDocument(ctxt->userData); 4617 4618 4619 /* 4620 * Parse possible comments and PIs before any content 4621 */ 4622 while (((CUR == '<') && (NXT(1) == '!') && 4623 (NXT(2) == '-') && (NXT(3) == '-')) || 4624 ((CUR == '<') && (NXT(1) == '?'))) { 4625 htmlParseComment(ctxt); 4626 htmlParsePI(ctxt); 4627 SKIP_BLANKS; 4628 } 4629 4630 4631 /* 4632 * Then possibly doc type declaration(s) and more Misc 4633 * (doctypedecl Misc*)? 4634 */ 4635 if ((CUR == '<') && (NXT(1) == '!') && 4636 (UPP(2) == 'D') && (UPP(3) == 'O') && 4637 (UPP(4) == 'C') && (UPP(5) == 'T') && 4638 (UPP(6) == 'Y') && (UPP(7) == 'P') && 4639 (UPP(8) == 'E')) { 4640 htmlParseDocTypeDecl(ctxt); 4641 } 4642 SKIP_BLANKS; 4643 4644 /* 4645 * Parse possible comments and PIs before any content 4646 */ 4647 while (((CUR == '<') && (NXT(1) == '!') && 4648 (NXT(2) == '-') && (NXT(3) == '-')) || 4649 ((CUR == '<') && (NXT(1) == '?'))) { 4650 htmlParseComment(ctxt); 4651 htmlParsePI(ctxt); 4652 SKIP_BLANKS; 4653 } 4654 4655 /* 4656 * Time to start parsing the tree itself 4657 */ 4658 htmlParseContentInternal(ctxt); 4659 4660 /* 4661 * autoclose 4662 */ 4663 if (CUR == 0) 4664 htmlAutoCloseOnEnd(ctxt); 4665 4666 4667 /* 4668 * SAX: end of the document processing. 
4669 */ 4670 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 4671 ctxt->sax->endDocument(ctxt->userData); 4672 4673 if (ctxt->myDoc != NULL) { 4674 dtd = xmlGetIntSubset(ctxt->myDoc); 4675 if (dtd == NULL) 4676 ctxt->myDoc->intSubset = 4677 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 4678 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", 4679 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); 4680 } 4681 if (! ctxt->wellFormed) return(-1); 4682 return(0); 4683 } 4684 4685 4686 /************************************************************************ 4687 * * 4688 * Parser contexts handling * 4689 * * 4690 ************************************************************************/ 4691 4692 /** 4693 * htmlInitParserCtxt: 4694 * @ctxt: an HTML parser context 4695 * 4696 * Initialize a parser context 4697 * 4698 * Returns 0 in case of success and -1 in case of error 4699 */ 4700 4701 static int 4702 htmlInitParserCtxt(htmlParserCtxtPtr ctxt) 4703 { 4704 htmlSAXHandler *sax; 4705 4706 if (ctxt == NULL) return(-1); 4707 memset(ctxt, 0, sizeof(htmlParserCtxt)); 4708 4709 ctxt->dict = xmlDictCreate(); 4710 if (ctxt->dict == NULL) { 4711 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4712 return(-1); 4713 } 4714 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler)); 4715 if (sax == NULL) { 4716 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4717 return(-1); 4718 } 4719 else 4720 memset(sax, 0, sizeof(htmlSAXHandler)); 4721 4722 /* Allocate the Input stack */ 4723 ctxt->inputTab = (htmlParserInputPtr *) 4724 xmlMalloc(5 * sizeof(htmlParserInputPtr)); 4725 if (ctxt->inputTab == NULL) { 4726 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4727 ctxt->inputNr = 0; 4728 ctxt->inputMax = 0; 4729 ctxt->input = NULL; 4730 return(-1); 4731 } 4732 ctxt->inputNr = 0; 4733 ctxt->inputMax = 5; 4734 ctxt->input = NULL; 4735 ctxt->version = NULL; 4736 ctxt->encoding = NULL; 4737 ctxt->standalone = -1; 4738 ctxt->instate = 
XML_PARSER_START; 4739 4740 /* Allocate the Node stack */ 4741 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr)); 4742 if (ctxt->nodeTab == NULL) { 4743 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4744 ctxt->nodeNr = 0; 4745 ctxt->nodeMax = 0; 4746 ctxt->node = NULL; 4747 ctxt->inputNr = 0; 4748 ctxt->inputMax = 0; 4749 ctxt->input = NULL; 4750 return(-1); 4751 } 4752 ctxt->nodeNr = 0; 4753 ctxt->nodeMax = 10; 4754 ctxt->node = NULL; 4755 4756 /* Allocate the Name stack */ 4757 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *)); 4758 if (ctxt->nameTab == NULL) { 4759 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4760 ctxt->nameNr = 0; 4761 ctxt->nameMax = 0; 4762 ctxt->name = NULL; 4763 ctxt->nodeNr = 0; 4764 ctxt->nodeMax = 0; 4765 ctxt->node = NULL; 4766 ctxt->inputNr = 0; 4767 ctxt->inputMax = 0; 4768 ctxt->input = NULL; 4769 return(-1); 4770 } 4771 ctxt->nameNr = 0; 4772 ctxt->nameMax = 10; 4773 ctxt->name = NULL; 4774 4775 ctxt->nodeInfoTab = NULL; 4776 ctxt->nodeInfoNr = 0; 4777 ctxt->nodeInfoMax = 0; 4778 4779 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler; 4780 else { 4781 ctxt->sax = sax; 4782 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 4783 } 4784 ctxt->userData = ctxt; 4785 ctxt->myDoc = NULL; 4786 ctxt->wellFormed = 1; 4787 ctxt->replaceEntities = 0; 4788 ctxt->linenumbers = xmlLineNumbersDefaultValue; 4789 ctxt->html = 1; 4790 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0; 4791 ctxt->vctxt.userData = ctxt; 4792 ctxt->vctxt.error = xmlParserValidityError; 4793 ctxt->vctxt.warning = xmlParserValidityWarning; 4794 ctxt->record_info = 0; 4795 ctxt->validate = 0; 4796 ctxt->nbChars = 0; 4797 ctxt->checkIndex = 0; 4798 ctxt->catalogs = NULL; 4799 xmlInitNodeInfoSeq(&ctxt->node_seq); 4800 return(0); 4801 } 4802 4803 /** 4804 * htmlFreeParserCtxt: 4805 * @ctxt: an HTML parser context 4806 * 4807 * Free all the memory used by a parser context. 
However the parsed 4808 * document in ctxt->myDoc is not freed. 4809 */ 4810 4811 void 4812 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt) 4813 { 4814 xmlFreeParserCtxt(ctxt); 4815 } 4816 4817 /** 4818 * htmlNewParserCtxt: 4819 * 4820 * Allocate and initialize a new parser context. 4821 * 4822 * Returns the htmlParserCtxtPtr or NULL in case of allocation error 4823 */ 4824 4825 htmlParserCtxtPtr 4826 htmlNewParserCtxt(void) 4827 { 4828 xmlParserCtxtPtr ctxt; 4829 4830 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt)); 4831 if (ctxt == NULL) { 4832 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n"); 4833 return(NULL); 4834 } 4835 memset(ctxt, 0, sizeof(xmlParserCtxt)); 4836 if (htmlInitParserCtxt(ctxt) < 0) { 4837 htmlFreeParserCtxt(ctxt); 4838 return(NULL); 4839 } 4840 return(ctxt); 4841 } 4842 4843 /** 4844 * htmlCreateMemoryParserCtxt: 4845 * @buffer: a pointer to a char array 4846 * @size: the size of the array 4847 * 4848 * Create a parser context for an HTML in-memory document. 
4849 * 4850 * Returns the new parser context or NULL 4851 */ 4852 htmlParserCtxtPtr 4853 htmlCreateMemoryParserCtxt(const char *buffer, int size) { 4854 xmlParserCtxtPtr ctxt; 4855 xmlParserInputPtr input; 4856 xmlParserInputBufferPtr buf; 4857 4858 if (buffer == NULL) 4859 return(NULL); 4860 if (size <= 0) 4861 return(NULL); 4862 4863 ctxt = htmlNewParserCtxt(); 4864 if (ctxt == NULL) 4865 return(NULL); 4866 4867 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); 4868 if (buf == NULL) return(NULL); 4869 4870 input = xmlNewInputStream(ctxt); 4871 if (input == NULL) { 4872 xmlFreeParserCtxt(ctxt); 4873 return(NULL); 4874 } 4875 4876 input->filename = NULL; 4877 input->buf = buf; 4878 input->base = input->buf->buffer->content; 4879 input->cur = input->buf->buffer->content; 4880 input->end = &input->buf->buffer->content[input->buf->buffer->use]; 4881 4882 inputPush(ctxt, input); 4883 return(ctxt); 4884 } 4885 4886 /** 4887 * htmlCreateDocParserCtxt: 4888 * @cur: a pointer to an array of xmlChar 4889 * @encoding: a free form C string describing the HTML document encoding, or NULL 4890 * 4891 * Create a parser context for an HTML document. 
4892 * 4893 * TODO: check the need to add encoding handling there 4894 * 4895 * Returns the new parser context or NULL 4896 */ 4897 static htmlParserCtxtPtr 4898 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) { 4899 int len; 4900 htmlParserCtxtPtr ctxt; 4901 4902 if (cur == NULL) 4903 return(NULL); 4904 len = xmlStrlen(cur); 4905 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len); 4906 if (ctxt == NULL) 4907 return(NULL); 4908 4909 if (encoding != NULL) { 4910 xmlCharEncoding enc; 4911 xmlCharEncodingHandlerPtr handler; 4912 4913 if (ctxt->input->encoding != NULL) 4914 xmlFree((xmlChar *) ctxt->input->encoding); 4915 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding); 4916 4917 enc = xmlParseCharEncoding(encoding); 4918 /* 4919 * registered set of known encodings 4920 */ 4921 if (enc != XML_CHAR_ENCODING_ERROR) { 4922 xmlSwitchEncoding(ctxt, enc); 4923 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { 4924 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 4925 "Unsupported encoding %s\n", 4926 (const xmlChar *) encoding, NULL); 4927 } 4928 } else { 4929 /* 4930 * fallback for unknown encodings 4931 */ 4932 handler = xmlFindCharEncodingHandler((const char *) encoding); 4933 if (handler != NULL) { 4934 xmlSwitchToEncoding(ctxt, handler); 4935 } else { 4936 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 4937 "Unsupported encoding %s\n", 4938 (const xmlChar *) encoding, NULL); 4939 } 4940 } 4941 } 4942 return(ctxt); 4943 } 4944 4945 #ifdef LIBXML_PUSH_ENABLED 4946 /************************************************************************ 4947 * * 4948 * Progressive parsing interfaces * 4949 * * 4950 ************************************************************************/ 4951 4952 /** 4953 * htmlParseLookupSequence: 4954 * @ctxt: an HTML parser context 4955 * @first: the first char to lookup 4956 * @next: the next char to lookup or zero 4957 * @third: the next char to lookup or zero 4958 * @comment: flag to force checking inside 
comments 4959 * 4960 * Try to find if a sequence (first, next, third) or just (first next) or 4961 * (first) is available in the input stream. 4962 * This function has a side effect of (possibly) incrementing ctxt->checkIndex 4963 * to avoid rescanning sequences of bytes, it DOES change the state of the 4964 * parser, do not use liberally. 4965 * This is basically similar to xmlParseLookupSequence() 4966 * 4967 * Returns the index to the current parsing point if the full sequence 4968 * is available, -1 otherwise. 4969 */ 4970 static int 4971 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, 4972 xmlChar next, xmlChar third, int iscomment, 4973 int ignoreattrval) 4974 { 4975 int base, len; 4976 htmlParserInputPtr in; 4977 const xmlChar *buf; 4978 int incomment = 0; 4979 int invalue = 0; 4980 char valdellim = 0x0; 4981 4982 in = ctxt->input; 4983 if (in == NULL) 4984 return (-1); 4985 4986 base = in->cur - in->base; 4987 if (base < 0) 4988 return (-1); 4989 4990 if (ctxt->checkIndex > base) 4991 base = ctxt->checkIndex; 4992 4993 if (in->buf == NULL) { 4994 buf = in->base; 4995 len = in->length; 4996 } else { 4997 buf = in->buf->buffer->content; 4998 len = in->buf->buffer->use; 4999 } 5000 5001 /* take into account the sequence length */ 5002 if (third) 5003 len -= 2; 5004 else if (next) 5005 len--; 5006 for (; base < len; base++) { 5007 if ((!incomment) && (base + 4 < len) && (!iscomment)) { 5008 if ((buf[base] == '<') && (buf[base + 1] == '!') && 5009 (buf[base + 2] == '-') && (buf[base + 3] == '-')) { 5010 incomment = 1; 5011 /* do not increment past <! 
- some people use <!--> */ 5012 base += 2; 5013 } 5014 } 5015 if (ignoreattrval) { 5016 if (buf[base] == '"' || buf[base] == '\'') { 5017 if (invalue) { 5018 if (buf[base] == valdellim) { 5019 invalue = 0; 5020 continue; 5021 } 5022 } else { 5023 valdellim = buf[base]; 5024 invalue = 1; 5025 continue; 5026 } 5027 } else if (invalue) { 5028 continue; 5029 } 5030 } 5031 if (incomment) { 5032 if (base + 3 > len) 5033 return (-1); 5034 if ((buf[base] == '-') && (buf[base + 1] == '-') && 5035 (buf[base + 2] == '>')) { 5036 incomment = 0; 5037 base += 2; 5038 } 5039 continue; 5040 } 5041 if (buf[base] == first) { 5042 if (third != 0) { 5043 if ((buf[base + 1] != next) || (buf[base + 2] != third)) 5044 continue; 5045 } else if (next != 0) { 5046 if (buf[base + 1] != next) 5047 continue; 5048 } 5049 ctxt->checkIndex = 0; 5050 #ifdef DEBUG_PUSH 5051 if (next == 0) 5052 xmlGenericError(xmlGenericErrorContext, 5053 "HPP: lookup '%c' found at %d\n", 5054 first, base); 5055 else if (third == 0) 5056 xmlGenericError(xmlGenericErrorContext, 5057 "HPP: lookup '%c%c' found at %d\n", 5058 first, next, base); 5059 else 5060 xmlGenericError(xmlGenericErrorContext, 5061 "HPP: lookup '%c%c%c' found at %d\n", 5062 first, next, third, base); 5063 #endif 5064 return (base - (in->cur - in->base)); 5065 } 5066 } 5067 if ((!incomment) && (!invalue)) 5068 ctxt->checkIndex = base; 5069 #ifdef DEBUG_PUSH 5070 if (next == 0) 5071 xmlGenericError(xmlGenericErrorContext, 5072 "HPP: lookup '%c' failed\n", first); 5073 else if (third == 0) 5074 xmlGenericError(xmlGenericErrorContext, 5075 "HPP: lookup '%c%c' failed\n", first, next); 5076 else 5077 xmlGenericError(xmlGenericErrorContext, 5078 "HPP: lookup '%c%c%c' failed\n", first, next, 5079 third); 5080 #endif 5081 return (-1); 5082 } 5083 5084 /** 5085 * htmlParseLookupChars: 5086 * @ctxt: an HTML parser context 5087 * @stop: Array of chars, which stop the lookup. 
5088 * @stopLen: Length of stop-Array 5089 * 5090 * Try to find if any char of the stop-Array is available in the input 5091 * stream. 5092 * This function has a side effect of (possibly) incrementing ctxt->checkIndex 5093 * to avoid rescanning sequences of bytes, it DOES change the state of the 5094 * parser, do not use liberally. 5095 * 5096 * Returns the index to the current parsing point if a stopChar 5097 * is available, -1 otherwise. 5098 */ 5099 static int 5100 htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop, 5101 int stopLen) 5102 { 5103 int base, len; 5104 htmlParserInputPtr in; 5105 const xmlChar *buf; 5106 int incomment = 0; 5107 int i; 5108 5109 in = ctxt->input; 5110 if (in == NULL) 5111 return (-1); 5112 5113 base = in->cur - in->base; 5114 if (base < 0) 5115 return (-1); 5116 5117 if (ctxt->checkIndex > base) 5118 base = ctxt->checkIndex; 5119 5120 if (in->buf == NULL) { 5121 buf = in->base; 5122 len = in->length; 5123 } else { 5124 buf = in->buf->buffer->content; 5125 len = in->buf->buffer->use; 5126 } 5127 5128 for (; base < len; base++) { 5129 if (!incomment && (base + 4 < len)) { 5130 if ((buf[base] == '<') && (buf[base + 1] == '!') && 5131 (buf[base + 2] == '-') && (buf[base + 3] == '-')) { 5132 incomment = 1; 5133 /* do not increment past <! 
- some people use <!--> */
                base += 2;
            }
        }
        if (incomment) {
            if (base + 3 > len)
                return (-1);
            if ((buf[base] == '-') && (buf[base + 1] == '-') &&
                (buf[base + 2] == '>')) {
                incomment = 0;
                base += 2;
            }
            continue;
        }
        for (i = 0; i < stopLen; ++i) {
            if (buf[base] == stop[i]) {
                ctxt->checkIndex = 0;
                return (base - (in->cur - in->base));
            }
        }
    }
    ctxt->checkIndex = base;
    return (-1);
}

/**
 * htmlParseTryOrFinish:
 * @ctxt:  an HTML parser context
 * @terminate:  last chunk indicator
 *
 * Try to progress on parsing. This is the push-parser state machine: it
 * loops on ctxt->instate, and in each state either consumes input through
 * the matching htmlParse* routine, switches to another state, or jumps to
 * the "done" label when the data currently available is not sufficient.
 * When @terminate is zero, most constructs are only parsed once a lookup
 * (htmlParseLookupSequence / htmlParseLookupChars) has found their
 * terminator in the data already received.
 *
 * On exit, if a document exists and parsing is terminating (or the state
 * is EOF or EPILOG), a default HTML 4.0 Transitional internal subset is
 * created when the document has none.
 *
 * Returns zero if no parsing was possible.
 * NOTE(review): ret is initialised to 0 and never reassigned in this
 * function, so the value returned is always 0.
 */
static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
    int ret = 0;
    htmlParserInputPtr in;
    int avail = 0;
    xmlChar cur, next;

#ifdef DEBUG_PUSH
    /* Trace the state the parser is in when this attempt starts. */
    switch (ctxt->instate) {
        case XML_PARSER_EOF:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EOF\n"); break;
        case XML_PARSER_START:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START\n"); break;
        case XML_PARSER_MISC:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try MISC\n");break;
        case XML_PARSER_COMMENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try COMMENT\n");break;
        case XML_PARSER_PROLOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PROLOG\n");break;
        case XML_PARSER_START_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START_TAG\n");break;
        case XML_PARSER_CONTENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CONTENT\n");break;
        case XML_PARSER_CDATA_SECTION:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CDATA_SECTION\n");break;
        case XML_PARSER_END_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try END_TAG\n");break;
        case XML_PARSER_ENTITY_DECL:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ENTITY_DECL\n");break;
        case XML_PARSER_ENTITY_VALUE:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ENTITY_VALUE\n");break;
        case XML_PARSER_ATTRIBUTE_VALUE:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ATTRIBUTE_VALUE\n");break;
        case XML_PARSER_DTD:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try DTD\n");break;
        case XML_PARSER_EPILOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EPILOG\n");break;
        case XML_PARSER_PI:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PI\n");break;
        case XML_PARSER_SYSTEM_LITERAL:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try SYSTEM_LITERAL\n");break;
    }
#endif

    while (1) {

        in = ctxt->input;
        if (in == NULL) break;
        /*
         * Number of bytes between the read cursor and the end of the data:
         * taken from in->length when the input has no buffer attached,
         * from the buffer fill level otherwise.
         */
        if (in->buf == NULL)
            avail = in->length - (in->cur - in->base);
        else
            avail = in->buf->buffer->use - (in->cur - in->base);
        if ((avail == 0) && (terminate)) {
            htmlAutoCloseOnEnd(ctxt);
            if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
                /*
                 * SAX: end of the document processing.
                 */
                ctxt->instate = XML_PARSER_EOF;
                if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                    ctxt->sax->endDocument(ctxt->userData);
            }
        }
        if (avail < 1)
            goto done;
        cur = in->cur[0];
        /* A zero byte in the input is skipped, whatever the state. */
        if (cur == 0) {
            SKIP(1);
            continue;
        }

        switch (ctxt->instate) {
            case XML_PARSER_EOF:
                /*
                 * Document parsing is done !
                 */
                goto done;
            case XML_PARSER_START:
                /*
                 * Very first chars read from the document flow.
                 */
                cur = in->cur[0];
                if (IS_BLANK_CH(cur)) {
                    SKIP_BLANKS;
                    /* the cursor moved, recompute what is left */
                    if (in->buf == NULL)
                        avail = in->length - (in->cur - in->base);
                    else
                        avail = in->buf->buffer->use - (in->cur - in->base);
                }
                if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
                    ctxt->sax->setDocumentLocator(ctxt->userData,
                                                  &xmlDefaultSAXLocator);
                if ((ctxt->sax) && (ctxt->sax->startDocument) &&
                    (!ctxt->disableSAX))
                    ctxt->sax->startDocument(ctxt->userData);

                cur = in->cur[0];
                next = in->cur[1];
                /* "<!DOCTYPE" (letters compared through UPP) starts the prolog */
                if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    /* wait until the closing '>' has been received */
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing internal subset\n");
#endif
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering PROLOG\n");
#endif
                } else {
                    ctxt->instate = XML_PARSER_MISC;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering MISC\n");
#endif
                }
                break;
            case XML_PARSER_MISC:
                /*
                 * Before any DOCTYPE: comments, PIs and the DOCTYPE itself
                 * are handled here, anything else goes to START_TAG.
                 */
                SKIP_BLANKS;
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_MISC;
                } else if ((cur == '<') && (next == '?')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing PI\n");
#endif
                    htmlParsePI(ctxt);
                    ctxt->instate = XML_PARSER_MISC;
                } else if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing internal subset\n");
#endif
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering PROLOG\n");
#endif
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 9)) {
                    /* "<!" seen but fewer than 9 bytes available: wait */
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_START_TAG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering START_TAG\n");
#endif
                }
                break;
            case XML_PARSER_PROLOG:
                /*
                 * After the DOCTYPE: only comments and PIs are handled
                 * here, anything else goes to START_TAG.
                 */
                SKIP_BLANKS;
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
                } else if ((cur == '<') && (next == '?')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing PI\n");
#endif
                    htmlParsePI(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    /* "<!" seen but fewer than 4 bytes available: wait */
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_START_TAG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering START_TAG\n");
#endif
                }
                break;
            case XML_PARSER_EPILOG:
                /*
                 * Entered from END_TAG once the element name stack is
                 * empty (ctxt->nameNr == 0).
                 */
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 1)
                    goto done;
                cur = in->cur[0];
                if (IS_BLANK_CH(cur)) {
                    htmlParseCharData(ctxt);
                    goto done;
                }
                if (avail < 2)
                    goto done;
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_EPILOG;
                } else if ((cur == '<') && (next == '?')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing PI\n");
#endif
                    htmlParsePI(ctxt);
                    ctxt->instate = XML_PARSER_EPILOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    goto done;
                } else {
                    /*
                     * Anything else after the end of the document is
                     * flagged as an error and ends the parsing.
                     */
                    ctxt->errNo = XML_ERR_DOCUMENT_END;
                    ctxt->wellFormed = 0;
                    ctxt->instate = XML_PARSER_EOF;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering EOF\n");
#endif
                    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                        ctxt->sax->endDocument(ctxt->userData);
                    goto done;
                }
                break;
            case XML_PARSER_START_TAG: {
                const xmlChar *name;
                int failed;
                const htmlElemDesc * info;

                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                /* not a tag at all: treat it as content */
                if (cur != '<') {
                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }
                /* "</" is an end tag */
                if (in->cur[1] == '/') {
                    ctxt->instate = XML_PARSER_END_TAG;
                    ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering END_TAG\n");
#endif
                    break;
                }
                /* wait until the closing '>' has been received */
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
                    goto done;

                failed = htmlParseStartTag(ctxt);
                name = ctxt->name;
                if ((failed == -1) ||
                    (name == NULL)) {
                    if (CUR == '>')
                        NEXT;
                    break;
                }

                /*
                 * Lookup the info for that element.
                 */
                info = htmlTagLookup(name);
                if (info == NULL) {
                    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
                                 "Tag %s invalid\n", name, NULL);
                }

                /*
                 * Check for an Empty Element labeled the XML/SGML way
                 */
                if ((CUR == '/') && (NXT(1) == '>')) {
                    SKIP(2);
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    htmlnamePop(ctxt);
                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }

                if (CUR == '>') {
                    NEXT;
                } else {
                    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
                                 "Couldn't find end of Start Tag %s\n",
                                 name, NULL);

                    /*
                     * end of parsing of this node.
                     */
                    if (xmlStrEqual(name, ctxt->name)) {
                        nodePop(ctxt);
                        htmlnamePop(ctxt);
                    }

                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }

                /*
                 * Check for an Empty Element from DTD definition
                 */
                if ((info != NULL) && (info->empty)) {
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    htmlnamePop(ctxt);
                }
                ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            }
            case XML_PARSER_CONTENT: {
                long cons;
                /*
                 * Handle preparsed entities and charRef
                 */
                if (ctxt->token != 0) {
                    xmlChar chr[2] = { 0 , 0 } ;

                    chr[0] = (xmlChar) ctxt->token;
                    htmlCheckParagraph(ctxt);
                    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
                        ctxt->sax->characters(ctxt->userData, chr, 1);
                    ctxt->token = 0;
                    ctxt->checkIndex = 0;
                }
                /*
                 * Last chunk with exactly one byte left: unless it opens
                 * a tag or a reference, deliver it directly to the SAX
                 * handler and step over it.
                 */
                if ((avail == 1) && (terminate)) {
                    cur = in->cur[0];
                    if ((cur != '<') && (cur != '&')) {
                        if (ctxt->sax != NULL) {
                            if (IS_BLANK_CH(cur)) {
                                if (ctxt->sax->ignorableWhitespace != NULL)
                                    ctxt->sax->ignorableWhitespace(
                                            ctxt->userData, &cur, 1);
                            } else {
                                htmlCheckParagraph(ctxt);
                                if (ctxt->sax->characters != NULL)
                                    ctxt->sax->characters(
                                            ctxt->userData, &cur, 1);
                            }
                        }
                        ctxt->token = 0;
                        ctxt->checkIndex = 0;
                        in->cur++;
                        break;
                    }
                }
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                /* remembered to detect, below, a pass that consumed nothing */
                cons = ctxt->nbChars;
                if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
                    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
                    /*
                     * Handle SCRIPT/STYLE separately
                     */
                    if (!terminate) {
                        int idx;
                        xmlChar val;

                        idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);
                        if (idx < 0)
                            goto done;
                        val = in->cur[idx + 2];
                        if (val == 0) /* bad cut of input */
                            goto done;
                    }
                    htmlParseScript(ctxt);
                    /*
                     * NOTE(review): cur and next were read before the call
                     * to htmlParseScript() and are not refreshed, so this
                     * tests the two bytes that were at the cursor on entry.
                     */
                    if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering END_TAG\n");
#endif
                        break;
                    }
                } else {
                    /*
                     * Sometimes DOCTYPE arrives in the middle of the document
                     */
                    if ((cur == '<') && (next == '!') &&
                        (UPP(2) == 'D') && (UPP(3) == 'O') &&
                        (UPP(4) == 'C') && (UPP(5) == 'T') &&
                        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                        (UPP(8) == 'E')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
                            goto done;
                        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                                     "Misplaced DOCTYPE declaration\n",
                                     BAD_CAST "DOCTYPE" , NULL);
                        htmlParseDocTypeDecl(ctxt);
                    } else if ((cur == '<') && (next == '!') &&
                        (in->cur[2] == '-') && (in->cur[3] == '-')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(
                                    ctxt, '-', '-', '>', 1, 1) < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing Comment\n");
#endif
                        htmlParseComment(ctxt);
                        ctxt->instate = XML_PARSER_CONTENT;
                    } else if ((cur == '<') && (next == '?')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing PI\n");
#endif
                        htmlParsePI(ctxt);
                        ctxt->instate = XML_PARSER_CONTENT;
                    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
                        goto done;
                    } else if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering END_TAG\n");
#endif
                        break;
                    } else if (cur == '<') {
                        ctxt->instate = XML_PARSER_START_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering START_TAG\n");
#endif
                        break;
                    } else if (cur == '&') {
                        /* wait for one of ';', ' ', '>' or '/' */
                        if ((!terminate) &&
                            (htmlParseLookupChars(ctxt,
                                                  BAD_CAST "; >/", 4) < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing Reference\n");
#endif
                        /* TODO: check generation of subtrees if noent !!! */
                        htmlParseReference(ctxt);
                    } else {
                        /*
                         * check that the text sequence is complete
                         * before handing out the data to the parser
                         * to avoid problems with erroneous end of
                         * data detection.
                         */
                        if ((!terminate) &&
                            (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
                            goto done;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing char data\n");
#endif
                        htmlParseCharData(ctxt);
                    }
                }
                /*
                 * ctxt->nbChars did not change during this pass: report
                 * it (only when there is a current node) and step with
                 * NEXT so that the loop cannot stay on the same input.
                 */
                if (cons == ctxt->nbChars) {
                    if (ctxt->node != NULL) {
                        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                                     "detected an error in element content\n",
                                     NULL, NULL);
                    }
                    NEXT;
                    break;
                }

                break;
            }
            case XML_PARSER_END_TAG:
                if (avail < 2)
                    goto done;
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
                    goto done;
                htmlParseEndTag(ctxt);
                /* no open element left: what follows is the epilog */
                if (ctxt->nameNr == 0) {
                    ctxt->instate = XML_PARSER_EPILOG;
                } else {
                    ctxt->instate = XML_PARSER_CONTENT;
                }
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            /*
             * None of the remaining states is ever selected by this
             * function: each one reports an internal error and falls back
             * to CONTENT (START_TAG for ATTRIBUTE_VALUE).
             */
            case XML_PARSER_CDATA_SECTION:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == CDATA\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_DTD:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == DTD\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_COMMENT:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == COMMENT\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_PI:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == PI\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_ENTITY_DECL:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == ENTITY_DECL\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_ENTITY_VALUE:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == ENTITY_VALUE\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering DTD\n");
#endif
                break;
            case XML_PARSER_ATTRIBUTE_VALUE:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == ATTRIBUTE_VALUE\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_START_TAG;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering START_TAG\n");
#endif
                break;
            case XML_PARSER_SYSTEM_LITERAL:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_IGNORE:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == XML_PARSER_IGNORE\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_PUBLIC_LITERAL:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == XML_PARSER_LITERAL\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;

        }
    }
done:
    /*
     * Same end-of-input handling as at the top of the loop, applied to
     * the last value computed for avail.
     */
    if ((avail == 0) && (terminate)) {
        htmlAutoCloseOnEnd(ctxt);
        if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
            /*
             * SAX: end of the document processing.
             */
            ctxt->instate = XML_PARSER_EOF;
            if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                ctxt->sax->endDocument(ctxt->userData);
        }
    }
    /*
     * When finishing (or already in EOF/EPILOG), give the document the
     * default HTML 4.0 Transitional internal subset if it has none.
     */
    if ((ctxt->myDoc != NULL) &&
        ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
         (ctxt->instate == XML_PARSER_EPILOG))) {
        xmlDtdPtr dtd;
        dtd = xmlGetIntSubset(ctxt->myDoc);
        if (dtd == NULL)
            ctxt->myDoc->intSubset =
                xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
                    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
                    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
    }
#ifdef DEBUG_PUSH
    xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
#endif
    return(ret);
}

/**
 * htmlParseChunk:
 * @ctxt:  an HTML parser context
 * @chunk:  a char array
 * @size:  the size in bytes of the chunk
 * @terminate:  last chunk indicator
 *
 * Parse a Chunk of memory
 *
 * Returns zero if no error, the xmlParserErrors otherwise.
5896 */ 5897 int 5898 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, 5899 int terminate) { 5900 if ((ctxt == NULL) || (ctxt->input == NULL)) { 5901 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5902 "htmlParseChunk: context error\n", NULL, NULL); 5903 return(XML_ERR_INTERNAL_ERROR); 5904 } 5905 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 5906 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) { 5907 int base = ctxt->input->base - ctxt->input->buf->buffer->content; 5908 int cur = ctxt->input->cur - ctxt->input->base; 5909 int res; 5910 5911 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 5912 if (res < 0) { 5913 ctxt->errNo = XML_PARSER_EOF; 5914 ctxt->disableSAX = 1; 5915 return (XML_PARSER_EOF); 5916 } 5917 ctxt->input->base = ctxt->input->buf->buffer->content + base; 5918 ctxt->input->cur = ctxt->input->base + cur; 5919 ctxt->input->end = 5920 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; 5921 #ifdef DEBUG_PUSH 5922 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 5923 #endif 5924 5925 #if 0 5926 if ((terminate) || (ctxt->input->buf->buffer->use > 80)) 5927 htmlParseTryOrFinish(ctxt, terminate); 5928 #endif 5929 } else if (ctxt->instate != XML_PARSER_EOF) { 5930 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) { 5931 xmlParserInputBufferPtr in = ctxt->input->buf; 5932 if ((in->encoder != NULL) && (in->buffer != NULL) && 5933 (in->raw != NULL)) { 5934 int nbchars; 5935 5936 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw); 5937 if (nbchars < 0) { 5938 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 5939 "encoder error\n", NULL, NULL); 5940 return(XML_ERR_INVALID_ENCODING); 5941 } 5942 } 5943 } 5944 } 5945 htmlParseTryOrFinish(ctxt, terminate); 5946 if (terminate) { 5947 if ((ctxt->instate != XML_PARSER_EOF) && 5948 (ctxt->instate != XML_PARSER_EPILOG) && 5949 (ctxt->instate != XML_PARSER_MISC)) { 5950 ctxt->errNo = XML_ERR_DOCUMENT_END; 5951 
ctxt->wellFormed = 0; 5952 } 5953 if (ctxt->instate != XML_PARSER_EOF) { 5954 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5955 ctxt->sax->endDocument(ctxt->userData); 5956 } 5957 ctxt->instate = XML_PARSER_EOF; 5958 } 5959 return((xmlParserErrors) ctxt->errNo); 5960 } 5961 5962 /************************************************************************ 5963 * * 5964 * User entry points * 5965 * * 5966 ************************************************************************/ 5967 5968 /** 5969 * htmlCreatePushParserCtxt: 5970 * @sax: a SAX handler 5971 * @user_data: The user data returned on SAX callbacks 5972 * @chunk: a pointer to an array of chars 5973 * @size: number of chars in the array 5974 * @filename: an optional file name or URI 5975 * @enc: an optional encoding 5976 * 5977 * Create a parser context for using the HTML parser in push mode 5978 * The value of @filename is used for fetching external entities 5979 * and error/warning reports. 5980 * 5981 * Returns the new parser context or NULL 5982 */ 5983 htmlParserCtxtPtr 5984 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, 5985 const char *chunk, int size, const char *filename, 5986 xmlCharEncoding enc) { 5987 htmlParserCtxtPtr ctxt; 5988 htmlParserInputPtr inputStream; 5989 xmlParserInputBufferPtr buf; 5990 5991 xmlInitParser(); 5992 5993 buf = xmlAllocParserInputBuffer(enc); 5994 if (buf == NULL) return(NULL); 5995 5996 ctxt = htmlNewParserCtxt(); 5997 if (ctxt == NULL) { 5998 xmlFreeParserInputBuffer(buf); 5999 return(NULL); 6000 } 6001 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder) 6002 ctxt->charset=XML_CHAR_ENCODING_UTF8; 6003 if (sax != NULL) { 6004 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler) 6005 xmlFree(ctxt->sax); 6006 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler)); 6007 if (ctxt->sax == NULL) { 6008 xmlFree(buf); 6009 xmlFree(ctxt); 6010 return(NULL); 6011 } 6012 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler)); 6013 if (user_data != 
NULL) 6014 ctxt->userData = user_data; 6015 } 6016 if (filename == NULL) { 6017 ctxt->directory = NULL; 6018 } else { 6019 ctxt->directory = xmlParserGetDirectory(filename); 6020 } 6021 6022 inputStream = htmlNewInputStream(ctxt); 6023 if (inputStream == NULL) { 6024 xmlFreeParserCtxt(ctxt); 6025 xmlFree(buf); 6026 return(NULL); 6027 } 6028 6029 if (filename == NULL) 6030 inputStream->filename = NULL; 6031 else 6032 inputStream->filename = (char *) 6033 xmlCanonicPath((const xmlChar *) filename); 6034 inputStream->buf = buf; 6035 inputStream->base = inputStream->buf->buffer->content; 6036 inputStream->cur = inputStream->buf->buffer->content; 6037 inputStream->end = 6038 &inputStream->buf->buffer->content[inputStream->buf->buffer->use]; 6039 6040 inputPush(ctxt, inputStream); 6041 6042 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 6043 (ctxt->input->buf != NULL)) { 6044 int base = ctxt->input->base - ctxt->input->buf->buffer->content; 6045 int cur = ctxt->input->cur - ctxt->input->base; 6046 6047 xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 6048 6049 ctxt->input->base = ctxt->input->buf->buffer->content + base; 6050 ctxt->input->cur = ctxt->input->base + cur; 6051 ctxt->input->end = 6052 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; 6053 #ifdef DEBUG_PUSH 6054 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 6055 #endif 6056 } 6057 ctxt->progressive = 1; 6058 6059 return(ctxt); 6060 } 6061 #endif /* LIBXML_PUSH_ENABLED */ 6062 6063 /** 6064 * htmlSAXParseDoc: 6065 * @cur: a pointer to an array of xmlChar 6066 * @encoding: a free form C string describing the HTML document encoding, or NULL 6067 * @sax: the SAX handler block 6068 * @userData: if using SAX, this pointer will be provided on callbacks. 6069 * 6070 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks 6071 * to handle parse events. If sax is NULL, fallback to the default DOM 6072 * behavior and return a tree. 
6073 * 6074 * Returns the resulting document tree unless SAX is NULL or the document is 6075 * not well formed. 6076 */ 6077 6078 htmlDocPtr 6079 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) { 6080 htmlDocPtr ret; 6081 htmlParserCtxtPtr ctxt; 6082 6083 xmlInitParser(); 6084 6085 if (cur == NULL) return(NULL); 6086 6087 6088 ctxt = htmlCreateDocParserCtxt(cur, encoding); 6089 if (ctxt == NULL) return(NULL); 6090 if (sax != NULL) { 6091 if (ctxt->sax != NULL) xmlFree (ctxt->sax); 6092 ctxt->sax = sax; 6093 ctxt->userData = userData; 6094 } 6095 6096 htmlParseDocument(ctxt); 6097 ret = ctxt->myDoc; 6098 if (sax != NULL) { 6099 ctxt->sax = NULL; 6100 ctxt->userData = NULL; 6101 } 6102 htmlFreeParserCtxt(ctxt); 6103 6104 return(ret); 6105 } 6106 6107 /** 6108 * htmlParseDoc: 6109 * @cur: a pointer to an array of xmlChar 6110 * @encoding: a free form C string describing the HTML document encoding, or NULL 6111 * 6112 * parse an HTML in-memory document and build a tree. 6113 * 6114 * Returns the resulting document tree 6115 */ 6116 6117 htmlDocPtr 6118 htmlParseDoc(xmlChar *cur, const char *encoding) { 6119 return(htmlSAXParseDoc(cur, encoding, NULL, NULL)); 6120 } 6121 6122 6123 /** 6124 * htmlCreateFileParserCtxt: 6125 * @filename: the filename 6126 * @encoding: a free form C string describing the HTML document encoding, or NULL 6127 * 6128 * Create a parser context for a file content. 6129 * Automatic support for ZLIB/Compress compressed document is provided 6130 * by default if found at compile-time. 
6131 * 6132 * Returns the new parser context or NULL 6133 */ 6134 htmlParserCtxtPtr 6135 htmlCreateFileParserCtxt(const char *filename, const char *encoding) 6136 { 6137 htmlParserCtxtPtr ctxt; 6138 htmlParserInputPtr inputStream; 6139 char *canonicFilename; 6140 /* htmlCharEncoding enc; */ 6141 xmlChar *content, *content_line = (xmlChar *) "charset="; 6142 6143 if (filename == NULL) 6144 return(NULL); 6145 6146 ctxt = htmlNewParserCtxt(); 6147 if (ctxt == NULL) { 6148 return(NULL); 6149 } 6150 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename); 6151 if (canonicFilename == NULL) { 6152 #ifdef LIBXML_SAX1_ENABLED 6153 if (xmlDefaultSAXHandler.error != NULL) { 6154 xmlDefaultSAXHandler.error(NULL, "out of memory\n"); 6155 } 6156 #endif 6157 xmlFreeParserCtxt(ctxt); 6158 return(NULL); 6159 } 6160 6161 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt); 6162 xmlFree(canonicFilename); 6163 if (inputStream == NULL) { 6164 xmlFreeParserCtxt(ctxt); 6165 return(NULL); 6166 } 6167 6168 inputPush(ctxt, inputStream); 6169 6170 /* set encoding */ 6171 if (encoding) { 6172 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1); 6173 if (content) { 6174 strcpy ((char *)content, (char *)content_line); 6175 strcat ((char *)content, (char *)encoding); 6176 htmlCheckEncoding (ctxt, content); 6177 xmlFree (content); 6178 } 6179 } 6180 6181 return(ctxt); 6182 } 6183 6184 /** 6185 * htmlSAXParseFile: 6186 * @filename: the filename 6187 * @encoding: a free form C string describing the HTML document encoding, or NULL 6188 * @sax: the SAX handler block 6189 * @userData: if using SAX, this pointer will be provided on callbacks. 6190 * 6191 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 6192 * compressed document is provided by default if found at compile-time. 6193 * It use the given SAX function block to handle the parsing callback. 6194 * If sax is NULL, fallback to the default DOM tree building routines. 
6195 * 6196 * Returns the resulting document tree unless SAX is NULL or the document is 6197 * not well formed. 6198 */ 6199 6200 htmlDocPtr 6201 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax, 6202 void *userData) { 6203 htmlDocPtr ret; 6204 htmlParserCtxtPtr ctxt; 6205 htmlSAXHandlerPtr oldsax = NULL; 6206 6207 xmlInitParser(); 6208 6209 ctxt = htmlCreateFileParserCtxt(filename, encoding); 6210 if (ctxt == NULL) return(NULL); 6211 if (sax != NULL) { 6212 oldsax = ctxt->sax; 6213 ctxt->sax = sax; 6214 ctxt->userData = userData; 6215 } 6216 6217 htmlParseDocument(ctxt); 6218 6219 ret = ctxt->myDoc; 6220 if (sax != NULL) { 6221 ctxt->sax = oldsax; 6222 ctxt->userData = NULL; 6223 } 6224 htmlFreeParserCtxt(ctxt); 6225 6226 return(ret); 6227 } 6228 6229 /** 6230 * htmlParseFile: 6231 * @filename: the filename 6232 * @encoding: a free form C string describing the HTML document encoding, or NULL 6233 * 6234 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 6235 * compressed document is provided by default if found at compile-time. 6236 * 6237 * Returns the resulting document tree 6238 */ 6239 6240 htmlDocPtr 6241 htmlParseFile(const char *filename, const char *encoding) { 6242 return(htmlSAXParseFile(filename, encoding, NULL, NULL)); 6243 } 6244 6245 /** 6246 * htmlHandleOmittedElem: 6247 * @val: int 0 or 1 6248 * 6249 * Set and return the previous value for handling HTML omitted tags. 6250 * 6251 * Returns the last value for 0 for no handling, 1 for auto insertion. 6252 */ 6253 6254 int 6255 htmlHandleOmittedElem(int val) { 6256 int old = htmlOmittedDefaultValue; 6257 6258 htmlOmittedDefaultValue = val; 6259 return(old); 6260 } 6261 6262 /** 6263 * htmlElementAllowedHere: 6264 * @parent: HTML parent element 6265 * @elt: HTML element 6266 * 6267 * Checks whether an HTML element may be a direct child of a parent element. 
6268 * Note - doesn't check for deprecated elements 6269 * 6270 * Returns 1 if allowed; 0 otherwise. 6271 */ 6272 int 6273 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) { 6274 const char** p ; 6275 6276 if ( ! elt || ! parent || ! parent->subelts ) 6277 return 0 ; 6278 6279 for ( p = parent->subelts; *p; ++p ) 6280 if ( !xmlStrcmp((const xmlChar *)*p, elt) ) 6281 return 1 ; 6282 6283 return 0 ; 6284 } 6285 /** 6286 * htmlElementStatusHere: 6287 * @parent: HTML parent element 6288 * @elt: HTML element 6289 * 6290 * Checks whether an HTML element may be a direct child of a parent element. 6291 * and if so whether it is valid or deprecated. 6292 * 6293 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID 6294 */ 6295 htmlStatus 6296 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) { 6297 if ( ! parent || ! elt ) 6298 return HTML_INVALID ; 6299 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) ) 6300 return HTML_INVALID ; 6301 6302 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ; 6303 } 6304 /** 6305 * htmlAttrAllowed: 6306 * @elt: HTML element 6307 * @attr: HTML attribute 6308 * @legacy: whether to allow deprecated attributes 6309 * 6310 * Checks whether an attribute is valid for an element 6311 * Has full knowledge of Required and Deprecated attributes 6312 * 6313 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID 6314 */ 6315 htmlStatus 6316 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) { 6317 const char** p ; 6318 6319 if ( !elt || ! 
attr ) 6320 return HTML_INVALID ; 6321 6322 if ( elt->attrs_req ) 6323 for ( p = elt->attrs_req; *p; ++p) 6324 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6325 return HTML_REQUIRED ; 6326 6327 if ( elt->attrs_opt ) 6328 for ( p = elt->attrs_opt; *p; ++p) 6329 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6330 return HTML_VALID ; 6331 6332 if ( legacy && elt->attrs_depr ) 6333 for ( p = elt->attrs_depr; *p; ++p) 6334 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6335 return HTML_DEPRECATED ; 6336 6337 return HTML_INVALID ; 6338 } 6339 /** 6340 * htmlNodeStatus: 6341 * @node: an htmlNodePtr in a tree 6342 * @legacy: whether to allow deprecated elements (YES is faster here 6343 * for Element nodes) 6344 * 6345 * Checks whether the tree node is valid. Experimental (the author 6346 * only uses the HTML enhancements in a SAX parser) 6347 * 6348 * Return: for Element nodes, a return from htmlElementAllowedHere (if 6349 * legacy allowed) or htmlElementStatusHere (otherwise). 6350 * for Attribute nodes, a return from htmlAttrAllowed 6351 * for other nodes, HTML_NA (no checks performed) 6352 */ 6353 htmlStatus 6354 htmlNodeStatus(const htmlNodePtr node, int legacy) { 6355 if ( ! node ) 6356 return HTML_INVALID ; 6357 6358 switch ( node->type ) { 6359 case XML_ELEMENT_NODE: 6360 return legacy 6361 ? ( htmlElementAllowedHere ( 6362 htmlTagLookup(node->parent->name) , node->name 6363 ) ? 
HTML_VALID : HTML_INVALID ) 6364 : htmlElementStatusHere( 6365 htmlTagLookup(node->parent->name) , 6366 htmlTagLookup(node->name) ) 6367 ; 6368 case XML_ATTRIBUTE_NODE: 6369 return htmlAttrAllowed( 6370 htmlTagLookup(node->parent->name) , node->name, legacy) ; 6371 default: return HTML_NA ; 6372 } 6373 } 6374 /************************************************************************ 6375 * * 6376 * New set (2.6.0) of simpler and more flexible APIs * 6377 * * 6378 ************************************************************************/ 6379 /** 6380 * DICT_FREE: 6381 * @str: a string 6382 * 6383 * Free a string if it is not owned by the "dict" dictionnary in the 6384 * current scope 6385 */ 6386 #define DICT_FREE(str) \ 6387 if ((str) && ((!dict) || \ 6388 (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \ 6389 xmlFree((char *)(str)); 6390 6391 /** 6392 * htmlCtxtReset: 6393 * @ctxt: an HTML parser context 6394 * 6395 * Reset a parser context 6396 */ 6397 void 6398 htmlCtxtReset(htmlParserCtxtPtr ctxt) 6399 { 6400 xmlParserInputPtr input; 6401 xmlDictPtr dict; 6402 6403 if (ctxt == NULL) 6404 return; 6405 6406 xmlInitParser(); 6407 dict = ctxt->dict; 6408 6409 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */ 6410 xmlFreeInputStream(input); 6411 } 6412 ctxt->inputNr = 0; 6413 ctxt->input = NULL; 6414 6415 ctxt->spaceNr = 0; 6416 if (ctxt->spaceTab != NULL) { 6417 ctxt->spaceTab[0] = -1; 6418 ctxt->space = &ctxt->spaceTab[0]; 6419 } else { 6420 ctxt->space = NULL; 6421 } 6422 6423 6424 ctxt->nodeNr = 0; 6425 ctxt->node = NULL; 6426 6427 ctxt->nameNr = 0; 6428 ctxt->name = NULL; 6429 6430 DICT_FREE(ctxt->version); 6431 ctxt->version = NULL; 6432 DICT_FREE(ctxt->encoding); 6433 ctxt->encoding = NULL; 6434 DICT_FREE(ctxt->directory); 6435 ctxt->directory = NULL; 6436 DICT_FREE(ctxt->extSubURI); 6437 ctxt->extSubURI = NULL; 6438 DICT_FREE(ctxt->extSubSystem); 6439 ctxt->extSubSystem = NULL; 6440 if (ctxt->myDoc != NULL) 6441 xmlFreeDoc(ctxt->myDoc); 6442 
ctxt->myDoc = NULL; 6443 6444 ctxt->standalone = -1; 6445 ctxt->hasExternalSubset = 0; 6446 ctxt->hasPErefs = 0; 6447 ctxt->html = 1; 6448 ctxt->external = 0; 6449 ctxt->instate = XML_PARSER_START; 6450 ctxt->token = 0; 6451 6452 ctxt->wellFormed = 1; 6453 ctxt->nsWellFormed = 1; 6454 ctxt->valid = 1; 6455 ctxt->vctxt.userData = ctxt; 6456 ctxt->vctxt.error = xmlParserValidityError; 6457 ctxt->vctxt.warning = xmlParserValidityWarning; 6458 ctxt->record_info = 0; 6459 ctxt->nbChars = 0; 6460 ctxt->checkIndex = 0; 6461 ctxt->inSubset = 0; 6462 ctxt->errNo = XML_ERR_OK; 6463 ctxt->depth = 0; 6464 ctxt->charset = XML_CHAR_ENCODING_NONE; 6465 ctxt->catalogs = NULL; 6466 xmlInitNodeInfoSeq(&ctxt->node_seq); 6467 6468 if (ctxt->attsDefault != NULL) { 6469 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree); 6470 ctxt->attsDefault = NULL; 6471 } 6472 if (ctxt->attsSpecial != NULL) { 6473 xmlHashFree(ctxt->attsSpecial, NULL); 6474 ctxt->attsSpecial = NULL; 6475 } 6476 } 6477 6478 /** 6479 * htmlCtxtUseOptions: 6480 * @ctxt: an HTML parser context 6481 * @options: a combination of htmlParserOption(s) 6482 * 6483 * Applies the options to the parser context 6484 * 6485 * Returns 0 in case of success, the set of unknown or unimplemented options 6486 * in case of error. 
6487 */ 6488 int 6489 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) 6490 { 6491 if (ctxt == NULL) 6492 return(-1); 6493 6494 if (options & HTML_PARSE_NOWARNING) { 6495 ctxt->sax->warning = NULL; 6496 ctxt->vctxt.warning = NULL; 6497 options -= XML_PARSE_NOWARNING; 6498 ctxt->options |= XML_PARSE_NOWARNING; 6499 } 6500 if (options & HTML_PARSE_NOERROR) { 6501 ctxt->sax->error = NULL; 6502 ctxt->vctxt.error = NULL; 6503 ctxt->sax->fatalError = NULL; 6504 options -= XML_PARSE_NOERROR; 6505 ctxt->options |= XML_PARSE_NOERROR; 6506 } 6507 if (options & HTML_PARSE_PEDANTIC) { 6508 ctxt->pedantic = 1; 6509 options -= XML_PARSE_PEDANTIC; 6510 ctxt->options |= XML_PARSE_PEDANTIC; 6511 } else 6512 ctxt->pedantic = 0; 6513 if (options & XML_PARSE_NOBLANKS) { 6514 ctxt->keepBlanks = 0; 6515 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace; 6516 options -= XML_PARSE_NOBLANKS; 6517 ctxt->options |= XML_PARSE_NOBLANKS; 6518 } else 6519 ctxt->keepBlanks = 1; 6520 if (options & HTML_PARSE_RECOVER) { 6521 ctxt->recovery = 1; 6522 options -= HTML_PARSE_RECOVER; 6523 } else 6524 ctxt->recovery = 0; 6525 if (options & HTML_PARSE_COMPACT) { 6526 ctxt->options |= HTML_PARSE_COMPACT; 6527 options -= HTML_PARSE_COMPACT; 6528 } 6529 if (options & XML_PARSE_HUGE) { 6530 ctxt->options |= XML_PARSE_HUGE; 6531 options -= XML_PARSE_HUGE; 6532 } 6533 ctxt->dictNames = 0; 6534 return (options); 6535 } 6536 6537 /** 6538 * htmlDoRead: 6539 * @ctxt: an HTML parser context 6540 * @URL: the base URL to use for the document 6541 * @encoding: the document encoding, or NULL 6542 * @options: a combination of htmlParserOption(s) 6543 * @reuse: keep the context for reuse 6544 * 6545 * Common front-end for the htmlRead functions 6546 * 6547 * Returns the resulting document tree or NULL 6548 */ 6549 static htmlDocPtr 6550 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding, 6551 int options, int reuse) 6552 { 6553 htmlDocPtr ret; 6554 6555 htmlCtxtUseOptions(ctxt, 
options); 6556 ctxt->html = 1; 6557 if (encoding != NULL) { 6558 xmlCharEncodingHandlerPtr hdlr; 6559 6560 hdlr = xmlFindCharEncodingHandler(encoding); 6561 if (hdlr != NULL) { 6562 xmlSwitchToEncoding(ctxt, hdlr); 6563 if (ctxt->input->encoding != NULL) 6564 xmlFree((xmlChar *) ctxt->input->encoding); 6565 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding); 6566 } 6567 } 6568 if ((URL != NULL) && (ctxt->input != NULL) && 6569 (ctxt->input->filename == NULL)) 6570 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL); 6571 htmlParseDocument(ctxt); 6572 ret = ctxt->myDoc; 6573 ctxt->myDoc = NULL; 6574 if (!reuse) { 6575 if ((ctxt->dictNames) && 6576 (ret != NULL) && 6577 (ret->dict == ctxt->dict)) 6578 ctxt->dict = NULL; 6579 xmlFreeParserCtxt(ctxt); 6580 } 6581 return (ret); 6582 } 6583 6584 /** 6585 * htmlReadDoc: 6586 * @cur: a pointer to a zero terminated string 6587 * @URL: the base URL to use for the document 6588 * @encoding: the document encoding, or NULL 6589 * @options: a combination of htmlParserOption(s) 6590 * 6591 * parse an XML in-memory document and build a tree. 6592 * 6593 * Returns the resulting document tree 6594 */ 6595 htmlDocPtr 6596 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options) 6597 { 6598 htmlParserCtxtPtr ctxt; 6599 6600 if (cur == NULL) 6601 return (NULL); 6602 6603 xmlInitParser(); 6604 ctxt = htmlCreateDocParserCtxt(cur, NULL); 6605 if (ctxt == NULL) 6606 return (NULL); 6607 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6608 } 6609 6610 /** 6611 * htmlReadFile: 6612 * @filename: a file or URL 6613 * @encoding: the document encoding, or NULL 6614 * @options: a combination of htmlParserOption(s) 6615 * 6616 * parse an XML file from the filesystem or the network. 
6617 * 6618 * Returns the resulting document tree 6619 */ 6620 htmlDocPtr 6621 htmlReadFile(const char *filename, const char *encoding, int options) 6622 { 6623 htmlParserCtxtPtr ctxt; 6624 6625 xmlInitParser(); 6626 ctxt = htmlCreateFileParserCtxt(filename, encoding); 6627 if (ctxt == NULL) 6628 return (NULL); 6629 return (htmlDoRead(ctxt, NULL, NULL, options, 0)); 6630 } 6631 6632 /** 6633 * htmlReadMemory: 6634 * @buffer: a pointer to a char array 6635 * @size: the size of the array 6636 * @URL: the base URL to use for the document 6637 * @encoding: the document encoding, or NULL 6638 * @options: a combination of htmlParserOption(s) 6639 * 6640 * parse an XML in-memory document and build a tree. 6641 * 6642 * Returns the resulting document tree 6643 */ 6644 htmlDocPtr 6645 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options) 6646 { 6647 htmlParserCtxtPtr ctxt; 6648 6649 xmlInitParser(); 6650 ctxt = xmlCreateMemoryParserCtxt(buffer, size); 6651 if (ctxt == NULL) 6652 return (NULL); 6653 htmlDefaultSAXHandlerInit(); 6654 if (ctxt->sax != NULL) 6655 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 6656 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6657 } 6658 6659 /** 6660 * htmlReadFd: 6661 * @fd: an open file descriptor 6662 * @URL: the base URL to use for the document 6663 * @encoding: the document encoding, or NULL 6664 * @options: a combination of htmlParserOption(s) 6665 * 6666 * parse an XML from a file descriptor and build a tree. 
6667 * 6668 * Returns the resulting document tree 6669 */ 6670 htmlDocPtr 6671 htmlReadFd(int fd, const char *URL, const char *encoding, int options) 6672 { 6673 htmlParserCtxtPtr ctxt; 6674 xmlParserInputBufferPtr input; 6675 xmlParserInputPtr stream; 6676 6677 if (fd < 0) 6678 return (NULL); 6679 6680 xmlInitParser(); 6681 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 6682 if (input == NULL) 6683 return (NULL); 6684 ctxt = xmlNewParserCtxt(); 6685 if (ctxt == NULL) { 6686 xmlFreeParserInputBuffer(input); 6687 return (NULL); 6688 } 6689 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6690 if (stream == NULL) { 6691 xmlFreeParserInputBuffer(input); 6692 xmlFreeParserCtxt(ctxt); 6693 return (NULL); 6694 } 6695 inputPush(ctxt, stream); 6696 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6697 } 6698 6699 /** 6700 * htmlReadIO: 6701 * @ioread: an I/O read function 6702 * @ioclose: an I/O close function 6703 * @ioctx: an I/O handler 6704 * @URL: the base URL to use for the document 6705 * @encoding: the document encoding, or NULL 6706 * @options: a combination of htmlParserOption(s) 6707 * 6708 * parse an HTML document from I/O functions and source and build a tree. 
6709 * 6710 * Returns the resulting document tree 6711 */ 6712 htmlDocPtr 6713 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, 6714 void *ioctx, const char *URL, const char *encoding, int options) 6715 { 6716 htmlParserCtxtPtr ctxt; 6717 xmlParserInputBufferPtr input; 6718 xmlParserInputPtr stream; 6719 6720 if (ioread == NULL) 6721 return (NULL); 6722 xmlInitParser(); 6723 6724 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 6725 XML_CHAR_ENCODING_NONE); 6726 if (input == NULL) 6727 return (NULL); 6728 ctxt = htmlNewParserCtxt(); 6729 if (ctxt == NULL) { 6730 xmlFreeParserInputBuffer(input); 6731 return (NULL); 6732 } 6733 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6734 if (stream == NULL) { 6735 xmlFreeParserInputBuffer(input); 6736 xmlFreeParserCtxt(ctxt); 6737 return (NULL); 6738 } 6739 inputPush(ctxt, stream); 6740 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6741 } 6742 6743 /** 6744 * htmlCtxtReadDoc: 6745 * @ctxt: an HTML parser context 6746 * @cur: a pointer to a zero terminated string 6747 * @URL: the base URL to use for the document 6748 * @encoding: the document encoding, or NULL 6749 * @options: a combination of htmlParserOption(s) 6750 * 6751 * parse an XML in-memory document and build a tree. 
6752 * This reuses the existing @ctxt parser context 6753 * 6754 * Returns the resulting document tree 6755 */ 6756 htmlDocPtr 6757 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur, 6758 const char *URL, const char *encoding, int options) 6759 { 6760 xmlParserInputPtr stream; 6761 6762 if (cur == NULL) 6763 return (NULL); 6764 if (ctxt == NULL) 6765 return (NULL); 6766 6767 htmlCtxtReset(ctxt); 6768 6769 stream = xmlNewStringInputStream(ctxt, cur); 6770 if (stream == NULL) { 6771 return (NULL); 6772 } 6773 inputPush(ctxt, stream); 6774 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6775 } 6776 6777 /** 6778 * htmlCtxtReadFile: 6779 * @ctxt: an HTML parser context 6780 * @filename: a file or URL 6781 * @encoding: the document encoding, or NULL 6782 * @options: a combination of htmlParserOption(s) 6783 * 6784 * parse an XML file from the filesystem or the network. 6785 * This reuses the existing @ctxt parser context 6786 * 6787 * Returns the resulting document tree 6788 */ 6789 htmlDocPtr 6790 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename, 6791 const char *encoding, int options) 6792 { 6793 xmlParserInputPtr stream; 6794 6795 if (filename == NULL) 6796 return (NULL); 6797 if (ctxt == NULL) 6798 return (NULL); 6799 6800 htmlCtxtReset(ctxt); 6801 6802 stream = xmlLoadExternalEntity(filename, NULL, ctxt); 6803 if (stream == NULL) { 6804 return (NULL); 6805 } 6806 inputPush(ctxt, stream); 6807 return (htmlDoRead(ctxt, NULL, encoding, options, 1)); 6808 } 6809 6810 /** 6811 * htmlCtxtReadMemory: 6812 * @ctxt: an HTML parser context 6813 * @buffer: a pointer to a char array 6814 * @size: the size of the array 6815 * @URL: the base URL to use for the document 6816 * @encoding: the document encoding, or NULL 6817 * @options: a combination of htmlParserOption(s) 6818 * 6819 * parse an XML in-memory document and build a tree. 
6820 * This reuses the existing @ctxt parser context 6821 * 6822 * Returns the resulting document tree 6823 */ 6824 htmlDocPtr 6825 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size, 6826 const char *URL, const char *encoding, int options) 6827 { 6828 xmlParserInputBufferPtr input; 6829 xmlParserInputPtr stream; 6830 6831 if (ctxt == NULL) 6832 return (NULL); 6833 if (buffer == NULL) 6834 return (NULL); 6835 6836 htmlCtxtReset(ctxt); 6837 6838 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); 6839 if (input == NULL) { 6840 return(NULL); 6841 } 6842 6843 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6844 if (stream == NULL) { 6845 xmlFreeParserInputBuffer(input); 6846 return(NULL); 6847 } 6848 6849 inputPush(ctxt, stream); 6850 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6851 } 6852 6853 /** 6854 * htmlCtxtReadFd: 6855 * @ctxt: an HTML parser context 6856 * @fd: an open file descriptor 6857 * @URL: the base URL to use for the document 6858 * @encoding: the document encoding, or NULL 6859 * @options: a combination of htmlParserOption(s) 6860 * 6861 * parse an XML from a file descriptor and build a tree. 
6862 * This reuses the existing @ctxt parser context 6863 * 6864 * Returns the resulting document tree 6865 */ 6866 htmlDocPtr 6867 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd, 6868 const char *URL, const char *encoding, int options) 6869 { 6870 xmlParserInputBufferPtr input; 6871 xmlParserInputPtr stream; 6872 6873 if (fd < 0) 6874 return (NULL); 6875 if (ctxt == NULL) 6876 return (NULL); 6877 6878 htmlCtxtReset(ctxt); 6879 6880 6881 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 6882 if (input == NULL) 6883 return (NULL); 6884 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6885 if (stream == NULL) { 6886 xmlFreeParserInputBuffer(input); 6887 return (NULL); 6888 } 6889 inputPush(ctxt, stream); 6890 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6891 } 6892 6893 /** 6894 * htmlCtxtReadIO: 6895 * @ctxt: an HTML parser context 6896 * @ioread: an I/O read function 6897 * @ioclose: an I/O close function 6898 * @ioctx: an I/O handler 6899 * @URL: the base URL to use for the document 6900 * @encoding: the document encoding, or NULL 6901 * @options: a combination of htmlParserOption(s) 6902 * 6903 * parse an HTML document from I/O functions and source and build a tree. 
6904 * This reuses the existing @ctxt parser context 6905 * 6906 * Returns the resulting document tree 6907 */ 6908 htmlDocPtr 6909 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, 6910 xmlInputCloseCallback ioclose, void *ioctx, 6911 const char *URL, 6912 const char *encoding, int options) 6913 { 6914 xmlParserInputBufferPtr input; 6915 xmlParserInputPtr stream; 6916 6917 if (ioread == NULL) 6918 return (NULL); 6919 if (ctxt == NULL) 6920 return (NULL); 6921 6922 htmlCtxtReset(ctxt); 6923 6924 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 6925 XML_CHAR_ENCODING_NONE); 6926 if (input == NULL) 6927 return (NULL); 6928 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6929 if (stream == NULL) { 6930 xmlFreeParserInputBuffer(input); 6931 return (NULL); 6932 } 6933 inputPush(ctxt, stream); 6934 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6935 } 6936 6937 #define bottom_HTMLparser 6938 #include "elfgcchack.h" 6939 #endif /* LIBXML_HTML_ENABLED */ 6940