1 /* 2 * HTMLparser.c : an HTML 4.0 non-verifying parser 3 * 4 * See Copyright for the status of this software. 5 * 6 * daniel (at) veillard.com 7 */ 8 9 #define IN_LIBXML 10 #include "libxml.h" 11 #ifdef LIBXML_HTML_ENABLED 12 13 #include <string.h> 14 #ifdef HAVE_CTYPE_H 15 #include <ctype.h> 16 #endif 17 #ifdef HAVE_STDLIB_H 18 #include <stdlib.h> 19 #endif 20 #ifdef HAVE_SYS_STAT_H 21 #include <sys/stat.h> 22 #endif 23 #ifdef HAVE_FCNTL_H 24 #include <fcntl.h> 25 #endif 26 #ifdef HAVE_UNISTD_H 27 #include <unistd.h> 28 #endif 29 #ifdef HAVE_ZLIB_H 30 #include <zlib.h> 31 #endif 32 33 #include <libxml/xmlmemory.h> 34 #include <libxml/tree.h> 35 #include <libxml/parser.h> 36 #include <libxml/parserInternals.h> 37 #include <libxml/xmlerror.h> 38 #include <libxml/HTMLparser.h> 39 #include <libxml/HTMLtree.h> 40 #include <libxml/entities.h> 41 #include <libxml/encoding.h> 42 #include <libxml/valid.h> 43 #include <libxml/xmlIO.h> 44 #include <libxml/globals.h> 45 #include <libxml/uri.h> 46 47 #define HTML_MAX_NAMELEN 1000 48 #define HTML_PARSER_BIG_BUFFER_SIZE 1000 49 #define HTML_PARSER_BUFFER_SIZE 100 50 51 /* #define DEBUG */ 52 /* #define DEBUG_PUSH */ 53 54 static int htmlOmittedDefaultValue = 1; 55 56 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, 57 xmlChar end, xmlChar end2, xmlChar end3); 58 static void htmlParseComment(htmlParserCtxtPtr ctxt); 59 60 /************************************************************************ 61 * * 62 * Some factorized error routines * 63 * * 64 ************************************************************************/ 65 66 /** 67 * htmlErrMemory: 68 * @ctxt: an HTML parser context 69 * @extra: extra informations 70 * 71 * Handle a redefinition of attribute error 72 */ 73 static void 74 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra) 75 { 76 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 77 (ctxt->instate == XML_PARSER_EOF)) 78 return; 79 if (ctxt != NULL) { 80 ctxt->errNo = XML_ERR_NO_MEMORY; 81 
ctxt->instate = XML_PARSER_EOF; 82 ctxt->disableSAX = 1; 83 } 84 if (extra) 85 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 86 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 87 NULL, NULL, 0, 0, 88 "Memory allocation failed : %s\n", extra); 89 else 90 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 91 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL, 92 NULL, NULL, 0, 0, "Memory allocation failed\n"); 93 } 94 95 /** 96 * htmlParseErr: 97 * @ctxt: an HTML parser context 98 * @error: the error number 99 * @msg: the error message 100 * @str1: string infor 101 * @str2: string infor 102 * 103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints 104 */ 105 static void 106 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, 107 const char *msg, const xmlChar *str1, const xmlChar *str2) 108 { 109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 110 (ctxt->instate == XML_PARSER_EOF)) 111 return; 112 if (ctxt != NULL) 113 ctxt->errNo = error; 114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 115 XML_ERR_ERROR, NULL, 0, 116 (const char *) str1, (const char *) str2, 117 NULL, 0, 0, 118 msg, str1, str2); 119 if (ctxt != NULL) 120 ctxt->wellFormed = 0; 121 } 122 123 /** 124 * htmlParseErrInt: 125 * @ctxt: an HTML parser context 126 * @error: the error number 127 * @msg: the error message 128 * @val: integer info 129 * 130 * Handle a fatal parser error, i.e. 
violating Well-Formedness constraints 131 */ 132 static void 133 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, 134 const char *msg, int val) 135 { 136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 137 (ctxt->instate == XML_PARSER_EOF)) 138 return; 139 if (ctxt != NULL) 140 ctxt->errNo = error; 141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 142 XML_ERR_ERROR, NULL, 0, NULL, NULL, 143 NULL, val, 0, msg, val); 144 if (ctxt != NULL) 145 ctxt->wellFormed = 0; 146 } 147 148 /************************************************************************ 149 * * 150 * Parser stacks related functions and macros * 151 * * 152 ************************************************************************/ 153 154 /** 155 * htmlnamePush: 156 * @ctxt: an HTML parser context 157 * @value: the element name 158 * 159 * Pushes a new element name on top of the name stack 160 * 161 * Returns 0 in case of error, the index in the stack otherwise 162 */ 163 static int 164 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value) 165 { 166 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head"))) 167 ctxt->html = 3; 168 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body"))) 169 ctxt->html = 10; 170 if (ctxt->nameNr >= ctxt->nameMax) { 171 ctxt->nameMax *= 2; 172 ctxt->nameTab = (const xmlChar * *) 173 xmlRealloc((xmlChar * *)ctxt->nameTab, 174 ctxt->nameMax * 175 sizeof(ctxt->nameTab[0])); 176 if (ctxt->nameTab == NULL) { 177 htmlErrMemory(ctxt, NULL); 178 return (0); 179 } 180 } 181 ctxt->nameTab[ctxt->nameNr] = value; 182 ctxt->name = value; 183 return (ctxt->nameNr++); 184 } 185 /** 186 * htmlnamePop: 187 * @ctxt: an HTML parser context 188 * 189 * Pops the top element name from the name stack 190 * 191 * Returns the name just removed 192 */ 193 static const xmlChar * 194 htmlnamePop(htmlParserCtxtPtr ctxt) 195 { 196 const xmlChar *ret; 197 198 if (ctxt->nameNr <= 0) 199 return (NULL); 200 ctxt->nameNr--; 201 if (ctxt->nameNr < 0) 202 
return (NULL); 203 if (ctxt->nameNr > 0) 204 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1]; 205 else 206 ctxt->name = NULL; 207 ret = ctxt->nameTab[ctxt->nameNr]; 208 ctxt->nameTab[ctxt->nameNr] = NULL; 209 return (ret); 210 } 211 212 /** 213 * htmlNodeInfoPush: 214 * @ctxt: an HTML parser context 215 * @value: the node info 216 * 217 * Pushes a new element name on top of the node info stack 218 * 219 * Returns 0 in case of error, the index in the stack otherwise 220 */ 221 static int 222 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value) 223 { 224 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) { 225 if (ctxt->nodeInfoMax == 0) 226 ctxt->nodeInfoMax = 5; 227 ctxt->nodeInfoMax *= 2; 228 ctxt->nodeInfoTab = (htmlParserNodeInfo *) 229 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab, 230 ctxt->nodeInfoMax * 231 sizeof(ctxt->nodeInfoTab[0])); 232 if (ctxt->nodeInfoTab == NULL) { 233 htmlErrMemory(ctxt, NULL); 234 return (0); 235 } 236 } 237 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value; 238 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; 239 return (ctxt->nodeInfoNr++); 240 } 241 242 /** 243 * htmlNodeInfoPop: 244 * @ctxt: an HTML parser context 245 * 246 * Pops the top element name from the node info stack 247 * 248 * Returns 0 in case of error, the pointer to NodeInfo otherwise 249 */ 250 static htmlParserNodeInfo * 251 htmlNodeInfoPop(htmlParserCtxtPtr ctxt) 252 { 253 if (ctxt->nodeInfoNr <= 0) 254 return (NULL); 255 ctxt->nodeInfoNr--; 256 if (ctxt->nodeInfoNr < 0) 257 return (NULL); 258 if (ctxt->nodeInfoNr > 0) 259 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1]; 260 else 261 ctxt->nodeInfo = NULL; 262 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; 263 } 264 265 /* 266 * Macros for accessing the content. Those should be used only by the parser, 267 * and not exported. 268 * 269 * Dirty macros, i.e. one need to make assumption on the context to use them 270 * 271 * CUR_PTR return the current pointer to the xmlChar to be parsed. 
*   CUR     returns the current xmlChar value, i.e. an 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values, otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare on ASCII based substrings.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare on ASCII based substrings.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
291 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly 292 */ 293 294 #define UPPER (toupper(*ctxt->input->cur)) 295 296 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val) 297 298 #define NXT(val) ctxt->input->cur[(val)] 299 300 #define UPP(val) (toupper(ctxt->input->cur[(val)])) 301 302 #define CUR_PTR ctxt->input->cur 303 304 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \ 305 (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \ 306 xmlParserInputShrink(ctxt->input) 307 308 #define GROW if ((ctxt->progressive == 0) && \ 309 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \ 310 xmlParserInputGrow(ctxt->input, INPUT_CHUNK) 311 312 #define CURRENT ((int) (*ctxt->input->cur)) 313 314 #define SKIP_BLANKS htmlSkipBlankChars(ctxt) 315 316 /* Inported from XML */ 317 318 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */ 319 #define CUR ((int) (*ctxt->input->cur)) 320 #define NEXT xmlNextChar(ctxt) 321 322 #define RAW (ctxt->token ? -1 : (*ctxt->input->cur)) 323 324 325 #define NEXTL(l) do { \ 326 if (*(ctxt->input->cur) == '\n') { \ 327 ctxt->input->line++; ctxt->input->col = 1; \ 328 } else ctxt->input->col++; \ 329 ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \ 330 } while (0) 331 332 /************ 333 \ 334 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \ 335 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt); 336 ************/ 337 338 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l) 339 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l) 340 341 #define COPY_BUF(l,b,i,v) \ 342 if (l == 1) b[i++] = (xmlChar) v; \ 343 else i += xmlCopyChar(l,&b[i],v) 344 345 /** 346 * htmlFindEncoding: 347 * @the HTML parser context 348 * 349 * Ty to find and encoding in the current data available in the input 350 * buffer this is needed to try to switch to the proper encoding when 351 * one face a character error. 
352 * That's an heuristic, since it's operating outside of parsing it could 353 * try to use a meta which had been commented out, that's the reason it 354 * should only be used in case of error, not as a default. 355 * 356 * Returns an encoding string or NULL if not found, the string need to 357 * be freed 358 */ 359 static xmlChar * 360 htmlFindEncoding(xmlParserCtxtPtr ctxt) { 361 const xmlChar *start, *cur, *end; 362 363 if ((ctxt == NULL) || (ctxt->input == NULL) || 364 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) || 365 (ctxt->input->buf->encoder != NULL)) 366 return(NULL); 367 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL)) 368 return(NULL); 369 370 start = ctxt->input->cur; 371 end = ctxt->input->end; 372 /* we also expect the input buffer to be zero terminated */ 373 if (*end != 0) 374 return(NULL); 375 376 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV"); 377 if (cur == NULL) 378 return(NULL); 379 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT"); 380 if (cur == NULL) 381 return(NULL); 382 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET="); 383 if (cur == NULL) 384 return(NULL); 385 cur += 8; 386 start = cur; 387 while (((*cur >= 'A') && (*cur <= 'Z')) || 388 ((*cur >= 'a') && (*cur <= 'z')) || 389 ((*cur >= '0') && (*cur <= '9')) || 390 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/')) 391 cur++; 392 if (cur == start) 393 return(NULL); 394 return(xmlStrndup(start, cur - start)); 395 } 396 397 /** 398 * htmlCurrentChar: 399 * @ctxt: the HTML parser context 400 * @len: pointer to the length of the char read 401 * 402 * The current char value, if using UTF-8 this may actually span multiple 403 * bytes in the input buffer. Implement the end of line normalization: 404 * 2.11 End-of-Line Handling 405 * If the encoding is unspecified, in the case we find an ISO-Latin-1 406 * char, then the encoding converter is plugged in automatically. 
407 * 408 * Returns the current char value and its length 409 */ 410 411 static int 412 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { 413 if (ctxt->instate == XML_PARSER_EOF) 414 return(0); 415 416 if (ctxt->token != 0) { 417 *len = 0; 418 return(ctxt->token); 419 } 420 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { 421 /* 422 * We are supposed to handle UTF8, check it's valid 423 * From rfc2044: encoding of the Unicode values on UTF-8: 424 * 425 * UCS-4 range (hex.) UTF-8 octet sequence (binary) 426 * 0000 0000-0000 007F 0xxxxxxx 427 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx 428 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 429 * 430 * Check for the 0x110000 limit too 431 */ 432 const unsigned char *cur = ctxt->input->cur; 433 unsigned char c; 434 unsigned int val; 435 436 c = *cur; 437 if (c & 0x80) { 438 if (cur[1] == 0) { 439 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 440 cur = ctxt->input->cur; 441 } 442 if ((cur[1] & 0xc0) != 0x80) 443 goto encoding_error; 444 if ((c & 0xe0) == 0xe0) { 445 446 if (cur[2] == 0) { 447 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 448 cur = ctxt->input->cur; 449 } 450 if ((cur[2] & 0xc0) != 0x80) 451 goto encoding_error; 452 if ((c & 0xf0) == 0xf0) { 453 if (cur[3] == 0) { 454 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 455 cur = ctxt->input->cur; 456 } 457 if (((c & 0xf8) != 0xf0) || 458 ((cur[3] & 0xc0) != 0x80)) 459 goto encoding_error; 460 /* 4-byte code */ 461 *len = 4; 462 val = (cur[0] & 0x7) << 18; 463 val |= (cur[1] & 0x3f) << 12; 464 val |= (cur[2] & 0x3f) << 6; 465 val |= cur[3] & 0x3f; 466 } else { 467 /* 3-byte code */ 468 *len = 3; 469 val = (cur[0] & 0xf) << 12; 470 val |= (cur[1] & 0x3f) << 6; 471 val |= cur[2] & 0x3f; 472 } 473 } else { 474 /* 2-byte code */ 475 *len = 2; 476 val = (cur[0] & 0x1f) << 6; 477 val |= cur[1] & 0x3f; 478 } 479 if (!IS_CHAR(val)) { 480 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 481 "Char 0x%X out of allowed range\n", val); 482 } 483 return(val); 484 } else { 485 if 
((*ctxt->input->cur == 0) && 486 (ctxt->input->cur < ctxt->input->end)) { 487 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 488 "Char 0x%X out of allowed range\n", 0); 489 *len = 1; 490 return(' '); 491 } 492 /* 1-byte code */ 493 *len = 1; 494 return((int) *ctxt->input->cur); 495 } 496 } 497 /* 498 * Assume it's a fixed length encoding (1) with 499 * a compatible encoding for the ASCII set, since 500 * XML constructs only use < 128 chars 501 */ 502 *len = 1; 503 if ((int) *ctxt->input->cur < 0x80) 504 return((int) *ctxt->input->cur); 505 506 /* 507 * Humm this is bad, do an automatic flow conversion 508 */ 509 { 510 xmlChar * guess; 511 xmlCharEncodingHandlerPtr handler; 512 513 guess = htmlFindEncoding(ctxt); 514 if (guess == NULL) { 515 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); 516 } else { 517 if (ctxt->input->encoding != NULL) 518 xmlFree((xmlChar *) ctxt->input->encoding); 519 ctxt->input->encoding = guess; 520 handler = xmlFindCharEncodingHandler((const char *) guess); 521 if (handler != NULL) { 522 xmlSwitchToEncoding(ctxt, handler); 523 } else { 524 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 525 "Unsupported encoding %s", guess, NULL); 526 } 527 } 528 ctxt->charset = XML_CHAR_ENCODING_UTF8; 529 } 530 531 return(xmlCurrentChar(ctxt, len)); 532 533 encoding_error: 534 /* 535 * If we detect an UTF8 error that probably mean that the 536 * input encoding didn't get properly advertized in the 537 * declaration header. Report the error and switch the encoding 538 * to ISO-Latin-1 (if you don't like this policy, just declare the 539 * encoding !) 
540 */ 541 { 542 char buffer[150]; 543 544 if (ctxt->input->end - ctxt->input->cur >= 4) { 545 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", 546 ctxt->input->cur[0], ctxt->input->cur[1], 547 ctxt->input->cur[2], ctxt->input->cur[3]); 548 } else { 549 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]); 550 } 551 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 552 "Input is not proper UTF-8, indicate encoding !\n", 553 BAD_CAST buffer, NULL); 554 } 555 556 ctxt->charset = XML_CHAR_ENCODING_8859_1; 557 *len = 1; 558 return((int) *ctxt->input->cur); 559 } 560 561 /** 562 * htmlSkipBlankChars: 563 * @ctxt: the HTML parser context 564 * 565 * skip all blanks character found at that point in the input streams. 566 * 567 * Returns the number of space chars skipped 568 */ 569 570 static int 571 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) { 572 int res = 0; 573 574 while (IS_BLANK_CH(*(ctxt->input->cur))) { 575 if ((*ctxt->input->cur == 0) && 576 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) { 577 xmlPopInput(ctxt); 578 } else { 579 if (*(ctxt->input->cur) == '\n') { 580 ctxt->input->line++; ctxt->input->col = 1; 581 } else ctxt->input->col++; 582 ctxt->input->cur++; 583 ctxt->nbChars++; 584 if (*ctxt->input->cur == 0) 585 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 586 } 587 res++; 588 } 589 return(res); 590 } 591 592 593 594 /************************************************************************ 595 * * 596 * The list of HTML elements and their properties * 597 * * 598 ************************************************************************/ 599 600 /* 601 * Start Tag: 1 means the start tag can be ommited 602 * End Tag: 1 means the end tag can be ommited 603 * 2 means it's forbidden (empty elements) 604 * 3 means the tag is stylistic and should be closed easily 605 * Depr: this element is deprecated 606 * DTD: 1 means that this element is valid only in the Loose DTD 607 * 2 means that this element is valid only in the Frameset DTD 608 * 609 
* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description 610 , subElements , impliedsubelt , Attributes, userdata 611 */ 612 613 /* Definitions and a couple of vars for HTML Elements */ 614 615 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small" 616 #define NB_FONTSTYLE 8 617 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym" 618 #define NB_PHRASE 10 619 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe" 620 #define NB_SPECIAL 16 621 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL 622 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL 623 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address" 624 #define NB_BLOCK NB_HEADING + NB_LIST + 14 625 #define FORMCTRL "input", "select", "textarea", "label", "button" 626 #define NB_FORMCTRL 5 627 #define PCDATA 628 #define NB_PCDATA 0 629 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6" 630 #define NB_HEADING 6 631 #define LIST "ul", "ol", "dir", "menu" 632 #define NB_LIST 4 633 #define MODIFIER 634 #define NB_MODIFIER 0 635 #define FLOW BLOCK,INLINE 636 #define NB_FLOW NB_BLOCK + NB_INLINE 637 #define EMPTY NULL 638 639 640 static const char* const html_flow[] = { FLOW, NULL } ; 641 static const char* const html_inline[] = { INLINE, NULL } ; 642 643 /* placeholders: elts with content but no subelements */ 644 static const char* const html_pcdata[] = { NULL } ; 645 #define html_cdata html_pcdata 646 647 648 /* ... 
and for HTML Attributes */ 649 650 #define COREATTRS "id", "class", "style", "title" 651 #define NB_COREATTRS 4 652 #define I18N "lang", "dir" 653 #define NB_I18N 2 654 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup" 655 #define NB_EVENTS 9 656 #define ATTRS COREATTRS,I18N,EVENTS 657 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS 658 #define CELLHALIGN "align", "char", "charoff" 659 #define NB_CELLHALIGN 3 660 #define CELLVALIGN "valign" 661 #define NB_CELLVALIGN 1 662 663 static const char* const html_attrs[] = { ATTRS, NULL } ; 664 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ; 665 static const char* const core_attrs[] = { COREATTRS, NULL } ; 666 static const char* const i18n_attrs[] = { I18N, NULL } ; 667 668 669 /* Other declarations that should go inline ... */ 670 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name", 671 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords", 672 "tabindex", "onfocus", "onblur", NULL } ; 673 static const char* const target_attr[] = { "target", NULL } ; 674 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ; 675 static const char* const alt_attr[] = { "alt", NULL } ; 676 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ; 677 static const char* const href_attrs[] = { "href", NULL } ; 678 static const char* const clear_attrs[] = { "clear", NULL } ; 679 static const char* const inline_p[] = { INLINE, "p", NULL } ; 680 681 static const char* const flow_param[] = { FLOW, "param", NULL } ; 682 static const char* const applet_attrs[] = { COREATTRS , "codebase", 683 "archive", "alt", "name", "height", "width", "align", 684 "hspace", "vspace", NULL } ; 685 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref", 686 "tabindex", "accesskey", "onfocus", "onblur", NULL } ; 687 static const char* const basefont_attrs[] = 688 { "id", 
"size", "color", "face", NULL } ; 689 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ; 690 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ; 691 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ; 692 static const char* const body_depr[] = { "background", "bgcolor", "text", 693 "link", "vlink", "alink", NULL } ; 694 static const char* const button_attrs[] = { ATTRS, "name", "value", "type", 695 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ; 696 697 698 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ; 699 static const char* const col_elt[] = { "col", NULL } ; 700 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ; 701 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ; 702 static const char* const dl_contents[] = { "dt", "dd", NULL } ; 703 static const char* const compact_attr[] = { "compact", NULL } ; 704 static const char* const label_attr[] = { "label", NULL } ; 705 static const char* const fieldset_contents[] = { FLOW, "legend" } ; 706 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ; 707 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ; 708 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ; 709 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ; 710 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ; 711 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ; 712 static const char* const head_attrs[] = { I18N, 
"profile", NULL } ; 713 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ; 714 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ; 715 static const char* const version_attr[] = { "version", NULL } ; 716 static const char* const html_content[] = { "head", "body", "frameset", NULL } ; 717 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ; 718 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ; 719 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ; 720 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ; 721 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ; 722 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ; 723 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ; 724 static const char* const align_attr[] = { "align", NULL } ; 725 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ; 726 static const char* const map_contents[] = { BLOCK, "area", NULL } ; 727 static const char* const name_attr[] = { "name", NULL } ; 728 static const char* const action_attr[] = { "action", NULL } ; 729 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ; 730 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", 
"scheme", NULL } ; 731 static const char* const content_attr[] = { "content", NULL } ; 732 static const char* const type_attr[] = { "type", NULL } ; 733 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ; 734 static const char* const object_contents[] = { FLOW, "param", NULL } ; 735 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ; 736 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ; 737 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ; 738 static const char* const option_elt[] = { "option", NULL } ; 739 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ; 740 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ; 741 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ; 742 static const char* const width_attr[] = { "width", NULL } ; 743 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ; 744 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ; 745 static const char* const language_attr[] = { "language", NULL } ; 746 static const char* const select_content[] = { "optgroup", "option", NULL } ; 747 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ; 748 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ; 749 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ; 750 static const char* const table_depr[] = { "align", "bgcolor", NULL } ; 751 static const char* const 
table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ; 752 static const char* const tr_elt[] = { "tr", NULL } ; 753 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ; 754 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ; 755 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ; 756 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ; 757 static const char* const tr_contents[] = { "th", "td", NULL } ; 758 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ; 759 static const char* const li_elt[] = { "li", NULL } ; 760 static const char* const ul_depr[] = { "type", "compact", NULL} ; 761 static const char* const dir_attr[] = { "dir", NULL} ; 762 763 #define DECL (const char**) 764 765 static const htmlElemDesc 766 html40ElementTable[] = { 767 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ", 768 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL 769 }, 770 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form", 771 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 772 }, 773 { "acronym", 0, 0, 0, 0, 0, 0, 1, "", 774 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 775 }, 776 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ", 777 DECL inline_p , NULL , DECL html_attrs, NULL, NULL 778 }, 779 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ", 780 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL 781 }, 782 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ", 783 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr 784 }, 785 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style", 786 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 787 }, 788 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ", 789 EMPTY , NULL , NULL , 
DECL target_attr, DECL href_attrs 790 }, 791 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " , 792 EMPTY , NULL , NULL, DECL basefont_attrs, NULL 793 }, 794 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ", 795 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr 796 }, 797 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style", 798 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 799 }, 800 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ", 801 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL 802 }, 803 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ", 804 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL 805 }, 806 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ", 807 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL 808 }, 809 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ", 810 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL 811 }, 812 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ", 813 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 814 }, 815 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ", 816 DECL html_flow , NULL , NULL, DECL html_attrs, NULL 817 }, 818 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation", 819 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 820 }, 821 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment", 822 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 823 }, 824 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ", 825 EMPTY , NULL , DECL col_attrs , NULL, NULL 826 }, 827 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ", 828 DECL col_elt , "col" , DECL col_attrs , NULL, NULL 829 }, 830 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ", 831 DECL html_flow , NULL , DECL html_attrs, NULL, NULL 832 }, 833 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ", 834 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL 835 }, 836 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition", 837 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 838 
}, 839 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list", 840 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL 841 }, 842 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container", 843 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL 844 }, 845 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ", 846 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL 847 }, 848 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ", 849 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 850 }, 851 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis", 852 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 853 }, 854 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ", 855 EMPTY, NULL, DECL embed_attrs, NULL, NULL 856 }, 857 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ", 858 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL 859 }, 860 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ", 861 DECL html_inline, NULL, NULL, DECL font_attrs, NULL 862 }, 863 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ", 864 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr 865 }, 866 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " , 867 EMPTY, NULL, NULL, DECL frame_attrs, NULL 868 }, 869 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" , 870 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL 871 }, 872 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ", 873 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 874 }, 875 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ", 876 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 877 }, 878 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ", 879 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 880 }, 881 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ", 882 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 883 }, 884 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ", 885 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 886 }, 887 { "h6", 0, 
0, 0, 0, 0, 0, 0, "heading ", 888 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 889 }, 890 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ", 891 DECL head_contents, NULL, DECL head_attrs, NULL, NULL 892 }, 893 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " , 894 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL 895 }, 896 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ", 897 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL 898 }, 899 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style", 900 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 901 }, 902 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ", 903 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL 904 }, 905 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ", 906 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs 907 }, 908 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ", 909 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL 910 }, 911 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text", 912 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL 913 }, 914 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ", 915 EMPTY, NULL, NULL, DECL prompt_attrs, NULL 916 }, 917 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user", 918 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 919 }, 920 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ", 921 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL 922 }, 923 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ", 924 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL 925 }, 926 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ", 927 DECL html_flow, NULL, DECL html_attrs, NULL, NULL 928 }, 929 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ", 930 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL 931 }, 932 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ", 933 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr 934 }, 935 { "menu", 0, 0, 0, 0, 1, 1, 0, 
"menu list ", 936 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL 937 }, 938 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ", 939 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr 940 }, 941 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ", 942 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL 943 }, 944 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ", 945 DECL html_flow, "div", DECL html_attrs, NULL, NULL 946 }, 947 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ", 948 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL 949 }, 950 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ", 951 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL 952 }, 953 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ", 954 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr 955 }, 956 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " , 957 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL 958 }, 959 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ", 960 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 961 }, 962 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ", 963 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr 964 }, 965 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ", 966 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL 967 }, 968 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ", 969 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL 970 }, 971 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style", 972 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 973 }, 974 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.", 975 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 976 }, 977 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ", 978 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr 979 }, 980 { "select", 
0, 0, 0, 0, 0, 0, 1, "option selector ", 981 DECL select_content, NULL, DECL select_attrs, NULL, NULL 982 }, 983 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style", 984 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 985 }, 986 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ", 987 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 988 }, 989 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text", 990 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 991 }, 992 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis", 993 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 994 }, 995 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ", 996 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr 997 }, 998 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript", 999 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1000 }, 1001 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ", 1002 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1003 }, 1004 { "table", 0, 0, 0, 0, 0, 0, 0, "", 1005 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL 1006 }, 1007 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ", 1008 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1009 }, 1010 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell", 1011 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 1012 }, 1013 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ", 1014 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr 1015 }, 1016 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ", 1017 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1018 }, 1019 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell", 1020 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 1021 }, 1022 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ", 1023 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1024 }, 1025 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ", 1026 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL 1027 }, 1028 { "tr", 0, 0, 0, 0, 0, 0, 0, "table 
row ", 1029 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL 1030 }, 1031 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style", 1032 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1033 }, 1034 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style", 1035 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 1036 }, 1037 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ", 1038 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL 1039 }, 1040 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument", 1041 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1042 } 1043 }; 1044 1045 /* 1046 * start tags that imply the end of current element 1047 */ 1048 static const char * const htmlStartClose[] = { 1049 "form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", 1050 "dl", "ul", "ol", "menu", "dir", "address", "pre", 1051 "listing", "xmp", "head", NULL, 1052 "head", "p", NULL, 1053 "title", "p", NULL, 1054 "body", "head", "style", "link", "title", "p", NULL, 1055 "frameset", "head", "style", "link", "title", "p", NULL, 1056 "li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", 1057 "pre", "listing", "xmp", "head", "li", NULL, 1058 "hr", "p", "head", NULL, 1059 "h1", "p", "head", NULL, 1060 "h2", "p", "head", NULL, 1061 "h3", "p", "head", NULL, 1062 "h4", "p", "head", NULL, 1063 "h5", "p", "head", NULL, 1064 "h6", "p", "head", NULL, 1065 "dir", "p", "head", NULL, 1066 "address", "p", "head", "ul", NULL, 1067 "pre", "p", "head", "ul", NULL, 1068 "listing", "p", "head", NULL, 1069 "xmp", "p", "head", NULL, 1070 "blockquote", "p", "head", NULL, 1071 "dl", "p", "dt", "menu", "dir", "address", "pre", "listing", 1072 "xmp", "head", NULL, 1073 "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp", 1074 "head", "dd", NULL, 1075 "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp", 1076 "head", "dt", NULL, 1077 "ul", "p", "head", "ol", "menu", "dir", "address", "pre", 1078 "listing", "xmp", NULL, 1079 
"ol", "p", "head", "ul", NULL, 1080 "menu", "p", "head", "ul", NULL, 1081 "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL, 1082 "div", "p", "head", NULL, 1083 "noscript", "p", "head", NULL, 1084 "center", "font", "b", "i", "p", "head", NULL, 1085 "a", "a", NULL, 1086 "caption", "p", NULL, 1087 "colgroup", "caption", "colgroup", "col", "p", NULL, 1088 "col", "caption", "col", "p", NULL, 1089 "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", 1090 "listing", "xmp", "a", NULL, 1091 "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 1092 "td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 1093 "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL, 1094 "thead", "caption", "col", "colgroup", NULL, 1095 "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead", 1096 "tbody", "p", NULL, 1097 "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead", 1098 "tfoot", "tbody", "p", NULL, 1099 "optgroup", "option", NULL, 1100 "option", "option", NULL, 1101 "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", 1102 "pre", "listing", "xmp", "a", NULL, 1103 NULL 1104 }; 1105 1106 /* 1107 * The list of HTML elements which are supposed not to have 1108 * CDATA content and where a p element will be implied 1109 * 1110 * TODO: extend that list by reading the HTML SGML DTD on 1111 * implied paragraph 1112 */ 1113 static const char *const htmlNoContentElements[] = { 1114 "html", 1115 "head", 1116 NULL 1117 }; 1118 1119 /* 1120 * The list of HTML attributes which are of content %Script; 1121 * NOTE: when adding ones, check htmlIsScriptAttribute() since 1122 * it assumes the name starts with 'on' 1123 */ 1124 static const char *const htmlScriptAttributes[] = { 1125 "onclick", 1126 "ondblclick", 1127 "onmousedown", 1128 "onmouseup", 1129 "onmouseover", 1130 "onmousemove", 1131 "onmouseout", 1132 "onkeypress", 1133 "onkeydown", 1134 "onkeyup", 1135 "onload", 1136 "onunload", 1137 
"onfocus", 1138 "onblur", 1139 "onsubmit", 1140 "onrest", 1141 "onchange", 1142 "onselect" 1143 }; 1144 1145 /* 1146 * This table is used by the htmlparser to know what to do with 1147 * broken html pages. By assigning different priorities to different 1148 * elements the parser can decide how to handle extra endtags. 1149 * Endtags are only allowed to close elements with lower or equal 1150 * priority. 1151 */ 1152 1153 typedef struct { 1154 const char *name; 1155 int priority; 1156 } elementPriority; 1157 1158 static const elementPriority htmlEndPriority[] = { 1159 {"div", 150}, 1160 {"td", 160}, 1161 {"th", 160}, 1162 {"tr", 170}, 1163 {"thead", 180}, 1164 {"tbody", 180}, 1165 {"tfoot", 180}, 1166 {"table", 190}, 1167 {"head", 200}, 1168 {"body", 200}, 1169 {"html", 220}, 1170 {NULL, 100} /* Default priority */ 1171 }; 1172 1173 static const char** htmlStartCloseIndex[100]; 1174 static int htmlStartCloseIndexinitialized = 0; 1175 1176 /************************************************************************ 1177 * * 1178 * functions to handle HTML specific data * 1179 * * 1180 ************************************************************************/ 1181 1182 /** 1183 * htmlInitAutoClose: 1184 * 1185 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1186 * This is not reentrant. Call xmlInitParser() once before processing in 1187 * case of use in multithreaded programs. 
1188 */ 1189 void 1190 htmlInitAutoClose(void) { 1191 int indx, i = 0; 1192 1193 if (htmlStartCloseIndexinitialized) return; 1194 1195 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL; 1196 indx = 0; 1197 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) { 1198 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i]; 1199 while (htmlStartClose[i] != NULL) i++; 1200 i++; 1201 } 1202 htmlStartCloseIndexinitialized = 1; 1203 } 1204 1205 /** 1206 * htmlTagLookup: 1207 * @tag: The tag name in lowercase 1208 * 1209 * Lookup the HTML tag in the ElementTable 1210 * 1211 * Returns the related htmlElemDescPtr or NULL if not found. 1212 */ 1213 const htmlElemDesc * 1214 htmlTagLookup(const xmlChar *tag) { 1215 unsigned int i; 1216 1217 for (i = 0; i < (sizeof(html40ElementTable) / 1218 sizeof(html40ElementTable[0]));i++) { 1219 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name)) 1220 return((htmlElemDescPtr) &html40ElementTable[i]); 1221 } 1222 return(NULL); 1223 } 1224 1225 /** 1226 * htmlGetEndPriority: 1227 * @name: The name of the element to look up the priority for. 1228 * 1229 * Return value: The "endtag" priority. 1230 **/ 1231 static int 1232 htmlGetEndPriority (const xmlChar *name) { 1233 int i = 0; 1234 1235 while ((htmlEndPriority[i].name != NULL) && 1236 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name))) 1237 i++; 1238 1239 return(htmlEndPriority[i].priority); 1240 } 1241 1242 1243 /** 1244 * htmlCheckAutoClose: 1245 * @newtag: The new tag name 1246 * @oldtag: The old tag name 1247 * 1248 * Checks whether the new tag is one of the registered valid tags for 1249 * closing old. 1250 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1251 * 1252 * Returns 0 if no, 1 if yes. 
1253 */ 1254 static int 1255 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag) 1256 { 1257 int i, indx; 1258 const char **closed = NULL; 1259 1260 if (htmlStartCloseIndexinitialized == 0) 1261 htmlInitAutoClose(); 1262 1263 /* inefficient, but not a big deal */ 1264 for (indx = 0; indx < 100; indx++) { 1265 closed = htmlStartCloseIndex[indx]; 1266 if (closed == NULL) 1267 return (0); 1268 if (xmlStrEqual(BAD_CAST * closed, newtag)) 1269 break; 1270 } 1271 1272 i = closed - htmlStartClose; 1273 i++; 1274 while (htmlStartClose[i] != NULL) { 1275 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) { 1276 return (1); 1277 } 1278 i++; 1279 } 1280 return (0); 1281 } 1282 1283 /** 1284 * htmlAutoCloseOnClose: 1285 * @ctxt: an HTML parser context 1286 * @newtag: The new tag name 1287 * @force: force the tag closure 1288 * 1289 * The HTML DTD allows an ending tag to implicitly close other tags. 1290 */ 1291 static void 1292 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1293 { 1294 const htmlElemDesc *info; 1295 int i, priority; 1296 1297 priority = htmlGetEndPriority(newtag); 1298 1299 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1300 1301 if (xmlStrEqual(newtag, ctxt->nameTab[i])) 1302 break; 1303 /* 1304 * A missplaced endtag can only close elements with lower 1305 * or equal priority, so if we find an element with higher 1306 * priority before we find an element with 1307 * matching name, we just ignore this endtag 1308 */ 1309 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority) 1310 return; 1311 } 1312 if (i < 0) 1313 return; 1314 1315 while (!xmlStrEqual(newtag, ctxt->name)) { 1316 info = htmlTagLookup(ctxt->name); 1317 if ((info != NULL) && (info->endTag == 3)) { 1318 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 1319 "Opening and ending tag mismatch: %s and %s\n", 1320 newtag, ctxt->name); 1321 } 1322 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1323 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1324 
htmlnamePop(ctxt); 1325 } 1326 } 1327 1328 /** 1329 * htmlAutoCloseOnEnd: 1330 * @ctxt: an HTML parser context 1331 * 1332 * Close all remaining tags at the end of the stream 1333 */ 1334 static void 1335 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) 1336 { 1337 int i; 1338 1339 if (ctxt->nameNr == 0) 1340 return; 1341 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1342 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1343 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1344 htmlnamePop(ctxt); 1345 } 1346 } 1347 1348 /** 1349 * htmlAutoClose: 1350 * @ctxt: an HTML parser context 1351 * @newtag: The new tag name or NULL 1352 * 1353 * The HTML DTD allows a tag to implicitly close other tags. 1354 * The list is kept in htmlStartClose array. This function is 1355 * called when a new tag has been detected and generates the 1356 * appropriates closes if possible/needed. 1357 * If newtag is NULL this mean we are at the end of the resource 1358 * and we should check 1359 */ 1360 static void 1361 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1362 { 1363 while ((newtag != NULL) && (ctxt->name != NULL) && 1364 (htmlCheckAutoClose(newtag, ctxt->name))) { 1365 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1366 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1367 htmlnamePop(ctxt); 1368 } 1369 if (newtag == NULL) { 1370 htmlAutoCloseOnEnd(ctxt); 1371 return; 1372 } 1373 while ((newtag == NULL) && (ctxt->name != NULL) && 1374 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) || 1375 (xmlStrEqual(ctxt->name, BAD_CAST "body")) || 1376 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) { 1377 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1378 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1379 htmlnamePop(ctxt); 1380 } 1381 } 1382 1383 /** 1384 * htmlAutoCloseTag: 1385 * @doc: the HTML document 1386 * @name: The tag name 1387 * @elem: the HTML element 1388 * 1389 * The HTML DTD allows a tag to implicitly close other tags. 
1390 * The list is kept in htmlStartClose array. This function checks 1391 * if the element or one of it's children would autoclose the 1392 * given tag. 1393 * 1394 * Returns 1 if autoclose, 0 otherwise 1395 */ 1396 int 1397 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) { 1398 htmlNodePtr child; 1399 1400 if (elem == NULL) return(1); 1401 if (xmlStrEqual(name, elem->name)) return(0); 1402 if (htmlCheckAutoClose(elem->name, name)) return(1); 1403 child = elem->children; 1404 while (child != NULL) { 1405 if (htmlAutoCloseTag(doc, name, child)) return(1); 1406 child = child->next; 1407 } 1408 return(0); 1409 } 1410 1411 /** 1412 * htmlIsAutoClosed: 1413 * @doc: the HTML document 1414 * @elem: the HTML element 1415 * 1416 * The HTML DTD allows a tag to implicitly close other tags. 1417 * The list is kept in htmlStartClose array. This function checks 1418 * if a tag is autoclosed by one of it's child 1419 * 1420 * Returns 1 if autoclosed, 0 otherwise 1421 */ 1422 int 1423 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) { 1424 htmlNodePtr child; 1425 1426 if (elem == NULL) return(1); 1427 child = elem->children; 1428 while (child != NULL) { 1429 if (htmlAutoCloseTag(doc, elem->name, child)) return(1); 1430 child = child->next; 1431 } 1432 return(0); 1433 } 1434 1435 /** 1436 * htmlCheckImplied: 1437 * @ctxt: an HTML parser context 1438 * @newtag: The new tag name 1439 * 1440 * The HTML DTD allows a tag to exists only implicitly 1441 * called when a new tag has been detected and generates the 1442 * appropriates implicit tags if missing 1443 */ 1444 static void 1445 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { 1446 int i; 1447 1448 if (ctxt->options & HTML_PARSE_NOIMPLIED) 1449 return; 1450 if (!htmlOmittedDefaultValue) 1451 return; 1452 if (xmlStrEqual(newtag, BAD_CAST"html")) 1453 return; 1454 if (ctxt->nameNr <= 0) { 1455 htmlnamePush(ctxt, BAD_CAST"html"); 1456 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != 
NULL)) 1457 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL); 1458 } 1459 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head"))) 1460 return; 1461 if ((ctxt->nameNr <= 1) && 1462 ((xmlStrEqual(newtag, BAD_CAST"script")) || 1463 (xmlStrEqual(newtag, BAD_CAST"style")) || 1464 (xmlStrEqual(newtag, BAD_CAST"meta")) || 1465 (xmlStrEqual(newtag, BAD_CAST"link")) || 1466 (xmlStrEqual(newtag, BAD_CAST"title")) || 1467 (xmlStrEqual(newtag, BAD_CAST"base")))) { 1468 if (ctxt->html >= 3) { 1469 /* we already saw or generated an <head> before */ 1470 return; 1471 } 1472 /* 1473 * dropped OBJECT ... i you put it first BODY will be 1474 * assumed ! 1475 */ 1476 htmlnamePush(ctxt, BAD_CAST"head"); 1477 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1478 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL); 1479 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) && 1480 (!xmlStrEqual(newtag, BAD_CAST"frame")) && 1481 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) { 1482 if (ctxt->html >= 10) { 1483 /* we already saw or generated a <body> before */ 1484 return; 1485 } 1486 for (i = 0;i < ctxt->nameNr;i++) { 1487 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) { 1488 return; 1489 } 1490 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) { 1491 return; 1492 } 1493 } 1494 1495 htmlnamePush(ctxt, BAD_CAST"body"); 1496 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1497 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL); 1498 } 1499 } 1500 1501 /** 1502 * htmlCheckParagraph 1503 * @ctxt: an HTML parser context 1504 * 1505 * Check whether a p element need to be implied before inserting 1506 * characters in the current element. 1507 * 1508 * Returns 1 if a paragraph has been inserted, 0 if not and -1 1509 * in case of error. 
1510 */ 1511 1512 static int 1513 htmlCheckParagraph(htmlParserCtxtPtr ctxt) { 1514 const xmlChar *tag; 1515 int i; 1516 1517 if (ctxt == NULL) 1518 return(-1); 1519 tag = ctxt->name; 1520 if (tag == NULL) { 1521 htmlAutoClose(ctxt, BAD_CAST"p"); 1522 htmlCheckImplied(ctxt, BAD_CAST"p"); 1523 htmlnamePush(ctxt, BAD_CAST"p"); 1524 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1525 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1526 return(1); 1527 } 1528 if (!htmlOmittedDefaultValue) 1529 return(0); 1530 for (i = 0; htmlNoContentElements[i] != NULL; i++) { 1531 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) { 1532 htmlAutoClose(ctxt, BAD_CAST"p"); 1533 htmlCheckImplied(ctxt, BAD_CAST"p"); 1534 htmlnamePush(ctxt, BAD_CAST"p"); 1535 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1536 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1537 return(1); 1538 } 1539 } 1540 return(0); 1541 } 1542 1543 /** 1544 * htmlIsScriptAttribute: 1545 * @name: an attribute name 1546 * 1547 * Check if an attribute is of content type Script 1548 * 1549 * Returns 1 is the attribute is a script 0 otherwise 1550 */ 1551 int 1552 htmlIsScriptAttribute(const xmlChar *name) { 1553 unsigned int i; 1554 1555 if (name == NULL) 1556 return(0); 1557 /* 1558 * all script attributes start with 'on' 1559 */ 1560 if ((name[0] != 'o') || (name[1] != 'n')) 1561 return(0); 1562 for (i = 0; 1563 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]); 1564 i++) { 1565 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i])) 1566 return(1); 1567 } 1568 return(0); 1569 } 1570 1571 /************************************************************************ 1572 * * 1573 * The list of HTML predefined entities * 1574 * * 1575 ************************************************************************/ 1576 1577 1578 static const htmlEntityDesc html40EntitiesTable[] = { 1579 /* 1580 * the 4 absolute ones, plus apostrophe. 
1581 */ 1582 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" }, 1583 { 38, "amp", "ampersand, U+0026 ISOnum" }, 1584 { 39, "apos", "single quote" }, 1585 { 60, "lt", "less-than sign, U+003C ISOnum" }, 1586 { 62, "gt", "greater-than sign, U+003E ISOnum" }, 1587 1588 /* 1589 * A bunch still in the 128-255 range 1590 * Replacing them depend really on the charset used. 1591 */ 1592 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" }, 1593 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" }, 1594 { 162, "cent", "cent sign, U+00A2 ISOnum" }, 1595 { 163, "pound","pound sign, U+00A3 ISOnum" }, 1596 { 164, "curren","currency sign, U+00A4 ISOnum" }, 1597 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" }, 1598 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" }, 1599 { 167, "sect", "section sign, U+00A7 ISOnum" }, 1600 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" }, 1601 { 169, "copy", "copyright sign, U+00A9 ISOnum" }, 1602 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" }, 1603 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" }, 1604 { 172, "not", "not sign, U+00AC ISOnum" }, 1605 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" }, 1606 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" }, 1607 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" }, 1608 { 176, "deg", "degree sign, U+00B0 ISOnum" }, 1609 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" }, 1610 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" }, 1611 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" }, 1612 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" }, 1613 { 181, "micro","micro sign, U+00B5 ISOnum" }, 1614 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" }, 1615 { 183, "middot","middle dot 
= Georgian comma Greek middle dot, U+00B7 ISOnum" }, 1616 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" }, 1617 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" }, 1618 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" }, 1619 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" }, 1620 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" }, 1621 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" }, 1622 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" }, 1623 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" }, 1624 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" }, 1625 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" }, 1626 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" }, 1627 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" }, 1628 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" }, 1629 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" }, 1630 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" }, 1631 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" }, 1632 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" }, 1633 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" }, 1634 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" }, 1635 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" }, 1636 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" }, 1637 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" }, 1638 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" }, 1639 { 207, "Iuml", 
"latin capital letter I with diaeresis, U+00CF ISOlat1" }, 1640 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" }, 1641 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" }, 1642 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" }, 1643 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" }, 1644 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" }, 1645 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" }, 1646 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" }, 1647 { 215, "times","multiplication sign, U+00D7 ISOnum" }, 1648 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" }, 1649 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" }, 1650 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" }, 1651 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" }, 1652 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" }, 1653 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" }, 1654 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" }, 1655 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" }, 1656 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" }, 1657 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" }, 1658 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" }, 1659 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" }, 1660 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" }, 1661 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" }, 1662 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" }, 1663 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" }, 1664 { 232, "egrave","latin small 
letter e with grave, U+00E8 ISOlat1" }, 1665 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" }, 1666 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" }, 1667 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" }, 1668 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" }, 1669 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" }, 1670 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" }, 1671 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" }, 1672 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" }, 1673 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" }, 1674 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" }, 1675 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" }, 1676 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" }, 1677 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" }, 1678 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" }, 1679 { 247, "divide","division sign, U+00F7 ISOnum" }, 1680 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" }, 1681 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" }, 1682 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" }, 1683 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" }, 1684 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" }, 1685 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" }, 1686 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" }, 1687 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" }, 1688 1689 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" }, 1690 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" }, 1691 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" }, 1692 { 353, 
"scaron","latin small letter s with caron, U+0161 ISOlat2" }, 1693 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" }, 1694 1695 /* 1696 * Anything below should really be kept as entities references 1697 */ 1698 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" }, 1699 1700 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" }, 1701 { 732, "tilde","small tilde, U+02DC ISOdia" }, 1702 1703 { 913, "Alpha","greek capital letter alpha, U+0391" }, 1704 { 914, "Beta", "greek capital letter beta, U+0392" }, 1705 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" }, 1706 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" }, 1707 { 917, "Epsilon","greek capital letter epsilon, U+0395" }, 1708 { 918, "Zeta", "greek capital letter zeta, U+0396" }, 1709 { 919, "Eta", "greek capital letter eta, U+0397" }, 1710 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" }, 1711 { 921, "Iota", "greek capital letter iota, U+0399" }, 1712 { 922, "Kappa","greek capital letter kappa, U+039A" }, 1713 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" }, 1714 { 924, "Mu", "greek capital letter mu, U+039C" }, 1715 { 925, "Nu", "greek capital letter nu, U+039D" }, 1716 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" }, 1717 { 927, "Omicron","greek capital letter omicron, U+039F" }, 1718 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" }, 1719 { 929, "Rho", "greek capital letter rho, U+03A1" }, 1720 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" }, 1721 { 932, "Tau", "greek capital letter tau, U+03A4" }, 1722 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" }, 1723 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" }, 1724 { 935, "Chi", "greek capital letter chi, U+03A7" }, 1725 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" }, 1726 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" }, 1727 1728 { 945, "alpha","greek small letter 
alpha, U+03B1 ISOgrk3" }, 1729 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" }, 1730 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" }, 1731 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" }, 1732 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" }, 1733 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" }, 1734 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" }, 1735 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" }, 1736 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" }, 1737 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" }, 1738 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" }, 1739 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" }, 1740 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" }, 1741 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" }, 1742 { 959, "omicron","greek small letter omicron, U+03BF NEW" }, 1743 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" }, 1744 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" }, 1745 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" }, 1746 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" }, 1747 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" }, 1748 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" }, 1749 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" }, 1750 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" }, 1751 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" }, 1752 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" }, 1753 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" }, 1754 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" }, 1755 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" }, 1756 1757 { 8194, "ensp", "en space, U+2002 ISOpub" }, 1758 { 8195, "emsp", "em space, U+2003 ISOpub" }, 1759 { 8201, "thinsp","thin space, U+2009 ISOpub" }, 1760 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 
2070" }, 1761 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" }, 1762 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" }, 1763 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" }, 1764 { 8211, "ndash","en dash, U+2013 ISOpub" }, 1765 { 8212, "mdash","em dash, U+2014 ISOpub" }, 1766 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" }, 1767 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" }, 1768 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" }, 1769 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" }, 1770 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" }, 1771 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" }, 1772 { 8224, "dagger","dagger, U+2020 ISOpub" }, 1773 { 8225, "Dagger","double dagger, U+2021 ISOpub" }, 1774 1775 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" }, 1776 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" }, 1777 1778 { 8240, "permil","per mille sign, U+2030 ISOtech" }, 1779 1780 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" }, 1781 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" }, 1782 1783 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" }, 1784 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" }, 1785 1786 { 8254, "oline","overline = spacing overscore, U+203E NEW" }, 1787 { 8260, "frasl","fraction slash, U+2044 NEW" }, 1788 1789 { 8364, "euro", "euro sign, U+20AC NEW" }, 1790 1791 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" }, 1792 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" }, 1793 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" }, 1794 { 8482, "trade","trade mark sign, U+2122 ISOnum" }, 1795 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" }, 1796 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" }, 1797 { 
8593, "uarr", "upwards arrow, U+2191 ISOnum" }, 1798 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" }, 1799 { 8595, "darr", "downwards arrow, U+2193 ISOnum" }, 1800 { 8596, "harr", "left right arrow, U+2194 ISOamsa" }, 1801 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" }, 1802 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" }, 1803 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" }, 1804 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" }, 1805 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" }, 1806 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" }, 1807 1808 { 8704, "forall","for all, U+2200 ISOtech" }, 1809 { 8706, "part", "partial differential, U+2202 ISOtech" }, 1810 { 8707, "exist","there exists, U+2203 ISOtech" }, 1811 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" }, 1812 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" }, 1813 { 8712, "isin", "element of, U+2208 ISOtech" }, 1814 { 8713, "notin","not an element of, U+2209 ISOtech" }, 1815 { 8715, "ni", "contains as member, U+220B ISOtech" }, 1816 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" }, 1817 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" }, 1818 { 8722, "minus","minus sign, U+2212 ISOtech" }, 1819 { 8727, "lowast","asterisk operator, U+2217 ISOtech" }, 1820 { 8730, "radic","square root = radical sign, U+221A ISOtech" }, 1821 { 8733, "prop", "proportional to, U+221D ISOtech" }, 1822 { 8734, "infin","infinity, U+221E ISOtech" }, 1823 { 8736, "ang", "angle, U+2220 ISOamso" }, 1824 { 8743, "and", "logical and = wedge, U+2227 ISOtech" }, 1825 { 8744, "or", "logical or = vee, U+2228 ISOtech" }, 1826 { 8745, "cap", "intersection = cap, U+2229 ISOtech" }, 1827 { 8746, "cup", "union = cup, U+222A ISOtech" }, 1828 { 8747, "int", "integral, U+222B ISOtech" }, 1829 { 8756, "there4","therefore, U+2234 ISOtech" }, 1830 { 8764, "sim", "tilde operator = varies with = similar to, 
U+223C ISOtech" }, 1831 { 8773, "cong", "approximately equal to, U+2245 ISOtech" }, 1832 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" }, 1833 { 8800, "ne", "not equal to, U+2260 ISOtech" }, 1834 { 8801, "equiv","identical to, U+2261 ISOtech" }, 1835 { 8804, "le", "less-than or equal to, U+2264 ISOtech" }, 1836 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" }, 1837 { 8834, "sub", "subset of, U+2282 ISOtech" }, 1838 { 8835, "sup", "superset of, U+2283 ISOtech" }, 1839 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" }, 1840 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" }, 1841 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" }, 1842 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" }, 1843 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" }, 1844 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" }, 1845 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" }, 1846 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" }, 1847 { 8969, "rceil","right ceiling, U+2309 ISOamsc" }, 1848 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" }, 1849 { 8971, "rfloor","right floor, U+230B ISOamsc" }, 1850 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" }, 1851 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" }, 1852 { 9674, "loz", "lozenge, U+25CA ISOpub" }, 1853 1854 { 9824, "spades","black spade suit, U+2660 ISOpub" }, 1855 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" }, 1856 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" }, 1857 { 9830, "diams","black diamond suit, U+2666 ISOpub" }, 1858 1859 }; 1860 1861 /************************************************************************ 1862 * * 1863 * Commodity functions to handle entities * 1864 * * 1865 ************************************************************************/ 1866 1867 /* 1868 * Macro used to grow the current buffer. 
1869 */ 1870 #define growBuffer(buffer) { \ 1871 xmlChar *tmp; \ 1872 buffer##_size *= 2; \ 1873 tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \ 1874 if (tmp == NULL) { \ 1875 htmlErrMemory(ctxt, "growing buffer\n"); \ 1876 xmlFree(buffer); \ 1877 return(NULL); \ 1878 } \ 1879 buffer = tmp; \ 1880 } 1881 1882 /** 1883 * htmlEntityLookup: 1884 * @name: the entity name 1885 * 1886 * Lookup the given entity in EntitiesTable 1887 * 1888 * TODO: the linear scan is really ugly, an hash table is really needed. 1889 * 1890 * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 1891 */ 1892 const htmlEntityDesc * 1893 htmlEntityLookup(const xmlChar *name) { 1894 unsigned int i; 1895 1896 for (i = 0;i < (sizeof(html40EntitiesTable)/ 1897 sizeof(html40EntitiesTable[0]));i++) { 1898 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) { 1899 return((htmlEntityDescPtr) &html40EntitiesTable[i]); 1900 } 1901 } 1902 return(NULL); 1903 } 1904 1905 /** 1906 * htmlEntityValueLookup: 1907 * @value: the entity's unicode value 1908 * 1909 * Lookup the given entity in EntitiesTable 1910 * 1911 * TODO: the linear scan is really ugly, an hash table is really needed. 1912 * 1913 * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 
1914 */ 1915 const htmlEntityDesc * 1916 htmlEntityValueLookup(unsigned int value) { 1917 unsigned int i; 1918 1919 for (i = 0;i < (sizeof(html40EntitiesTable)/ 1920 sizeof(html40EntitiesTable[0]));i++) { 1921 if (html40EntitiesTable[i].value >= value) { 1922 if (html40EntitiesTable[i].value > value) 1923 break; 1924 return((htmlEntityDescPtr) &html40EntitiesTable[i]); 1925 } 1926 } 1927 return(NULL); 1928 } 1929 1930 /** 1931 * UTF8ToHtml: 1932 * @out: a pointer to an array of bytes to store the result 1933 * @outlen: the length of @out 1934 * @in: a pointer to an array of UTF-8 chars 1935 * @inlen: the length of @in 1936 * 1937 * Take a block of UTF-8 chars in and try to convert it to an ASCII 1938 * plus HTML entities block of chars out. 1939 * 1940 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 1941 * The value of @inlen after return is the number of octets consumed 1942 * as the return value is positive, else unpredictable. 1943 * The value of @outlen after return is the number of octets consumed. 
1944 */ 1945 int 1946 UTF8ToHtml(unsigned char* out, int *outlen, 1947 const unsigned char* in, int *inlen) { 1948 const unsigned char* processed = in; 1949 const unsigned char* outend; 1950 const unsigned char* outstart = out; 1951 const unsigned char* instart = in; 1952 const unsigned char* inend; 1953 unsigned int c, d; 1954 int trailing; 1955 1956 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1); 1957 if (in == NULL) { 1958 /* 1959 * initialization nothing to do 1960 */ 1961 *outlen = 0; 1962 *inlen = 0; 1963 return(0); 1964 } 1965 inend = in + (*inlen); 1966 outend = out + (*outlen); 1967 while (in < inend) { 1968 d = *in++; 1969 if (d < 0x80) { c= d; trailing= 0; } 1970 else if (d < 0xC0) { 1971 /* trailing byte in leading position */ 1972 *outlen = out - outstart; 1973 *inlen = processed - instart; 1974 return(-2); 1975 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 1976 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 1977 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 1978 else { 1979 /* no chance for this in Ascii */ 1980 *outlen = out - outstart; 1981 *inlen = processed - instart; 1982 return(-2); 1983 } 1984 1985 if (inend - in < trailing) { 1986 break; 1987 } 1988 1989 for ( ; trailing; trailing--) { 1990 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) 1991 break; 1992 c <<= 6; 1993 c |= d & 0x3F; 1994 } 1995 1996 /* assertion: c is a single UTF-4 value */ 1997 if (c < 0x80) { 1998 if (out + 1 >= outend) 1999 break; 2000 *out++ = c; 2001 } else { 2002 int len; 2003 const htmlEntityDesc * ent; 2004 const char *cp; 2005 char nbuf[16]; 2006 2007 /* 2008 * Try to lookup a predefined HTML entity for it 2009 */ 2010 2011 ent = htmlEntityValueLookup(c); 2012 if (ent == NULL) { 2013 snprintf(nbuf, sizeof(nbuf), "#%u", c); 2014 cp = nbuf; 2015 } 2016 else 2017 cp = ent->name; 2018 len = strlen(cp); 2019 if (out + 2 + len >= outend) 2020 break; 2021 *out++ = '&'; 2022 memcpy(out, cp, len); 2023 out += len; 2024 *out++ = ';'; 2025 } 
2026 processed = in; 2027 } 2028 *outlen = out - outstart; 2029 *inlen = processed - instart; 2030 return(0); 2031 } 2032 2033 /** 2034 * htmlEncodeEntities: 2035 * @out: a pointer to an array of bytes to store the result 2036 * @outlen: the length of @out 2037 * @in: a pointer to an array of UTF-8 chars 2038 * @inlen: the length of @in 2039 * @quoteChar: the quote character to escape (' or ") or zero. 2040 * 2041 * Take a block of UTF-8 chars in and try to convert it to an ASCII 2042 * plus HTML entities block of chars out. 2043 * 2044 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 2045 * The value of @inlen after return is the number of octets consumed 2046 * as the return value is positive, else unpredictable. 2047 * The value of @outlen after return is the number of octets consumed. 2048 */ 2049 int 2050 htmlEncodeEntities(unsigned char* out, int *outlen, 2051 const unsigned char* in, int *inlen, int quoteChar) { 2052 const unsigned char* processed = in; 2053 const unsigned char* outend; 2054 const unsigned char* outstart = out; 2055 const unsigned char* instart = in; 2056 const unsigned char* inend; 2057 unsigned int c, d; 2058 int trailing; 2059 2060 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) 2061 return(-1); 2062 outend = out + (*outlen); 2063 inend = in + (*inlen); 2064 while (in < inend) { 2065 d = *in++; 2066 if (d < 0x80) { c= d; trailing= 0; } 2067 else if (d < 0xC0) { 2068 /* trailing byte in leading position */ 2069 *outlen = out - outstart; 2070 *inlen = processed - instart; 2071 return(-2); 2072 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 2073 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 2074 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 2075 else { 2076 /* no chance for this in Ascii */ 2077 *outlen = out - outstart; 2078 *inlen = processed - instart; 2079 return(-2); 2080 } 2081 2082 if (inend - in < trailing) 2083 break; 2084 2085 while (trailing--) { 2086 if (((d= *in++) & 0xC0) 
!= 0x80) { 2087 *outlen = out - outstart; 2088 *inlen = processed - instart; 2089 return(-2); 2090 } 2091 c <<= 6; 2092 c |= d & 0x3F; 2093 } 2094 2095 /* assertion: c is a single UTF-4 value */ 2096 if ((c < 0x80) && (c != (unsigned int) quoteChar) && 2097 (c != '&') && (c != '<') && (c != '>')) { 2098 if (out >= outend) 2099 break; 2100 *out++ = c; 2101 } else { 2102 const htmlEntityDesc * ent; 2103 const char *cp; 2104 char nbuf[16]; 2105 int len; 2106 2107 /* 2108 * Try to lookup a predefined HTML entity for it 2109 */ 2110 ent = htmlEntityValueLookup(c); 2111 if (ent == NULL) { 2112 snprintf(nbuf, sizeof(nbuf), "#%u", c); 2113 cp = nbuf; 2114 } 2115 else 2116 cp = ent->name; 2117 len = strlen(cp); 2118 if (out + 2 + len > outend) 2119 break; 2120 *out++ = '&'; 2121 memcpy(out, cp, len); 2122 out += len; 2123 *out++ = ';'; 2124 } 2125 processed = in; 2126 } 2127 *outlen = out - outstart; 2128 *inlen = processed - instart; 2129 return(0); 2130 } 2131 2132 /************************************************************************ 2133 * * 2134 * Commodity functions to handle streams * 2135 * * 2136 ************************************************************************/ 2137 2138 /** 2139 * htmlNewInputStream: 2140 * @ctxt: an HTML parser context 2141 * 2142 * Create a new input stream structure 2143 * Returns the new input stream or NULL 2144 */ 2145 static htmlParserInputPtr 2146 htmlNewInputStream(htmlParserCtxtPtr ctxt) { 2147 htmlParserInputPtr input; 2148 2149 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput)); 2150 if (input == NULL) { 2151 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n"); 2152 return(NULL); 2153 } 2154 memset(input, 0, sizeof(htmlParserInput)); 2155 input->filename = NULL; 2156 input->directory = NULL; 2157 input->base = NULL; 2158 input->cur = NULL; 2159 input->buf = NULL; 2160 input->line = 1; 2161 input->col = 1; 2162 input->buf = NULL; 2163 input->free = NULL; 2164 input->version = NULL; 2165 input->consumed = 
0; 2166 input->length = 0; 2167 return(input); 2168 } 2169 2170 2171 /************************************************************************ 2172 * * 2173 * Commodity functions, cleanup needed ? * 2174 * * 2175 ************************************************************************/ 2176 /* 2177 * all tags allowing pc data from the html 4.01 loose dtd 2178 * NOTE: it might be more apropriate to integrate this information 2179 * into the html40ElementTable array but I don't want to risk any 2180 * binary incomptibility 2181 */ 2182 static const char *allowPCData[] = { 2183 "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big", 2184 "blockquote", "body", "button", "caption", "center", "cite", "code", 2185 "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2", 2186 "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend", 2187 "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp", 2188 "small", "span", "strike", "strong", "td", "th", "tt", "u", "var" 2189 }; 2190 2191 /** 2192 * areBlanks: 2193 * @ctxt: an HTML parser context 2194 * @str: a xmlChar * 2195 * @len: the size of @str 2196 * 2197 * Is this a sequence of blank chars that one can ignore ? 2198 * 2199 * Returns 1 if ignorable 0 otherwise. 
2200 */ 2201 2202 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) { 2203 unsigned int i; 2204 int j; 2205 xmlNodePtr lastChild; 2206 xmlDtdPtr dtd; 2207 2208 for (j = 0;j < len;j++) 2209 if (!(IS_BLANK_CH(str[j]))) return(0); 2210 2211 if (CUR == 0) return(1); 2212 if (CUR != '<') return(0); 2213 if (ctxt->name == NULL) 2214 return(1); 2215 if (xmlStrEqual(ctxt->name, BAD_CAST"html")) 2216 return(1); 2217 if (xmlStrEqual(ctxt->name, BAD_CAST"head")) 2218 return(1); 2219 2220 /* Only strip CDATA children of the body tag for strict HTML DTDs */ 2221 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) { 2222 dtd = xmlGetIntSubset(ctxt->myDoc); 2223 if (dtd != NULL && dtd->ExternalID != NULL) { 2224 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") || 2225 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN")) 2226 return(1); 2227 } 2228 } 2229 2230 if (ctxt->node == NULL) return(0); 2231 lastChild = xmlGetLastChild(ctxt->node); 2232 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE)) 2233 lastChild = lastChild->prev; 2234 if (lastChild == NULL) { 2235 if ((ctxt->node->type != XML_ELEMENT_NODE) && 2236 (ctxt->node->content != NULL)) return(0); 2237 /* keep ws in constructs like ...<b> </b>... 
2238 for all tags "b" allowing PCDATA */ 2239 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2240 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) { 2241 return(0); 2242 } 2243 } 2244 } else if (xmlNodeIsText(lastChild)) { 2245 return(0); 2246 } else { 2247 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p> 2248 for all tags "p" allowing PCDATA */ 2249 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2250 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) { 2251 return(0); 2252 } 2253 } 2254 } 2255 return(1); 2256 } 2257 2258 /** 2259 * htmlNewDocNoDtD: 2260 * @URI: URI for the dtd, or NULL 2261 * @ExternalID: the external ID of the DTD, or NULL 2262 * 2263 * Creates a new HTML document without a DTD node if @URI and @ExternalID 2264 * are NULL 2265 * 2266 * Returns a new document, do not initialize the DTD if not provided 2267 */ 2268 htmlDocPtr 2269 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) { 2270 xmlDocPtr cur; 2271 2272 /* 2273 * Allocate a new document and fill the fields. 
2274 */ 2275 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc)); 2276 if (cur == NULL) { 2277 htmlErrMemory(NULL, "HTML document creation failed\n"); 2278 return(NULL); 2279 } 2280 memset(cur, 0, sizeof(xmlDoc)); 2281 2282 cur->type = XML_HTML_DOCUMENT_NODE; 2283 cur->version = NULL; 2284 cur->intSubset = NULL; 2285 cur->doc = cur; 2286 cur->name = NULL; 2287 cur->children = NULL; 2288 cur->extSubset = NULL; 2289 cur->oldNs = NULL; 2290 cur->encoding = NULL; 2291 cur->standalone = 1; 2292 cur->compression = 0; 2293 cur->ids = NULL; 2294 cur->refs = NULL; 2295 cur->_private = NULL; 2296 cur->charset = XML_CHAR_ENCODING_UTF8; 2297 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT; 2298 if ((ExternalID != NULL) || 2299 (URI != NULL)) 2300 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI); 2301 return(cur); 2302 } 2303 2304 /** 2305 * htmlNewDoc: 2306 * @URI: URI for the dtd, or NULL 2307 * @ExternalID: the external ID of the DTD, or NULL 2308 * 2309 * Creates a new HTML document 2310 * 2311 * Returns a new document 2312 */ 2313 htmlDocPtr 2314 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { 2315 if ((URI == NULL) && (ExternalID == NULL)) 2316 return(htmlNewDocNoDtD( 2317 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd", 2318 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN")); 2319 2320 return(htmlNewDocNoDtD(URI, ExternalID)); 2321 } 2322 2323 2324 /************************************************************************ 2325 * * 2326 * The parser itself * 2327 * Relates to http://www.w3.org/TR/html40 * 2328 * * 2329 ************************************************************************/ 2330 2331 /************************************************************************ 2332 * * 2333 * The parser itself * 2334 * * 2335 ************************************************************************/ 2336 2337 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt); 2338 2339 /** 2340 * htmlParseHTMLName: 2341 * @ctxt: an HTML parser context 2342 
* 2343 * parse an HTML tag or attribute name, note that we convert it to lowercase 2344 * since HTML names are not case-sensitive. 2345 * 2346 * Returns the Tag Name parsed or NULL 2347 */ 2348 2349 static const xmlChar * 2350 htmlParseHTMLName(htmlParserCtxtPtr ctxt) { 2351 int i = 0; 2352 xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2353 2354 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') && 2355 (CUR != ':') && (CUR != '.')) return(NULL); 2356 2357 while ((i < HTML_PARSER_BUFFER_SIZE) && 2358 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) || 2359 (CUR == ':') || (CUR == '-') || (CUR == '_') || 2360 (CUR == '.'))) { 2361 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20; 2362 else loc[i] = CUR; 2363 i++; 2364 2365 NEXT; 2366 } 2367 2368 return(xmlDictLookup(ctxt->dict, loc, i)); 2369 } 2370 2371 2372 /** 2373 * htmlParseHTMLName_nonInvasive: 2374 * @ctxt: an HTML parser context 2375 * 2376 * parse an HTML tag or attribute name, note that we convert it to lowercase 2377 * since HTML names are not case-sensitive, this doesn't consume the data 2378 * from the stream, it's a look-ahead 2379 * 2380 * Returns the Tag Name parsed or NULL 2381 */ 2382 2383 static const xmlChar * 2384 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) { 2385 int i = 0; 2386 xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2387 2388 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') && 2389 (NXT(1) != ':')) return(NULL); 2390 2391 while ((i < HTML_PARSER_BUFFER_SIZE) && 2392 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) || 2393 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) { 2394 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20; 2395 else loc[i] = NXT(1+i); 2396 i++; 2397 } 2398 2399 return(xmlDictLookup(ctxt->dict, loc, i)); 2400 } 2401 2402 2403 /** 2404 * htmlParseName: 2405 * @ctxt: an HTML parser context 2406 * 2407 * parse an HTML name, this routine is case sensitive. 
2408 * 2409 * Returns the Name parsed or NULL 2410 */ 2411 2412 static const xmlChar * 2413 htmlParseName(htmlParserCtxtPtr ctxt) { 2414 const xmlChar *in; 2415 const xmlChar *ret; 2416 int count = 0; 2417 2418 GROW; 2419 2420 /* 2421 * Accelerator for simple ASCII names 2422 */ 2423 in = ctxt->input->cur; 2424 if (((*in >= 0x61) && (*in <= 0x7A)) || 2425 ((*in >= 0x41) && (*in <= 0x5A)) || 2426 (*in == '_') || (*in == ':')) { 2427 in++; 2428 while (((*in >= 0x61) && (*in <= 0x7A)) || 2429 ((*in >= 0x41) && (*in <= 0x5A)) || 2430 ((*in >= 0x30) && (*in <= 0x39)) || 2431 (*in == '_') || (*in == '-') || 2432 (*in == ':') || (*in == '.')) 2433 in++; 2434 if ((*in > 0) && (*in < 0x80)) { 2435 count = in - ctxt->input->cur; 2436 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count); 2437 ctxt->input->cur = in; 2438 ctxt->nbChars += count; 2439 ctxt->input->col += count; 2440 return(ret); 2441 } 2442 } 2443 return(htmlParseNameComplex(ctxt)); 2444 } 2445 2446 static const xmlChar * 2447 htmlParseNameComplex(xmlParserCtxtPtr ctxt) { 2448 int len = 0, l; 2449 int c; 2450 int count = 0; 2451 2452 /* 2453 * Handler for more complex cases 2454 */ 2455 GROW; 2456 c = CUR_CHAR(l); 2457 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */ 2458 (!IS_LETTER(c) && (c != '_') && 2459 (c != ':'))) { 2460 return(NULL); 2461 } 2462 2463 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */ 2464 ((IS_LETTER(c)) || (IS_DIGIT(c)) || 2465 (c == '.') || (c == '-') || 2466 (c == '_') || (c == ':') || 2467 (IS_COMBINING(c)) || 2468 (IS_EXTENDER(c)))) { 2469 if (count++ > 100) { 2470 count = 0; 2471 GROW; 2472 } 2473 len += l; 2474 NEXTL(l); 2475 c = CUR_CHAR(l); 2476 } 2477 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len)); 2478 } 2479 2480 2481 /** 2482 * htmlParseHTMLAttribute: 2483 * @ctxt: an HTML parser context 2484 * @stop: a char stop value 2485 * 2486 * parse an HTML attribute value till the stop (quote), if 2487 * stop is 0 then it stops 
at the first space 2488 * 2489 * Returns the attribute parsed or NULL 2490 */ 2491 2492 static xmlChar * 2493 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) { 2494 xmlChar *buffer = NULL; 2495 int buffer_size = 0; 2496 xmlChar *out = NULL; 2497 const xmlChar *name = NULL; 2498 const xmlChar *cur = NULL; 2499 const htmlEntityDesc * ent; 2500 2501 /* 2502 * allocate a translation buffer. 2503 */ 2504 buffer_size = HTML_PARSER_BUFFER_SIZE; 2505 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar)); 2506 if (buffer == NULL) { 2507 htmlErrMemory(ctxt, "buffer allocation failed\n"); 2508 return(NULL); 2509 } 2510 out = buffer; 2511 2512 /* 2513 * Ok loop until we reach one of the ending chars 2514 */ 2515 while ((CUR != 0) && (CUR != stop)) { 2516 if ((stop == 0) && (CUR == '>')) break; 2517 if ((stop == 0) && (IS_BLANK_CH(CUR))) break; 2518 if (CUR == '&') { 2519 if (NXT(1) == '#') { 2520 unsigned int c; 2521 int bits; 2522 2523 c = htmlParseCharRef(ctxt); 2524 if (c < 0x80) 2525 { *out++ = c; bits= -6; } 2526 else if (c < 0x800) 2527 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2528 else if (c < 0x10000) 2529 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2530 else 2531 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2532 2533 for ( ; bits >= 0; bits-= 6) { 2534 *out++ = ((c >> bits) & 0x3F) | 0x80; 2535 } 2536 2537 if (out - buffer > buffer_size - 100) { 2538 int indx = out - buffer; 2539 2540 growBuffer(buffer); 2541 out = &buffer[indx]; 2542 } 2543 } else { 2544 ent = htmlParseEntityRef(ctxt, &name); 2545 if (name == NULL) { 2546 *out++ = '&'; 2547 if (out - buffer > buffer_size - 100) { 2548 int indx = out - buffer; 2549 2550 growBuffer(buffer); 2551 out = &buffer[indx]; 2552 } 2553 } else if (ent == NULL) { 2554 *out++ = '&'; 2555 cur = name; 2556 while (*cur != 0) { 2557 if (out - buffer > buffer_size - 100) { 2558 int indx = out - buffer; 2559 2560 growBuffer(buffer); 2561 out = &buffer[indx]; 2562 } 2563 *out++ = *cur++; 
2564 } 2565 } else { 2566 unsigned int c; 2567 int bits; 2568 2569 if (out - buffer > buffer_size - 100) { 2570 int indx = out - buffer; 2571 2572 growBuffer(buffer); 2573 out = &buffer[indx]; 2574 } 2575 c = ent->value; 2576 if (c < 0x80) 2577 { *out++ = c; bits= -6; } 2578 else if (c < 0x800) 2579 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2580 else if (c < 0x10000) 2581 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2582 else 2583 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2584 2585 for ( ; bits >= 0; bits-= 6) { 2586 *out++ = ((c >> bits) & 0x3F) | 0x80; 2587 } 2588 } 2589 } 2590 } else { 2591 unsigned int c; 2592 int bits, l; 2593 2594 if (out - buffer > buffer_size - 100) { 2595 int indx = out - buffer; 2596 2597 growBuffer(buffer); 2598 out = &buffer[indx]; 2599 } 2600 c = CUR_CHAR(l); 2601 if (c < 0x80) 2602 { *out++ = c; bits= -6; } 2603 else if (c < 0x800) 2604 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2605 else if (c < 0x10000) 2606 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2607 else 2608 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2609 2610 for ( ; bits >= 0; bits-= 6) { 2611 *out++ = ((c >> bits) & 0x3F) | 0x80; 2612 } 2613 NEXT; 2614 } 2615 } 2616 *out = 0; 2617 return(buffer); 2618 } 2619 2620 /** 2621 * htmlParseEntityRef: 2622 * @ctxt: an HTML parser context 2623 * @str: location to store the entity name 2624 * 2625 * parse an HTML ENTITY references 2626 * 2627 * [68] EntityRef ::= '&' Name ';' 2628 * 2629 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise, 2630 * if non-NULL *str will have to be freed by the caller. 
2631 */ 2632 const htmlEntityDesc * 2633 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) { 2634 const xmlChar *name; 2635 const htmlEntityDesc * ent = NULL; 2636 2637 if (str != NULL) *str = NULL; 2638 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL); 2639 2640 if (CUR == '&') { 2641 NEXT; 2642 name = htmlParseName(ctxt); 2643 if (name == NULL) { 2644 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 2645 "htmlParseEntityRef: no name\n", NULL, NULL); 2646 } else { 2647 GROW; 2648 if (CUR == ';') { 2649 if (str != NULL) 2650 *str = name; 2651 2652 /* 2653 * Lookup the entity in the table. 2654 */ 2655 ent = htmlEntityLookup(name); 2656 if (ent != NULL) /* OK that's ugly !!! */ 2657 NEXT; 2658 } else { 2659 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING, 2660 "htmlParseEntityRef: expecting ';'\n", 2661 NULL, NULL); 2662 if (str != NULL) 2663 *str = name; 2664 } 2665 } 2666 } 2667 return(ent); 2668 } 2669 2670 /** 2671 * htmlParseAttValue: 2672 * @ctxt: an HTML parser context 2673 * 2674 * parse a value for an attribute 2675 * Note: the parser won't do substitution of entities here, this 2676 * will be handled later in xmlStringGetNodeList, unless it was 2677 * asked for ctxt->replaceEntities != 0 2678 * 2679 * Returns the AttValue parsed or NULL. 
2680 */ 2681 2682 static xmlChar * 2683 htmlParseAttValue(htmlParserCtxtPtr ctxt) { 2684 xmlChar *ret = NULL; 2685 2686 if (CUR == '"') { 2687 NEXT; 2688 ret = htmlParseHTMLAttribute(ctxt, '"'); 2689 if (CUR != '"') { 2690 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2691 "AttValue: \" expected\n", NULL, NULL); 2692 } else 2693 NEXT; 2694 } else if (CUR == '\'') { 2695 NEXT; 2696 ret = htmlParseHTMLAttribute(ctxt, '\''); 2697 if (CUR != '\'') { 2698 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2699 "AttValue: ' expected\n", NULL, NULL); 2700 } else 2701 NEXT; 2702 } else { 2703 /* 2704 * That's an HTMLism, the attribute value may not be quoted 2705 */ 2706 ret = htmlParseHTMLAttribute(ctxt, 0); 2707 if (ret == NULL) { 2708 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE, 2709 "AttValue: no value found\n", NULL, NULL); 2710 } 2711 } 2712 return(ret); 2713 } 2714 2715 /** 2716 * htmlParseSystemLiteral: 2717 * @ctxt: an HTML parser context 2718 * 2719 * parse an HTML Literal 2720 * 2721 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") 2722 * 2723 * Returns the SystemLiteral parsed or NULL 2724 */ 2725 2726 static xmlChar * 2727 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { 2728 const xmlChar *q; 2729 xmlChar *ret = NULL; 2730 2731 if (CUR == '"') { 2732 NEXT; 2733 q = CUR_PTR; 2734 while ((IS_CHAR_CH(CUR)) && (CUR != '"')) 2735 NEXT; 2736 if (!IS_CHAR_CH(CUR)) { 2737 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2738 "Unfinished SystemLiteral\n", NULL, NULL); 2739 } else { 2740 ret = xmlStrndup(q, CUR_PTR - q); 2741 NEXT; 2742 } 2743 } else if (CUR == '\'') { 2744 NEXT; 2745 q = CUR_PTR; 2746 while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) 2747 NEXT; 2748 if (!IS_CHAR_CH(CUR)) { 2749 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2750 "Unfinished SystemLiteral\n", NULL, NULL); 2751 } else { 2752 ret = xmlStrndup(q, CUR_PTR - q); 2753 NEXT; 2754 } 2755 } else { 2756 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2757 " or ' expected\n", 
NULL, NULL); 2758 } 2759 2760 return(ret); 2761 } 2762 2763 /** 2764 * htmlParsePubidLiteral: 2765 * @ctxt: an HTML parser context 2766 * 2767 * parse an HTML public literal 2768 * 2769 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" 2770 * 2771 * Returns the PubidLiteral parsed or NULL. 2772 */ 2773 2774 static xmlChar * 2775 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { 2776 const xmlChar *q; 2777 xmlChar *ret = NULL; 2778 /* 2779 * Name ::= (Letter | '_') (NameChar)* 2780 */ 2781 if (CUR == '"') { 2782 NEXT; 2783 q = CUR_PTR; 2784 while (IS_PUBIDCHAR_CH(CUR)) NEXT; 2785 if (CUR != '"') { 2786 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2787 "Unfinished PubidLiteral\n", NULL, NULL); 2788 } else { 2789 ret = xmlStrndup(q, CUR_PTR - q); 2790 NEXT; 2791 } 2792 } else if (CUR == '\'') { 2793 NEXT; 2794 q = CUR_PTR; 2795 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')) 2796 NEXT; 2797 if (CUR != '\'') { 2798 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2799 "Unfinished PubidLiteral\n", NULL, NULL); 2800 } else { 2801 ret = xmlStrndup(q, CUR_PTR - q); 2802 NEXT; 2803 } 2804 } else { 2805 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2806 "PubidLiteral \" or ' expected\n", NULL, NULL); 2807 } 2808 2809 return(ret); 2810 } 2811 2812 /** 2813 * htmlParseScript: 2814 * @ctxt: an HTML parser context 2815 * 2816 * parse the content of an HTML SCRIPT or STYLE element 2817 * http://www.w3.org/TR/html4/sgml/dtd.html#Script 2818 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet 2819 * http://www.w3.org/TR/html4/types.html#type-script 2820 * http://www.w3.org/TR/html4/types.html#h-6.15 2821 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1 2822 * 2823 * Script data ( %Script; in the DTD) can be the content of the SCRIPT 2824 * element and the value of intrinsic event attributes. User agents must 2825 * not evaluate script data as HTML markup but instead must pass it on as 2826 * data to a script engine. 
2827 * NOTES: 2828 * - The content is passed like CDATA 2829 * - the attributes for style and scripting "onXXX" are also described 2830 * as CDATA but SGML allows entities references in attributes so their 2831 * processing is identical as other attributes 2832 */ 2833 static void 2834 htmlParseScript(htmlParserCtxtPtr ctxt) { 2835 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 2836 int nbchar = 0; 2837 int cur,l; 2838 2839 SHRINK; 2840 cur = CUR_CHAR(l); 2841 while (IS_CHAR_CH(cur)) { 2842 if ((cur == '<') && (NXT(1) == '/')) { 2843 /* 2844 * One should break here, the specification is clear: 2845 * Authors should therefore escape "</" within the content. 2846 * Escape mechanisms are specific to each scripting or 2847 * style sheet language. 2848 * 2849 * In recovery mode, only break if end tag match the 2850 * current tag, effectively ignoring all tags inside the 2851 * script/style block and treating the entire block as 2852 * CDATA. 2853 */ 2854 if (ctxt->recovery) { 2855 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2, 2856 xmlStrlen(ctxt->name)) == 0) 2857 { 2858 break; /* while */ 2859 } else { 2860 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 2861 "Element %s embeds close tag\n", 2862 ctxt->name, NULL); 2863 } 2864 } else { 2865 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) || 2866 ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) 2867 { 2868 break; /* while */ 2869 } 2870 } 2871 } 2872 COPY_BUF(l,buf,nbchar,cur); 2873 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 2874 if (ctxt->sax->cdataBlock!= NULL) { 2875 /* 2876 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2877 */ 2878 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2879 } else if (ctxt->sax->characters != NULL) { 2880 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2881 } 2882 nbchar = 0; 2883 } 2884 GROW; 2885 NEXTL(l); 2886 cur = CUR_CHAR(l); 2887 } 2888 2889 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) { 2890 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 2891 "Invalid 
char in CDATA 0x%X\n", cur); 2892 if (ctxt->input->cur < ctxt->input->end) { 2893 NEXT; 2894 } 2895 } 2896 2897 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2898 if (ctxt->sax->cdataBlock!= NULL) { 2899 /* 2900 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2901 */ 2902 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2903 } else if (ctxt->sax->characters != NULL) { 2904 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2905 } 2906 } 2907 } 2908 2909 2910 /** 2911 * htmlParseCharData: 2912 * @ctxt: an HTML parser context 2913 * 2914 * parse a CharData section. 2915 * if we are within a CDATA section ']]>' marks an end of section. 2916 * 2917 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) 2918 */ 2919 2920 static void 2921 htmlParseCharData(htmlParserCtxtPtr ctxt) { 2922 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 2923 int nbchar = 0; 2924 int cur, l; 2925 int chunk = 0; 2926 2927 SHRINK; 2928 cur = CUR_CHAR(l); 2929 while (((cur != '<') || (ctxt->token == '<')) && 2930 ((cur != '&') || (ctxt->token == '&')) && 2931 (cur != 0)) { 2932 if (!(IS_CHAR(cur))) { 2933 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 2934 "Invalid char in CDATA 0x%X\n", cur); 2935 } else { 2936 COPY_BUF(l,buf,nbchar,cur); 2937 } 2938 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 2939 /* 2940 * Ok the segment is to be consumed as chars. 
2941 */ 2942 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2943 if (areBlanks(ctxt, buf, nbchar)) { 2944 if (ctxt->sax->ignorableWhitespace != NULL) 2945 ctxt->sax->ignorableWhitespace(ctxt->userData, 2946 buf, nbchar); 2947 } else { 2948 htmlCheckParagraph(ctxt); 2949 if (ctxt->sax->characters != NULL) 2950 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2951 } 2952 } 2953 nbchar = 0; 2954 } 2955 NEXTL(l); 2956 chunk++; 2957 if (chunk > HTML_PARSER_BUFFER_SIZE) { 2958 chunk = 0; 2959 SHRINK; 2960 GROW; 2961 } 2962 cur = CUR_CHAR(l); 2963 if (cur == 0) { 2964 SHRINK; 2965 GROW; 2966 cur = CUR_CHAR(l); 2967 } 2968 } 2969 if (nbchar != 0) { 2970 buf[nbchar] = 0; 2971 2972 /* 2973 * Ok the segment is to be consumed as chars. 2974 */ 2975 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2976 if (areBlanks(ctxt, buf, nbchar)) { 2977 if (ctxt->sax->ignorableWhitespace != NULL) 2978 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar); 2979 } else { 2980 htmlCheckParagraph(ctxt); 2981 if (ctxt->sax->characters != NULL) 2982 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2983 } 2984 } 2985 } else { 2986 /* 2987 * Loop detection 2988 */ 2989 if (cur == 0) 2990 ctxt->instate = XML_PARSER_EOF; 2991 } 2992 } 2993 2994 /** 2995 * htmlParseExternalID: 2996 * @ctxt: an HTML parser context 2997 * @publicID: a xmlChar** receiving PubidLiteral 2998 * 2999 * Parse an External ID or a Public ID 3000 * 3001 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral 3002 * | 'PUBLIC' S PubidLiteral S SystemLiteral 3003 * 3004 * [83] PublicID ::= 'PUBLIC' S PubidLiteral 3005 * 3006 * Returns the function returns SystemLiteral and in the second 3007 * case publicID receives PubidLiteral, is strict is off 3008 * it is possible to return NULL and have publicID set. 
3009 */ 3010 3011 static xmlChar * 3012 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) { 3013 xmlChar *URI = NULL; 3014 3015 if ((UPPER == 'S') && (UPP(1) == 'Y') && 3016 (UPP(2) == 'S') && (UPP(3) == 'T') && 3017 (UPP(4) == 'E') && (UPP(5) == 'M')) { 3018 SKIP(6); 3019 if (!IS_BLANK_CH(CUR)) { 3020 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3021 "Space required after 'SYSTEM'\n", NULL, NULL); 3022 } 3023 SKIP_BLANKS; 3024 URI = htmlParseSystemLiteral(ctxt); 3025 if (URI == NULL) { 3026 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED, 3027 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL); 3028 } 3029 } else if ((UPPER == 'P') && (UPP(1) == 'U') && 3030 (UPP(2) == 'B') && (UPP(3) == 'L') && 3031 (UPP(4) == 'I') && (UPP(5) == 'C')) { 3032 SKIP(6); 3033 if (!IS_BLANK_CH(CUR)) { 3034 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3035 "Space required after 'PUBLIC'\n", NULL, NULL); 3036 } 3037 SKIP_BLANKS; 3038 *publicID = htmlParsePubidLiteral(ctxt); 3039 if (*publicID == NULL) { 3040 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED, 3041 "htmlParseExternalID: PUBLIC, no Public Identifier\n", 3042 NULL, NULL); 3043 } 3044 SKIP_BLANKS; 3045 if ((CUR == '"') || (CUR == '\'')) { 3046 URI = htmlParseSystemLiteral(ctxt); 3047 } 3048 } 3049 return(URI); 3050 } 3051 3052 /** 3053 * xmlParsePI: 3054 * @ctxt: an XML parser context 3055 * 3056 * parse an XML Processing Instruction. 3057 * 3058 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' 3059 */ 3060 static void 3061 htmlParsePI(htmlParserCtxtPtr ctxt) { 3062 xmlChar *buf = NULL; 3063 int len = 0; 3064 int size = HTML_PARSER_BUFFER_SIZE; 3065 int cur, l; 3066 const xmlChar *target; 3067 xmlParserInputState state; 3068 int count = 0; 3069 3070 if ((RAW == '<') && (NXT(1) == '?')) { 3071 state = ctxt->instate; 3072 ctxt->instate = XML_PARSER_PI; 3073 /* 3074 * this is a Processing Instruction. 
3075 */ 3076 SKIP(2); 3077 SHRINK; 3078 3079 /* 3080 * Parse the target name and check for special support like 3081 * namespace. 3082 */ 3083 target = htmlParseName(ctxt); 3084 if (target != NULL) { 3085 if (RAW == '>') { 3086 SKIP(1); 3087 3088 /* 3089 * SAX: PI detected. 3090 */ 3091 if ((ctxt->sax) && (!ctxt->disableSAX) && 3092 (ctxt->sax->processingInstruction != NULL)) 3093 ctxt->sax->processingInstruction(ctxt->userData, 3094 target, NULL); 3095 ctxt->instate = state; 3096 return; 3097 } 3098 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 3099 if (buf == NULL) { 3100 htmlErrMemory(ctxt, NULL); 3101 ctxt->instate = state; 3102 return; 3103 } 3104 cur = CUR; 3105 if (!IS_BLANK(cur)) { 3106 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3107 "ParsePI: PI %s space expected\n", target, NULL); 3108 } 3109 SKIP_BLANKS; 3110 cur = CUR_CHAR(l); 3111 while (IS_CHAR(cur) && (cur != '>')) { 3112 if (len + 5 >= size) { 3113 xmlChar *tmp; 3114 3115 size *= 2; 3116 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 3117 if (tmp == NULL) { 3118 htmlErrMemory(ctxt, NULL); 3119 xmlFree(buf); 3120 ctxt->instate = state; 3121 return; 3122 } 3123 buf = tmp; 3124 } 3125 count++; 3126 if (count > 50) { 3127 GROW; 3128 count = 0; 3129 } 3130 COPY_BUF(l,buf,len,cur); 3131 NEXTL(l); 3132 cur = CUR_CHAR(l); 3133 if (cur == 0) { 3134 SHRINK; 3135 GROW; 3136 cur = CUR_CHAR(l); 3137 } 3138 } 3139 buf[len] = 0; 3140 if (cur != '>') { 3141 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED, 3142 "ParsePI: PI %s never end ...\n", target, NULL); 3143 } else { 3144 SKIP(1); 3145 3146 /* 3147 * SAX: PI detected. 
3148 */ 3149 if ((ctxt->sax) && (!ctxt->disableSAX) && 3150 (ctxt->sax->processingInstruction != NULL)) 3151 ctxt->sax->processingInstruction(ctxt->userData, 3152 target, buf); 3153 } 3154 xmlFree(buf); 3155 } else { 3156 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED, 3157 "PI is not started correctly", NULL, NULL); 3158 } 3159 ctxt->instate = state; 3160 } 3161 } 3162 3163 /** 3164 * htmlParseComment: 3165 * @ctxt: an HTML parser context 3166 * 3167 * Parse an XML (SGML) comment <!-- .... --> 3168 * 3169 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' 3170 */ 3171 static void 3172 htmlParseComment(htmlParserCtxtPtr ctxt) { 3173 xmlChar *buf = NULL; 3174 int len; 3175 int size = HTML_PARSER_BUFFER_SIZE; 3176 int q, ql; 3177 int r, rl; 3178 int cur, l; 3179 xmlParserInputState state; 3180 3181 /* 3182 * Check that there is a comment right here. 3183 */ 3184 if ((RAW != '<') || (NXT(1) != '!') || 3185 (NXT(2) != '-') || (NXT(3) != '-')) return; 3186 3187 state = ctxt->instate; 3188 ctxt->instate = XML_PARSER_COMMENT; 3189 SHRINK; 3190 SKIP(4); 3191 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 3192 if (buf == NULL) { 3193 htmlErrMemory(ctxt, "buffer allocation failed\n"); 3194 ctxt->instate = state; 3195 return; 3196 } 3197 q = CUR_CHAR(ql); 3198 NEXTL(ql); 3199 r = CUR_CHAR(rl); 3200 NEXTL(rl); 3201 cur = CUR_CHAR(l); 3202 len = 0; 3203 while (IS_CHAR(cur) && 3204 ((cur != '>') || 3205 (r != '-') || (q != '-'))) { 3206 if (len + 5 >= size) { 3207 xmlChar *tmp; 3208 3209 size *= 2; 3210 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 3211 if (tmp == NULL) { 3212 xmlFree(buf); 3213 htmlErrMemory(ctxt, "growing buffer failed\n"); 3214 ctxt->instate = state; 3215 return; 3216 } 3217 buf = tmp; 3218 } 3219 COPY_BUF(ql,buf,len,q); 3220 q = r; 3221 ql = rl; 3222 r = cur; 3223 rl = l; 3224 NEXTL(l); 3225 cur = CUR_CHAR(l); 3226 if (cur == 0) { 3227 SHRINK; 3228 GROW; 3229 cur = CUR_CHAR(l); 3230 } 3231 } 3232 buf[len] = 0; 3233 if 
(!IS_CHAR(cur)) { 3234 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, 3235 "Comment not terminated \n<!--%.50s\n", buf, NULL); 3236 xmlFree(buf); 3237 } else { 3238 NEXT; 3239 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) && 3240 (!ctxt->disableSAX)) 3241 ctxt->sax->comment(ctxt->userData, buf); 3242 xmlFree(buf); 3243 } 3244 ctxt->instate = state; 3245 } 3246 3247 /** 3248 * htmlParseCharRef: 3249 * @ctxt: an HTML parser context 3250 * 3251 * parse Reference declarations 3252 * 3253 * [66] CharRef ::= '&#' [0-9]+ ';' | 3254 * '&#x' [0-9a-fA-F]+ ';' 3255 * 3256 * Returns the value parsed (as an int) 3257 */ 3258 int 3259 htmlParseCharRef(htmlParserCtxtPtr ctxt) { 3260 int val = 0; 3261 3262 if ((ctxt == NULL) || (ctxt->input == NULL)) { 3263 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3264 "htmlParseCharRef: context error\n", 3265 NULL, NULL); 3266 return(0); 3267 } 3268 if ((CUR == '&') && (NXT(1) == '#') && 3269 ((NXT(2) == 'x') || NXT(2) == 'X')) { 3270 SKIP(3); 3271 while (CUR != ';') { 3272 if ((CUR >= '0') && (CUR <= '9')) 3273 val = val * 16 + (CUR - '0'); 3274 else if ((CUR >= 'a') && (CUR <= 'f')) 3275 val = val * 16 + (CUR - 'a') + 10; 3276 else if ((CUR >= 'A') && (CUR <= 'F')) 3277 val = val * 16 + (CUR - 'A') + 10; 3278 else { 3279 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF, 3280 "htmlParseCharRef: missing semicolon\n", 3281 NULL, NULL); 3282 break; 3283 } 3284 NEXT; 3285 } 3286 if (CUR == ';') 3287 NEXT; 3288 } else if ((CUR == '&') && (NXT(1) == '#')) { 3289 SKIP(2); 3290 while (CUR != ';') { 3291 if ((CUR >= '0') && (CUR <= '9')) 3292 val = val * 10 + (CUR - '0'); 3293 else { 3294 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF, 3295 "htmlParseCharRef: missing semicolon\n", 3296 NULL, NULL); 3297 break; 3298 } 3299 NEXT; 3300 } 3301 if (CUR == ';') 3302 NEXT; 3303 } else { 3304 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF, 3305 "htmlParseCharRef: invalid value\n", NULL, NULL); 3306 } 3307 /* 3308 * Check the value IS_CHAR ... 
3309 */ 3310 if (IS_CHAR(val)) { 3311 return(val); 3312 } else { 3313 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 3314 "htmlParseCharRef: invalid xmlChar value %d\n", 3315 val); 3316 } 3317 return(0); 3318 } 3319 3320 3321 /** 3322 * htmlParseDocTypeDecl: 3323 * @ctxt: an HTML parser context 3324 * 3325 * parse a DOCTYPE declaration 3326 * 3327 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? 3328 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>' 3329 */ 3330 3331 static void 3332 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { 3333 const xmlChar *name; 3334 xmlChar *ExternalID = NULL; 3335 xmlChar *URI = NULL; 3336 3337 /* 3338 * We know that '<!DOCTYPE' has been detected. 3339 */ 3340 SKIP(9); 3341 3342 SKIP_BLANKS; 3343 3344 /* 3345 * Parse the DOCTYPE name. 3346 */ 3347 name = htmlParseName(ctxt); 3348 if (name == NULL) { 3349 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3350 "htmlParseDocTypeDecl : no DOCTYPE name !\n", 3351 NULL, NULL); 3352 } 3353 /* 3354 * Check that upper(name) == "HTML" !!!!!!!!!!!!! 3355 */ 3356 3357 SKIP_BLANKS; 3358 3359 /* 3360 * Check for SystemID and ExternalID 3361 */ 3362 URI = htmlParseExternalID(ctxt, &ExternalID); 3363 SKIP_BLANKS; 3364 3365 /* 3366 * We should be at the end of the DOCTYPE declaration. 3367 */ 3368 if (CUR != '>') { 3369 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED, 3370 "DOCTYPE improperly terminated\n", NULL, NULL); 3371 /* We shouldn't try to resynchronize ... 
*/ 3372 } 3373 NEXT; 3374 3375 /* 3376 * Create or update the document accordingly to the DOCTYPE 3377 */ 3378 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) && 3379 (!ctxt->disableSAX)) 3380 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI); 3381 3382 /* 3383 * Cleanup, since we don't use all those identifiers 3384 */ 3385 if (URI != NULL) xmlFree(URI); 3386 if (ExternalID != NULL) xmlFree(ExternalID); 3387 } 3388 3389 /** 3390 * htmlParseAttribute: 3391 * @ctxt: an HTML parser context 3392 * @value: a xmlChar ** used to store the value of the attribute 3393 * 3394 * parse an attribute 3395 * 3396 * [41] Attribute ::= Name Eq AttValue 3397 * 3398 * [25] Eq ::= S? '=' S? 3399 * 3400 * With namespace: 3401 * 3402 * [NS 11] Attribute ::= QName Eq AttValue 3403 * 3404 * Also the case QName == xmlns:??? is handled independently as a namespace 3405 * definition. 3406 * 3407 * Returns the attribute name, and the value in *value. 3408 */ 3409 3410 static const xmlChar * 3411 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { 3412 const xmlChar *name; 3413 xmlChar *val = NULL; 3414 3415 *value = NULL; 3416 name = htmlParseHTMLName(ctxt); 3417 if (name == NULL) { 3418 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3419 "error parsing attribute name\n", NULL, NULL); 3420 return(NULL); 3421 } 3422 3423 /* 3424 * read the value 3425 */ 3426 SKIP_BLANKS; 3427 if (CUR == '=') { 3428 NEXT; 3429 SKIP_BLANKS; 3430 val = htmlParseAttValue(ctxt); 3431 } 3432 3433 *value = val; 3434 return(name); 3435 } 3436 3437 /** 3438 * htmlCheckEncoding: 3439 * @ctxt: an HTML parser context 3440 * @attvalue: the attribute value 3441 * 3442 * Checks an http-equiv attribute from a Meta tag to detect 3443 * the encoding 3444 * If a new encoding is detected the parser is switched to decode 3445 * it and pass UTF8 3446 */ 3447 static void 3448 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { 3449 const xmlChar *encoding; 3450 3451 if ((ctxt == 
NULL) || (attvalue == NULL) || 3452 (ctxt->options & HTML_PARSE_IGNORE_ENC)) 3453 return; 3454 3455 /* do not change encoding */ 3456 if (ctxt->input->encoding != NULL) 3457 return; 3458 3459 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset="); 3460 if (encoding != NULL) { 3461 encoding += 8; 3462 } else { 3463 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset ="); 3464 if (encoding != NULL) 3465 encoding += 9; 3466 } 3467 if (encoding != NULL) { 3468 xmlCharEncoding enc; 3469 xmlCharEncodingHandlerPtr handler; 3470 3471 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; 3472 3473 if (ctxt->input->encoding != NULL) 3474 xmlFree((xmlChar *) ctxt->input->encoding); 3475 ctxt->input->encoding = xmlStrdup(encoding); 3476 3477 enc = xmlParseCharEncoding((const char *) encoding); 3478 /* 3479 * registered set of known encodings 3480 */ 3481 if (enc != XML_CHAR_ENCODING_ERROR) { 3482 if (((enc == XML_CHAR_ENCODING_UTF16LE) || 3483 (enc == XML_CHAR_ENCODING_UTF16BE) || 3484 (enc == XML_CHAR_ENCODING_UCS4LE) || 3485 (enc == XML_CHAR_ENCODING_UCS4BE)) && 3486 (ctxt->input->buf != NULL) && 3487 (ctxt->input->buf->encoder == NULL)) { 3488 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 3489 "htmlCheckEncoding: wrong encoding meta\n", 3490 NULL, NULL); 3491 } else { 3492 xmlSwitchEncoding(ctxt, enc); 3493 } 3494 ctxt->charset = XML_CHAR_ENCODING_UTF8; 3495 } else { 3496 /* 3497 * fallback for unknown encodings 3498 */ 3499 handler = xmlFindCharEncodingHandler((const char *) encoding); 3500 if (handler != NULL) { 3501 xmlSwitchToEncoding(ctxt, handler); 3502 ctxt->charset = XML_CHAR_ENCODING_UTF8; 3503 } else { 3504 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 3505 "htmlCheckEncoding: unknown encoding %s\n", 3506 encoding, NULL); 3507 } 3508 } 3509 3510 if ((ctxt->input->buf != NULL) && 3511 (ctxt->input->buf->encoder != NULL) && 3512 (ctxt->input->buf->raw != NULL) && 3513 (ctxt->input->buf->buffer != NULL)) { 3514 int nbchars; 3515 int processed; 3516 3517 /* 
3518 * convert as much as possible to the parser reading buffer. 3519 */ 3520 processed = ctxt->input->cur - ctxt->input->base; 3521 xmlBufferShrink(ctxt->input->buf->buffer, processed); 3522 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder, 3523 ctxt->input->buf->buffer, 3524 ctxt->input->buf->raw); 3525 if (nbchars < 0) { 3526 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 3527 "htmlCheckEncoding: encoder error\n", 3528 NULL, NULL); 3529 } 3530 ctxt->input->base = 3531 ctxt->input->cur = ctxt->input->buf->buffer->content; 3532 ctxt->input->end = 3533 &ctxt->input->base[ctxt->input->buf->buffer->use]; 3534 } 3535 } 3536 } 3537 3538 /** 3539 * htmlCheckMeta: 3540 * @ctxt: an HTML parser context 3541 * @atts: the attributes values 3542 * 3543 * Checks an attributes from a Meta tag 3544 */ 3545 static void 3546 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) { 3547 int i; 3548 const xmlChar *att, *value; 3549 int http = 0; 3550 const xmlChar *content = NULL; 3551 3552 if ((ctxt == NULL) || (atts == NULL)) 3553 return; 3554 3555 i = 0; 3556 att = atts[i++]; 3557 while (att != NULL) { 3558 value = atts[i++]; 3559 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv")) 3560 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 3561 http = 1; 3562 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content"))) 3563 content = value; 3564 att = atts[i++]; 3565 } 3566 if ((http) && (content != NULL)) 3567 htmlCheckEncoding(ctxt, content); 3568 3569 } 3570 3571 /** 3572 * htmlParseStartTag: 3573 * @ctxt: an HTML parser context 3574 * 3575 * parse a start of tag either for rule element or 3576 * EmptyElement. In both case we don't parse the tag closing chars. 3577 * 3578 * [40] STag ::= '<' Name (S Attribute)* S? '>' 3579 * 3580 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' 3581 * 3582 * With namespace: 3583 * 3584 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>' 3585 * 3586 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? 
'/>' 3587 * 3588 * Returns 0 in case of success, -1 in case of error and 1 if discarded 3589 */ 3590 3591 static int 3592 htmlParseStartTag(htmlParserCtxtPtr ctxt) { 3593 const xmlChar *name; 3594 const xmlChar *attname; 3595 xmlChar *attvalue; 3596 const xmlChar **atts; 3597 int nbatts = 0; 3598 int maxatts; 3599 int meta = 0; 3600 int i; 3601 int discardtag = 0; 3602 3603 if (ctxt->instate == XML_PARSER_EOF) 3604 return(-1); 3605 if ((ctxt == NULL) || (ctxt->input == NULL)) { 3606 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3607 "htmlParseStartTag: context error\n", NULL, NULL); 3608 return -1; 3609 } 3610 if (CUR != '<') return -1; 3611 NEXT; 3612 3613 atts = ctxt->atts; 3614 maxatts = ctxt->maxatts; 3615 3616 GROW; 3617 name = htmlParseHTMLName(ctxt); 3618 if (name == NULL) { 3619 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3620 "htmlParseStartTag: invalid element name\n", 3621 NULL, NULL); 3622 /* Dump the bogus tag like browsers do */ 3623 while ((IS_CHAR_CH(CUR)) && (CUR != '>') && 3624 (ctxt->instate != XML_PARSER_EOF)) 3625 NEXT; 3626 return -1; 3627 } 3628 if (xmlStrEqual(name, BAD_CAST"meta")) 3629 meta = 1; 3630 3631 /* 3632 * Check for auto-closure of HTML elements. 3633 */ 3634 htmlAutoClose(ctxt, name); 3635 3636 /* 3637 * Check for implied HTML elements. 
3638 */ 3639 htmlCheckImplied(ctxt, name); 3640 3641 /* 3642 * Avoid html at any level > 0, head at any level != 1 3643 * or any attempt to recurse body 3644 */ 3645 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) { 3646 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3647 "htmlParseStartTag: misplaced <html> tag\n", 3648 name, NULL); 3649 discardtag = 1; 3650 ctxt->depth++; 3651 } 3652 if ((ctxt->nameNr != 1) && 3653 (xmlStrEqual(name, BAD_CAST"head"))) { 3654 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3655 "htmlParseStartTag: misplaced <head> tag\n", 3656 name, NULL); 3657 discardtag = 1; 3658 ctxt->depth++; 3659 } 3660 if (xmlStrEqual(name, BAD_CAST"body")) { 3661 int indx; 3662 for (indx = 0;indx < ctxt->nameNr;indx++) { 3663 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) { 3664 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3665 "htmlParseStartTag: misplaced <body> tag\n", 3666 name, NULL); 3667 discardtag = 1; 3668 ctxt->depth++; 3669 } 3670 } 3671 } 3672 3673 /* 3674 * Now parse the attributes, it ends up with the ending 3675 * 3676 * (S Attribute)* S? 
3677 */ 3678 SKIP_BLANKS; 3679 while ((IS_CHAR_CH(CUR)) && 3680 (CUR != '>') && 3681 ((CUR != '/') || (NXT(1) != '>'))) { 3682 long cons = ctxt->nbChars; 3683 3684 GROW; 3685 attname = htmlParseAttribute(ctxt, &attvalue); 3686 if (attname != NULL) { 3687 3688 /* 3689 * Well formedness requires at most one declaration of an attribute 3690 */ 3691 for (i = 0; i < nbatts;i += 2) { 3692 if (xmlStrEqual(atts[i], attname)) { 3693 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED, 3694 "Attribute %s redefined\n", attname, NULL); 3695 if (attvalue != NULL) 3696 xmlFree(attvalue); 3697 goto failed; 3698 } 3699 } 3700 3701 /* 3702 * Add the pair to atts 3703 */ 3704 if (atts == NULL) { 3705 maxatts = 22; /* allow for 10 attrs by default */ 3706 atts = (const xmlChar **) 3707 xmlMalloc(maxatts * sizeof(xmlChar *)); 3708 if (atts == NULL) { 3709 htmlErrMemory(ctxt, NULL); 3710 if (attvalue != NULL) 3711 xmlFree(attvalue); 3712 goto failed; 3713 } 3714 ctxt->atts = atts; 3715 ctxt->maxatts = maxatts; 3716 } else if (nbatts + 4 > maxatts) { 3717 const xmlChar **n; 3718 3719 maxatts *= 2; 3720 n = (const xmlChar **) xmlRealloc((void *) atts, 3721 maxatts * sizeof(const xmlChar *)); 3722 if (n == NULL) { 3723 htmlErrMemory(ctxt, NULL); 3724 if (attvalue != NULL) 3725 xmlFree(attvalue); 3726 goto failed; 3727 } 3728 atts = n; 3729 ctxt->atts = atts; 3730 ctxt->maxatts = maxatts; 3731 } 3732 atts[nbatts++] = attname; 3733 atts[nbatts++] = attvalue; 3734 atts[nbatts] = NULL; 3735 atts[nbatts + 1] = NULL; 3736 } 3737 else { 3738 if (attvalue != NULL) 3739 xmlFree(attvalue); 3740 /* Dump the bogus attribute string up to the next blank or 3741 * the end of the tag. 
*/ 3742 while ((IS_CHAR_CH(CUR)) && 3743 !(IS_BLANK_CH(CUR)) && (CUR != '>') && 3744 ((CUR != '/') || (NXT(1) != '>'))) 3745 NEXT; 3746 } 3747 3748 failed: 3749 SKIP_BLANKS; 3750 if (cons == ctxt->nbChars) { 3751 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3752 "htmlParseStartTag: problem parsing attributes\n", 3753 NULL, NULL); 3754 break; 3755 } 3756 } 3757 3758 /* 3759 * Handle specific association to the META tag 3760 */ 3761 if (meta && (nbatts != 0)) 3762 htmlCheckMeta(ctxt, atts); 3763 3764 /* 3765 * SAX: Start of Element ! 3766 */ 3767 if (!discardtag) { 3768 htmlnamePush(ctxt, name); 3769 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) { 3770 if (nbatts != 0) 3771 ctxt->sax->startElement(ctxt->userData, name, atts); 3772 else 3773 ctxt->sax->startElement(ctxt->userData, name, NULL); 3774 } 3775 } 3776 3777 if (atts != NULL) { 3778 for (i = 1;i < nbatts;i += 2) { 3779 if (atts[i] != NULL) 3780 xmlFree((xmlChar *) atts[i]); 3781 } 3782 } 3783 3784 return(discardtag); 3785 } 3786 3787 /** 3788 * htmlParseEndTag: 3789 * @ctxt: an HTML parser context 3790 * 3791 * parse an end of tag 3792 * 3793 * [42] ETag ::= '</' Name S? '>' 3794 * 3795 * With namespace 3796 * 3797 * [NS 9] ETag ::= '</' QName S? '>' 3798 * 3799 * Returns 1 if the current level should be closed. 3800 */ 3801 3802 static int 3803 htmlParseEndTag(htmlParserCtxtPtr ctxt) 3804 { 3805 const xmlChar *name; 3806 const xmlChar *oldname; 3807 int i, ret; 3808 3809 if ((CUR != '<') || (NXT(1) != '/')) { 3810 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED, 3811 "htmlParseEndTag: '</' not found\n", NULL, NULL); 3812 return (0); 3813 } 3814 SKIP(2); 3815 3816 name = htmlParseHTMLName(ctxt); 3817 if (name == NULL) 3818 return (0); 3819 /* 3820 * We should definitely be at the ending "S? 
'>'" part 3821 */ 3822 SKIP_BLANKS; 3823 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) { 3824 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 3825 "End tag : expected '>'\n", NULL, NULL); 3826 if (ctxt->recovery) { 3827 /* 3828 * We're not at the ending > !! 3829 * Error, unless in recover mode where we search forwards 3830 * until we find a > 3831 */ 3832 while (CUR != '\0' && CUR != '>') NEXT; 3833 NEXT; 3834 } 3835 } else 3836 NEXT; 3837 3838 /* 3839 * if we ignored misplaced tags in htmlParseStartTag don't pop them 3840 * out now. 3841 */ 3842 if ((ctxt->depth > 0) && 3843 (xmlStrEqual(name, BAD_CAST "html") || 3844 xmlStrEqual(name, BAD_CAST "body") || 3845 xmlStrEqual(name, BAD_CAST "head"))) { 3846 ctxt->depth--; 3847 return (0); 3848 } 3849 3850 /* 3851 * If the name read is not one of the element in the parsing stack 3852 * then return, it's just an error. 3853 */ 3854 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 3855 if (xmlStrEqual(name, ctxt->nameTab[i])) 3856 break; 3857 } 3858 if (i < 0) { 3859 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 3860 "Unexpected end tag : %s\n", name, NULL); 3861 return (0); 3862 } 3863 3864 3865 /* 3866 * Check for auto-closure of HTML elements. 3867 */ 3868 3869 htmlAutoCloseOnClose(ctxt, name); 3870 3871 /* 3872 * Well formedness constraints, opening and closing must match. 3873 * With the exception that the autoclose may have popped stuff out 3874 * of the stack. 
3875 */ 3876 if (!xmlStrEqual(name, ctxt->name)) { 3877 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) { 3878 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 3879 "Opening and ending tag mismatch: %s and %s\n", 3880 name, ctxt->name); 3881 } 3882 } 3883 3884 /* 3885 * SAX: End of Tag 3886 */ 3887 oldname = ctxt->name; 3888 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) { 3889 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 3890 ctxt->sax->endElement(ctxt->userData, name); 3891 htmlnamePop(ctxt); 3892 ret = 1; 3893 } else { 3894 ret = 0; 3895 } 3896 3897 return (ret); 3898 } 3899 3900 3901 /** 3902 * htmlParseReference: 3903 * @ctxt: an HTML parser context 3904 * 3905 * parse and handle entity references in content, 3906 * this will end-up in a call to character() since this is either a 3907 * CharRef, or a predefined entity. 3908 */ 3909 static void 3910 htmlParseReference(htmlParserCtxtPtr ctxt) { 3911 const htmlEntityDesc * ent; 3912 xmlChar out[6]; 3913 const xmlChar *name; 3914 if (CUR != '&') return; 3915 3916 if (NXT(1) == '#') { 3917 unsigned int c; 3918 int bits, i = 0; 3919 3920 c = htmlParseCharRef(ctxt); 3921 if (c == 0) 3922 return; 3923 3924 if (c < 0x80) { out[i++]= c; bits= -6; } 3925 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } 3926 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } 3927 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } 3928 3929 for ( ; bits >= 0; bits-= 6) { 3930 out[i++]= ((c >> bits) & 0x3F) | 0x80; 3931 } 3932 out[i] = 0; 3933 3934 htmlCheckParagraph(ctxt); 3935 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 3936 ctxt->sax->characters(ctxt->userData, out, i); 3937 } else { 3938 ent = htmlParseEntityRef(ctxt, &name); 3939 if (name == NULL) { 3940 htmlCheckParagraph(ctxt); 3941 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 3942 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); 3943 return; 3944 } 3945 if ((ent == 
NULL) || !(ent->value > 0)) { 3946 htmlCheckParagraph(ctxt); 3947 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) { 3948 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); 3949 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name)); 3950 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */ 3951 } 3952 } else { 3953 unsigned int c; 3954 int bits, i = 0; 3955 3956 c = ent->value; 3957 if (c < 0x80) 3958 { out[i++]= c; bits= -6; } 3959 else if (c < 0x800) 3960 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } 3961 else if (c < 0x10000) 3962 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } 3963 else 3964 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } 3965 3966 for ( ; bits >= 0; bits-= 6) { 3967 out[i++]= ((c >> bits) & 0x3F) | 0x80; 3968 } 3969 out[i] = 0; 3970 3971 htmlCheckParagraph(ctxt); 3972 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 3973 ctxt->sax->characters(ctxt->userData, out, i); 3974 } 3975 } 3976 } 3977 3978 /** 3979 * htmlParseContent: 3980 * @ctxt: an HTML parser context 3981 * 3982 * Parse a content: comment, sub-element, reference or text. 3983 * Kept for compatibility with old code 3984 */ 3985 3986 static void 3987 htmlParseContent(htmlParserCtxtPtr ctxt) { 3988 xmlChar *currentNode; 3989 int depth; 3990 const xmlChar *name; 3991 3992 currentNode = xmlStrdup(ctxt->name); 3993 depth = ctxt->nameNr; 3994 while (1) { 3995 long cons = ctxt->nbChars; 3996 3997 GROW; 3998 3999 if (ctxt->instate == XML_PARSER_EOF) 4000 break; 4001 4002 /* 4003 * Our tag or one of it's parent or children is ending. 
4004 */ 4005 if ((CUR == '<') && (NXT(1) == '/')) { 4006 if (htmlParseEndTag(ctxt) && 4007 ((currentNode != NULL) || (ctxt->nameNr == 0))) { 4008 if (currentNode != NULL) 4009 xmlFree(currentNode); 4010 return; 4011 } 4012 continue; /* while */ 4013 } 4014 4015 else if ((CUR == '<') && 4016 ((IS_ASCII_LETTER(NXT(1))) || 4017 (NXT(1) == '_') || (NXT(1) == ':'))) { 4018 name = htmlParseHTMLName_nonInvasive(ctxt); 4019 if (name == NULL) { 4020 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 4021 "htmlParseStartTag: invalid element name\n", 4022 NULL, NULL); 4023 /* Dump the bogus tag like browsers do */ 4024 while ((IS_CHAR_CH(CUR)) && (CUR != '>')) 4025 NEXT; 4026 4027 if (currentNode != NULL) 4028 xmlFree(currentNode); 4029 return; 4030 } 4031 4032 if (ctxt->name != NULL) { 4033 if (htmlCheckAutoClose(name, ctxt->name) == 1) { 4034 htmlAutoClose(ctxt, name); 4035 continue; 4036 } 4037 } 4038 } 4039 4040 /* 4041 * Has this node been popped out during parsing of 4042 * the next element 4043 */ 4044 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) && 4045 (!xmlStrEqual(currentNode, ctxt->name))) 4046 { 4047 if (currentNode != NULL) xmlFree(currentNode); 4048 return; 4049 } 4050 4051 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) || 4052 (xmlStrEqual(currentNode, BAD_CAST"style")))) { 4053 /* 4054 * Handle SCRIPT/STYLE separately 4055 */ 4056 htmlParseScript(ctxt); 4057 } else { 4058 /* 4059 * Sometimes DOCTYPE arrives in the middle of the document 4060 */ 4061 if ((CUR == '<') && (NXT(1) == '!') && 4062 (UPP(2) == 'D') && (UPP(3) == 'O') && 4063 (UPP(4) == 'C') && (UPP(5) == 'T') && 4064 (UPP(6) == 'Y') && (UPP(7) == 'P') && 4065 (UPP(8) == 'E')) { 4066 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 4067 "Misplaced DOCTYPE declaration\n", 4068 BAD_CAST "DOCTYPE" , NULL); 4069 htmlParseDocTypeDecl(ctxt); 4070 } 4071 4072 /* 4073 * First case : a comment 4074 */ 4075 if ((CUR == '<') && (NXT(1) == '!') && 4076 (NXT(2) == '-') && (NXT(3) == '-')) { 4077 
htmlParseComment(ctxt); 4078 } 4079 4080 /* 4081 * Second case : a Processing Instruction. 4082 */ 4083 else if ((CUR == '<') && (NXT(1) == '?')) { 4084 htmlParsePI(ctxt); 4085 } 4086 4087 /* 4088 * Third case : a sub-element. 4089 */ 4090 else if (CUR == '<') { 4091 htmlParseElement(ctxt); 4092 } 4093 4094 /* 4095 * Fourth case : a reference. If if has not been resolved, 4096 * parsing returns it's Name, create the node 4097 */ 4098 else if (CUR == '&') { 4099 htmlParseReference(ctxt); 4100 } 4101 4102 /* 4103 * Fifth case : end of the resource 4104 */ 4105 else if (CUR == 0) { 4106 htmlAutoCloseOnEnd(ctxt); 4107 break; 4108 } 4109 4110 /* 4111 * Last case, text. Note that References are handled directly. 4112 */ 4113 else { 4114 htmlParseCharData(ctxt); 4115 } 4116 4117 if (cons == ctxt->nbChars) { 4118 if (ctxt->node != NULL) { 4119 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4120 "detected an error in element content\n", 4121 NULL, NULL); 4122 } 4123 break; 4124 } 4125 } 4126 GROW; 4127 } 4128 if (currentNode != NULL) xmlFree(currentNode); 4129 } 4130 4131 /** 4132 * htmlParseElement: 4133 * @ctxt: an HTML parser context 4134 * 4135 * parse an HTML element, this is highly recursive 4136 * this is kept for compatibility with previous code versions 4137 * 4138 * [39] element ::= EmptyElemTag | STag content ETag 4139 * 4140 * [41] Attribute ::= Name Eq AttValue 4141 */ 4142 4143 void 4144 htmlParseElement(htmlParserCtxtPtr ctxt) { 4145 const xmlChar *name; 4146 xmlChar *currentNode = NULL; 4147 const htmlElemDesc * info; 4148 htmlParserNodeInfo node_info; 4149 int failed; 4150 int depth; 4151 const xmlChar *oldptr; 4152 4153 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4154 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4155 "htmlParseElement: context error\n", NULL, NULL); 4156 return; 4157 } 4158 4159 if (ctxt->instate == XML_PARSER_EOF) 4160 return; 4161 4162 /* Capture start position */ 4163 if (ctxt->record_info) { 4164 node_info.begin_pos = 
ctxt->input->consumed + 4165 (CUR_PTR - ctxt->input->base); 4166 node_info.begin_line = ctxt->input->line; 4167 } 4168 4169 failed = htmlParseStartTag(ctxt); 4170 name = ctxt->name; 4171 if ((failed == -1) || (name == NULL)) { 4172 if (CUR == '>') 4173 NEXT; 4174 return; 4175 } 4176 4177 /* 4178 * Lookup the info for that element. 4179 */ 4180 info = htmlTagLookup(name); 4181 if (info == NULL) { 4182 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 4183 "Tag %s invalid\n", name, NULL); 4184 } 4185 4186 /* 4187 * Check for an Empty Element labeled the XML/SGML way 4188 */ 4189 if ((CUR == '/') && (NXT(1) == '>')) { 4190 SKIP(2); 4191 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4192 ctxt->sax->endElement(ctxt->userData, name); 4193 htmlnamePop(ctxt); 4194 return; 4195 } 4196 4197 if (CUR == '>') { 4198 NEXT; 4199 } else { 4200 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 4201 "Couldn't find end of Start Tag %s\n", name, NULL); 4202 4203 /* 4204 * end of parsing of this node. 4205 */ 4206 if (xmlStrEqual(name, ctxt->name)) { 4207 nodePop(ctxt); 4208 htmlnamePop(ctxt); 4209 } 4210 4211 /* 4212 * Capture end position and add node 4213 */ 4214 if (ctxt->record_info) { 4215 node_info.end_pos = ctxt->input->consumed + 4216 (CUR_PTR - ctxt->input->base); 4217 node_info.end_line = ctxt->input->line; 4218 node_info.node = ctxt->node; 4219 xmlParserAddNodeInfo(ctxt, &node_info); 4220 } 4221 return; 4222 } 4223 4224 /* 4225 * Check for an Empty Element from DTD definition 4226 */ 4227 if ((info != NULL) && (info->empty)) { 4228 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4229 ctxt->sax->endElement(ctxt->userData, name); 4230 htmlnamePop(ctxt); 4231 return; 4232 } 4233 4234 /* 4235 * Parse the content of the element: 4236 */ 4237 currentNode = xmlStrdup(ctxt->name); 4238 depth = ctxt->nameNr; 4239 while (IS_CHAR_CH(CUR)) { 4240 oldptr = ctxt->input->cur; 4241 htmlParseContent(ctxt); 4242 if (oldptr==ctxt->input->cur) break; 4243 if (ctxt->nameNr < depth) 
break; 4244 } 4245 4246 /* 4247 * Capture end position and add node 4248 */ 4249 if ( currentNode != NULL && ctxt->record_info ) { 4250 node_info.end_pos = ctxt->input->consumed + 4251 (CUR_PTR - ctxt->input->base); 4252 node_info.end_line = ctxt->input->line; 4253 node_info.node = ctxt->node; 4254 xmlParserAddNodeInfo(ctxt, &node_info); 4255 } 4256 if (!IS_CHAR_CH(CUR)) { 4257 htmlAutoCloseOnEnd(ctxt); 4258 } 4259 4260 if (currentNode != NULL) 4261 xmlFree(currentNode); 4262 } 4263 4264 static void 4265 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) { 4266 /* 4267 * Capture end position and add node 4268 */ 4269 if ( ctxt->node != NULL && ctxt->record_info ) { 4270 ctxt->nodeInfo->end_pos = ctxt->input->consumed + 4271 (CUR_PTR - ctxt->input->base); 4272 ctxt->nodeInfo->end_line = ctxt->input->line; 4273 ctxt->nodeInfo->node = ctxt->node; 4274 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo); 4275 htmlNodeInfoPop(ctxt); 4276 } 4277 if (!IS_CHAR_CH(CUR)) { 4278 htmlAutoCloseOnEnd(ctxt); 4279 } 4280 } 4281 4282 /** 4283 * htmlParseElementInternal: 4284 * @ctxt: an HTML parser context 4285 * 4286 * parse an HTML element, new version, non recursive 4287 * 4288 * [39] element ::= EmptyElemTag | STag content ETag 4289 * 4290 * [41] Attribute ::= Name Eq AttValue 4291 */ 4292 4293 static void 4294 htmlParseElementInternal(htmlParserCtxtPtr ctxt) { 4295 const xmlChar *name; 4296 const htmlElemDesc * info; 4297 htmlParserNodeInfo node_info; 4298 int failed; 4299 4300 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4301 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4302 "htmlParseElementInternal: context error\n", NULL, NULL); 4303 return; 4304 } 4305 4306 if (ctxt->instate == XML_PARSER_EOF) 4307 return; 4308 4309 /* Capture start position */ 4310 if (ctxt->record_info) { 4311 node_info.begin_pos = ctxt->input->consumed + 4312 (CUR_PTR - ctxt->input->base); 4313 node_info.begin_line = ctxt->input->line; 4314 } 4315 4316 failed = htmlParseStartTag(ctxt); 4317 name = 
ctxt->name; 4318 if ((failed == -1) || (name == NULL)) { 4319 if (CUR == '>') 4320 NEXT; 4321 return; 4322 } 4323 4324 /* 4325 * Lookup the info for that element. 4326 */ 4327 info = htmlTagLookup(name); 4328 if (info == NULL) { 4329 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 4330 "Tag %s invalid\n", name, NULL); 4331 } 4332 4333 /* 4334 * Check for an Empty Element labeled the XML/SGML way 4335 */ 4336 if ((CUR == '/') && (NXT(1) == '>')) { 4337 SKIP(2); 4338 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4339 ctxt->sax->endElement(ctxt->userData, name); 4340 htmlnamePop(ctxt); 4341 return; 4342 } 4343 4344 if (CUR == '>') { 4345 NEXT; 4346 } else { 4347 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 4348 "Couldn't find end of Start Tag %s\n", name, NULL); 4349 4350 /* 4351 * end of parsing of this node. 4352 */ 4353 if (xmlStrEqual(name, ctxt->name)) { 4354 nodePop(ctxt); 4355 htmlnamePop(ctxt); 4356 } 4357 4358 if (ctxt->record_info) 4359 htmlNodeInfoPush(ctxt, &node_info); 4360 htmlParserFinishElementParsing(ctxt); 4361 return; 4362 } 4363 4364 /* 4365 * Check for an Empty Element from DTD definition 4366 */ 4367 if ((info != NULL) && (info->empty)) { 4368 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4369 ctxt->sax->endElement(ctxt->userData, name); 4370 htmlnamePop(ctxt); 4371 return; 4372 } 4373 4374 if (ctxt->record_info) 4375 htmlNodeInfoPush(ctxt, &node_info); 4376 } 4377 4378 /** 4379 * htmlParseContentInternal: 4380 * @ctxt: an HTML parser context 4381 * 4382 * Parse a content: comment, sub-element, reference or text. 
4383 * New version for non recursive htmlParseElementInternal 4384 */ 4385 4386 static void 4387 htmlParseContentInternal(htmlParserCtxtPtr ctxt) { 4388 xmlChar *currentNode; 4389 int depth; 4390 const xmlChar *name; 4391 4392 currentNode = xmlStrdup(ctxt->name); 4393 depth = ctxt->nameNr; 4394 while (1) { 4395 long cons = ctxt->nbChars; 4396 4397 GROW; 4398 4399 if (ctxt->instate == XML_PARSER_EOF) 4400 break; 4401 4402 /* 4403 * Our tag or one of it's parent or children is ending. 4404 */ 4405 if ((CUR == '<') && (NXT(1) == '/')) { 4406 if (htmlParseEndTag(ctxt) && 4407 ((currentNode != NULL) || (ctxt->nameNr == 0))) { 4408 if (currentNode != NULL) 4409 xmlFree(currentNode); 4410 4411 currentNode = xmlStrdup(ctxt->name); 4412 depth = ctxt->nameNr; 4413 } 4414 continue; /* while */ 4415 } 4416 4417 else if ((CUR == '<') && 4418 ((IS_ASCII_LETTER(NXT(1))) || 4419 (NXT(1) == '_') || (NXT(1) == ':'))) { 4420 name = htmlParseHTMLName_nonInvasive(ctxt); 4421 if (name == NULL) { 4422 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 4423 "htmlParseStartTag: invalid element name\n", 4424 NULL, NULL); 4425 /* Dump the bogus tag like browsers do */ 4426 while ((IS_CHAR_CH(CUR)) && (CUR != '>')) 4427 NEXT; 4428 4429 htmlParserFinishElementParsing(ctxt); 4430 if (currentNode != NULL) 4431 xmlFree(currentNode); 4432 4433 currentNode = xmlStrdup(ctxt->name); 4434 depth = ctxt->nameNr; 4435 continue; 4436 } 4437 4438 if (ctxt->name != NULL) { 4439 if (htmlCheckAutoClose(name, ctxt->name) == 1) { 4440 htmlAutoClose(ctxt, name); 4441 continue; 4442 } 4443 } 4444 } 4445 4446 /* 4447 * Has this node been popped out during parsing of 4448 * the next element 4449 */ 4450 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) && 4451 (!xmlStrEqual(currentNode, ctxt->name))) 4452 { 4453 htmlParserFinishElementParsing(ctxt); 4454 if (currentNode != NULL) xmlFree(currentNode); 4455 4456 currentNode = xmlStrdup(ctxt->name); 4457 depth = ctxt->nameNr; 4458 continue; 4459 } 4460 4461 if ((CUR != 0) && 
((xmlStrEqual(currentNode, BAD_CAST"script")) || 4462 (xmlStrEqual(currentNode, BAD_CAST"style")))) { 4463 /* 4464 * Handle SCRIPT/STYLE separately 4465 */ 4466 htmlParseScript(ctxt); 4467 } else { 4468 /* 4469 * Sometimes DOCTYPE arrives in the middle of the document 4470 */ 4471 if ((CUR == '<') && (NXT(1) == '!') && 4472 (UPP(2) == 'D') && (UPP(3) == 'O') && 4473 (UPP(4) == 'C') && (UPP(5) == 'T') && 4474 (UPP(6) == 'Y') && (UPP(7) == 'P') && 4475 (UPP(8) == 'E')) { 4476 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 4477 "Misplaced DOCTYPE declaration\n", 4478 BAD_CAST "DOCTYPE" , NULL); 4479 htmlParseDocTypeDecl(ctxt); 4480 } 4481 4482 /* 4483 * First case : a comment 4484 */ 4485 if ((CUR == '<') && (NXT(1) == '!') && 4486 (NXT(2) == '-') && (NXT(3) == '-')) { 4487 htmlParseComment(ctxt); 4488 } 4489 4490 /* 4491 * Second case : a Processing Instruction. 4492 */ 4493 else if ((CUR == '<') && (NXT(1) == '?')) { 4494 htmlParsePI(ctxt); 4495 } 4496 4497 /* 4498 * Third case : a sub-element. 4499 */ 4500 else if (CUR == '<') { 4501 htmlParseElementInternal(ctxt); 4502 if (currentNode != NULL) xmlFree(currentNode); 4503 4504 currentNode = xmlStrdup(ctxt->name); 4505 depth = ctxt->nameNr; 4506 } 4507 4508 /* 4509 * Fourth case : a reference. If if has not been resolved, 4510 * parsing returns it's Name, create the node 4511 */ 4512 else if (CUR == '&') { 4513 htmlParseReference(ctxt); 4514 } 4515 4516 /* 4517 * Fifth case : end of the resource 4518 */ 4519 else if (CUR == 0) { 4520 htmlAutoCloseOnEnd(ctxt); 4521 break; 4522 } 4523 4524 /* 4525 * Last case, text. Note that References are handled directly. 
4526 */ 4527 else { 4528 htmlParseCharData(ctxt); 4529 } 4530 4531 if (cons == ctxt->nbChars) { 4532 if (ctxt->node != NULL) { 4533 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4534 "detected an error in element content\n", 4535 NULL, NULL); 4536 } 4537 break; 4538 } 4539 } 4540 GROW; 4541 } 4542 if (currentNode != NULL) xmlFree(currentNode); 4543 } 4544 4545 /** 4546 * htmlParseContent: 4547 * @ctxt: an HTML parser context 4548 * 4549 * Parse a content: comment, sub-element, reference or text. 4550 * This is the entry point when called from parser.c 4551 */ 4552 4553 void 4554 __htmlParseContent(void *ctxt) { 4555 if (ctxt != NULL) 4556 htmlParseContentInternal((htmlParserCtxtPtr) ctxt); 4557 } 4558 4559 /** 4560 * htmlParseDocument: 4561 * @ctxt: an HTML parser context 4562 * 4563 * parse an HTML document (and build a tree if using the standard SAX 4564 * interface). 4565 * 4566 * Returns 0, -1 in case of error. the parser context is augmented 4567 * as a result of the parsing. 4568 */ 4569 4570 int 4571 htmlParseDocument(htmlParserCtxtPtr ctxt) { 4572 xmlChar start[4]; 4573 xmlCharEncoding enc; 4574 xmlDtdPtr dtd; 4575 4576 xmlInitParser(); 4577 4578 htmlDefaultSAXHandlerInit(); 4579 4580 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4581 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4582 "htmlParseDocument: context error\n", NULL, NULL); 4583 return(XML_ERR_INTERNAL_ERROR); 4584 } 4585 ctxt->html = 1; 4586 ctxt->linenumbers = 1; 4587 GROW; 4588 /* 4589 * SAX: beginning of the document processing. 4590 */ 4591 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) 4592 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator); 4593 4594 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) && 4595 ((ctxt->input->end - ctxt->input->cur) >= 4)) { 4596 /* 4597 * Get the 4 first bytes and decode the charset 4598 * if enc != XML_CHAR_ENCODING_NONE 4599 * plug some encoding conversion routines. 
4600 */ 4601 start[0] = RAW; 4602 start[1] = NXT(1); 4603 start[2] = NXT(2); 4604 start[3] = NXT(3); 4605 enc = xmlDetectCharEncoding(&start[0], 4); 4606 if (enc != XML_CHAR_ENCODING_NONE) { 4607 xmlSwitchEncoding(ctxt, enc); 4608 } 4609 } 4610 4611 /* 4612 * Wipe out everything which is before the first '<' 4613 */ 4614 SKIP_BLANKS; 4615 if (CUR == 0) { 4616 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY, 4617 "Document is empty\n", NULL, NULL); 4618 } 4619 4620 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX)) 4621 ctxt->sax->startDocument(ctxt->userData); 4622 4623 4624 /* 4625 * Parse possible comments and PIs before any content 4626 */ 4627 while (((CUR == '<') && (NXT(1) == '!') && 4628 (NXT(2) == '-') && (NXT(3) == '-')) || 4629 ((CUR == '<') && (NXT(1) == '?'))) { 4630 htmlParseComment(ctxt); 4631 htmlParsePI(ctxt); 4632 SKIP_BLANKS; 4633 } 4634 4635 4636 /* 4637 * Then possibly doc type declaration(s) and more Misc 4638 * (doctypedecl Misc*)? 4639 */ 4640 if ((CUR == '<') && (NXT(1) == '!') && 4641 (UPP(2) == 'D') && (UPP(3) == 'O') && 4642 (UPP(4) == 'C') && (UPP(5) == 'T') && 4643 (UPP(6) == 'Y') && (UPP(7) == 'P') && 4644 (UPP(8) == 'E')) { 4645 htmlParseDocTypeDecl(ctxt); 4646 } 4647 SKIP_BLANKS; 4648 4649 /* 4650 * Parse possible comments and PIs before any content 4651 */ 4652 while (((CUR == '<') && (NXT(1) == '!') && 4653 (NXT(2) == '-') && (NXT(3) == '-')) || 4654 ((CUR == '<') && (NXT(1) == '?'))) { 4655 htmlParseComment(ctxt); 4656 htmlParsePI(ctxt); 4657 SKIP_BLANKS; 4658 } 4659 4660 /* 4661 * Time to start parsing the tree itself 4662 */ 4663 htmlParseContentInternal(ctxt); 4664 4665 /* 4666 * autoclose 4667 */ 4668 if (CUR == 0) 4669 htmlAutoCloseOnEnd(ctxt); 4670 4671 4672 /* 4673 * SAX: end of the document processing. 
4674 */ 4675 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 4676 ctxt->sax->endDocument(ctxt->userData); 4677 4678 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) { 4679 dtd = xmlGetIntSubset(ctxt->myDoc); 4680 if (dtd == NULL) 4681 ctxt->myDoc->intSubset = 4682 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 4683 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", 4684 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); 4685 } 4686 if (! ctxt->wellFormed) return(-1); 4687 return(0); 4688 } 4689 4690 4691 /************************************************************************ 4692 * * 4693 * Parser contexts handling * 4694 * * 4695 ************************************************************************/ 4696 4697 /** 4698 * htmlInitParserCtxt: 4699 * @ctxt: an HTML parser context 4700 * 4701 * Initialize a parser context 4702 * 4703 * Returns 0 in case of success and -1 in case of error 4704 */ 4705 4706 static int 4707 htmlInitParserCtxt(htmlParserCtxtPtr ctxt) 4708 { 4709 htmlSAXHandler *sax; 4710 4711 if (ctxt == NULL) return(-1); 4712 memset(ctxt, 0, sizeof(htmlParserCtxt)); 4713 4714 ctxt->dict = xmlDictCreate(); 4715 if (ctxt->dict == NULL) { 4716 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4717 return(-1); 4718 } 4719 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler)); 4720 if (sax == NULL) { 4721 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4722 return(-1); 4723 } 4724 else 4725 memset(sax, 0, sizeof(htmlSAXHandler)); 4726 4727 /* Allocate the Input stack */ 4728 ctxt->inputTab = (htmlParserInputPtr *) 4729 xmlMalloc(5 * sizeof(htmlParserInputPtr)); 4730 if (ctxt->inputTab == NULL) { 4731 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4732 ctxt->inputNr = 0; 4733 ctxt->inputMax = 0; 4734 ctxt->input = NULL; 4735 return(-1); 4736 } 4737 ctxt->inputNr = 0; 4738 ctxt->inputMax = 5; 4739 ctxt->input = NULL; 4740 ctxt->version = NULL; 4741 ctxt->encoding = NULL; 4742 
ctxt->standalone = -1; 4743 ctxt->instate = XML_PARSER_START; 4744 4745 /* Allocate the Node stack */ 4746 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr)); 4747 if (ctxt->nodeTab == NULL) { 4748 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4749 ctxt->nodeNr = 0; 4750 ctxt->nodeMax = 0; 4751 ctxt->node = NULL; 4752 ctxt->inputNr = 0; 4753 ctxt->inputMax = 0; 4754 ctxt->input = NULL; 4755 return(-1); 4756 } 4757 ctxt->nodeNr = 0; 4758 ctxt->nodeMax = 10; 4759 ctxt->node = NULL; 4760 4761 /* Allocate the Name stack */ 4762 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *)); 4763 if (ctxt->nameTab == NULL) { 4764 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4765 ctxt->nameNr = 0; 4766 ctxt->nameMax = 0; 4767 ctxt->name = NULL; 4768 ctxt->nodeNr = 0; 4769 ctxt->nodeMax = 0; 4770 ctxt->node = NULL; 4771 ctxt->inputNr = 0; 4772 ctxt->inputMax = 0; 4773 ctxt->input = NULL; 4774 return(-1); 4775 } 4776 ctxt->nameNr = 0; 4777 ctxt->nameMax = 10; 4778 ctxt->name = NULL; 4779 4780 ctxt->nodeInfoTab = NULL; 4781 ctxt->nodeInfoNr = 0; 4782 ctxt->nodeInfoMax = 0; 4783 4784 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler; 4785 else { 4786 ctxt->sax = sax; 4787 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 4788 } 4789 ctxt->userData = ctxt; 4790 ctxt->myDoc = NULL; 4791 ctxt->wellFormed = 1; 4792 ctxt->replaceEntities = 0; 4793 ctxt->linenumbers = xmlLineNumbersDefaultValue; 4794 ctxt->html = 1; 4795 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0; 4796 ctxt->vctxt.userData = ctxt; 4797 ctxt->vctxt.error = xmlParserValidityError; 4798 ctxt->vctxt.warning = xmlParserValidityWarning; 4799 ctxt->record_info = 0; 4800 ctxt->validate = 0; 4801 ctxt->nbChars = 0; 4802 ctxt->checkIndex = 0; 4803 ctxt->catalogs = NULL; 4804 xmlInitNodeInfoSeq(&ctxt->node_seq); 4805 return(0); 4806 } 4807 4808 /** 4809 * htmlFreeParserCtxt: 4810 * @ctxt: an HTML parser context 4811 * 4812 * Free all the 
memory used by a parser context. However the parsed 4813 * document in ctxt->myDoc is not freed. 4814 */ 4815 4816 void 4817 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt) 4818 { 4819 xmlFreeParserCtxt(ctxt); 4820 } 4821 4822 /** 4823 * htmlNewParserCtxt: 4824 * 4825 * Allocate and initialize a new parser context. 4826 * 4827 * Returns the htmlParserCtxtPtr or NULL in case of allocation error 4828 */ 4829 4830 htmlParserCtxtPtr 4831 htmlNewParserCtxt(void) 4832 { 4833 xmlParserCtxtPtr ctxt; 4834 4835 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt)); 4836 if (ctxt == NULL) { 4837 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n"); 4838 return(NULL); 4839 } 4840 memset(ctxt, 0, sizeof(xmlParserCtxt)); 4841 if (htmlInitParserCtxt(ctxt) < 0) { 4842 htmlFreeParserCtxt(ctxt); 4843 return(NULL); 4844 } 4845 return(ctxt); 4846 } 4847 4848 /** 4849 * htmlCreateMemoryParserCtxt: 4850 * @buffer: a pointer to a char array 4851 * @size: the size of the array 4852 * 4853 * Create a parser context for an HTML in-memory document. 
4854 * 4855 * Returns the new parser context or NULL 4856 */ 4857 htmlParserCtxtPtr 4858 htmlCreateMemoryParserCtxt(const char *buffer, int size) { 4859 xmlParserCtxtPtr ctxt; 4860 xmlParserInputPtr input; 4861 xmlParserInputBufferPtr buf; 4862 4863 if (buffer == NULL) 4864 return(NULL); 4865 if (size <= 0) 4866 return(NULL); 4867 4868 ctxt = htmlNewParserCtxt(); 4869 if (ctxt == NULL) 4870 return(NULL); 4871 4872 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); 4873 if (buf == NULL) return(NULL); 4874 4875 input = xmlNewInputStream(ctxt); 4876 if (input == NULL) { 4877 xmlFreeParserCtxt(ctxt); 4878 return(NULL); 4879 } 4880 4881 input->filename = NULL; 4882 input->buf = buf; 4883 input->base = input->buf->buffer->content; 4884 input->cur = input->buf->buffer->content; 4885 input->end = &input->buf->buffer->content[input->buf->buffer->use]; 4886 4887 inputPush(ctxt, input); 4888 return(ctxt); 4889 } 4890 4891 /** 4892 * htmlCreateDocParserCtxt: 4893 * @cur: a pointer to an array of xmlChar 4894 * @encoding: a free form C string describing the HTML document encoding, or NULL 4895 * 4896 * Create a parser context for an HTML document. 
*
 * TODO: check the need to add encoding handling there
 *
 * Returns the new parser context or NULL
 */
static htmlParserCtxtPtr
htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
    int len;
    htmlParserCtxtPtr ctxt;

    if (cur == NULL)
        return(NULL);
    len = xmlStrlen(cur);
    ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
    if (ctxt == NULL)
        return(NULL);

    if (encoding != NULL) {
        xmlCharEncoding enc;
        xmlCharEncodingHandlerPtr handler;

        /* Record the caller supplied encoding name on the input */
        if (ctxt->input->encoding != NULL)
            xmlFree((xmlChar *) ctxt->input->encoding);
        ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);

        enc = xmlParseCharEncoding(encoding);
        /*
         * registered set of known encodings
         */
        if (enc != XML_CHAR_ENCODING_ERROR) {
            xmlSwitchEncoding(ctxt, enc);
            if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
                htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
                             "Unsupported encoding %s\n",
                             (const xmlChar *) encoding, NULL);
            }
        } else {
            /*
             * fallback for unknown encodings
             */
            handler = xmlFindCharEncodingHandler((const char *) encoding);
            if (handler != NULL) {
                xmlSwitchToEncoding(ctxt, handler);
            } else {
                htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
                             "Unsupported encoding %s\n",
                             (const xmlChar *) encoding, NULL);
            }
        }
    }
    return(ctxt);
}

#ifdef LIBXML_PUSH_ENABLED
/************************************************************************
 *									*
 *		Progressive parsing interfaces				*
 *									*
 ************************************************************************/

/**
 * htmlParseLookupSequence:
 * @ctxt:  an HTML parser context
 * @first:  the first char to lookup
 * @next:  the next char to lookup or zero
 * @third:  the next char to lookup or zero
 * @iscomment: flag to force checking inside comments; when zero,
 *             anything between an opening and a closing comment
 *             delimiter is skipped
 * @ignoreattrval: when non-zero, characters found between a pair of
 *                 identical quotes (single or double) are skipped
 *
 * Try to find if a sequence (first, next, third) or just (first next) or
 * (first) is available in the input stream.
 * This function has a side effect of (possibly) updating ctxt->checkIndex
 * to avoid rescanning sequences of bytes, it DOES change the state of the
 * parser, do not use liberally.
 * This is basically similar to xmlParseLookupSequence()
 *
 * Returns the index to the current parsing point if the full sequence
 *      is available, -1 otherwise.
 */
static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
                        xmlChar next, xmlChar third, int iscomment,
                        int ignoreattrval)
{
    int base, len;
    htmlParserInputPtr in;
    const xmlChar *buf;
    int incomment = 0;
    int invalue = 0;
    char valdellim = 0x0;

    in = ctxt->input;
    if (in == NULL)
        return (-1);

    base = in->cur - in->base;
    if (base < 0)
        return (-1);

    /* Resume from where the previous unsuccessful lookup stopped */
    if (ctxt->checkIndex > base)
        base = ctxt->checkIndex;

    if (in->buf == NULL) {
        buf = in->base;
        len = in->length;
    } else {
        buf = in->buf->buffer->content;
        len = in->buf->buffer->use;
    }

    /* take into account the sequence length */
    if (third)
        len -= 2;
    else if (next)
        len--;
    for (; base < len; base++) {
        if ((!incomment) && (base + 4 < len) && (!iscomment)) {
            if ((buf[base] == '<') && (buf[base + 1] == '!') &&
                (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
                incomment = 1;
                /* do not increment past <! - some people use <!--> */
                base += 2;
            }
        }
        if (ignoreattrval) {
            if (buf[base] == '"' || buf[base] == '\'') {
                if (invalue) {
                    /* only the quote that opened the value closes it */
                    if (buf[base] == valdellim) {
                        invalue = 0;
                        continue;
                    }
                } else {
                    valdellim = buf[base];
                    invalue = 1;
                    continue;
                }
            } else if (invalue) {
                continue;
            }
        }
        if (incomment) {
            /* not enough data left to hold the end of the comment */
            if (base + 3 > len)
                return (-1);
            if ((buf[base] == '-') && (buf[base + 1] == '-') &&
                (buf[base + 2] == '>')) {
                incomment = 0;
                base += 2;
            }
            continue;
        }
        if (buf[base] == first) {
            if (third != 0) {
                if ((buf[base + 1] != next) || (buf[base + 2] != third))
                    continue;
            } else if (next != 0) {
                if (buf[base + 1] != next)
                    continue;
            }
            /* found: the next lookup starts from the parsing point again */
            ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
            if (next == 0)
                xmlGenericError(xmlGenericErrorContext,
                                "HPP: lookup '%c' found at %d\n",
                                first, base);
            else if (third == 0)
                xmlGenericError(xmlGenericErrorContext,
                                "HPP: lookup '%c%c' found at %d\n",
                                first, next, base);
            else
                xmlGenericError(xmlGenericErrorContext,
                                "HPP: lookup '%c%c%c' found at %d\n",
                                first, next, third, base);
#endif
            return (base - (in->cur - in->base));
        }
    }
    /*
     * Not found: remember how far we scanned, unless we stopped inside
     * a comment or a quoted value (that state is not kept between calls).
     */
    if ((!incomment) && (!invalue))
        ctxt->checkIndex = base;
#ifdef DEBUG_PUSH
    if (next == 0)
        xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c' failed\n", first);
    else if (third == 0)
        xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c%c' failed\n", first, next);
    else
        xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c%c%c' failed\n", first, next,
                        third);
#endif
    return (-1);
}

/**
 * htmlParseLookupChars:
 * @ctxt: an HTML parser context
 * @stop: Array of chars, which stop the lookup.
* @stopLen: Length of stop-Array
 *
 * Try to find if any char of the stop-Array is available in the input
 * stream. Anything between an opening and a closing comment delimiter
 * is skipped.
 * This function has a side effect of (possibly) updating ctxt->checkIndex
 * to avoid rescanning sequences of bytes, it DOES change the state of the
 * parser, do not use liberally.
 *
 * Returns the index to the current parsing point if a stopChar
 *      is available, -1 otherwise.
 */
static int
htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
                     int stopLen)
{
    int base, len;
    htmlParserInputPtr in;
    const xmlChar *buf;
    int incomment = 0;
    int i;

    in = ctxt->input;
    if (in == NULL)
        return (-1);

    base = in->cur - in->base;
    if (base < 0)
        return (-1);

    /* Resume from where the previous unsuccessful lookup stopped */
    if (ctxt->checkIndex > base)
        base = ctxt->checkIndex;

    if (in->buf == NULL) {
        buf = in->base;
        len = in->length;
    } else {
        buf = in->buf->buffer->content;
        len = in->buf->buffer->use;
    }

    for (; base < len; base++) {
        if (!incomment && (base + 4 < len)) {
            if ((buf[base] == '<') && (buf[base + 1] == '!') &&
                (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
                incomment = 1;
                /* do not increment past <! - some people use <!--> */
                base += 2;
            }
        }
        if (incomment) {
            /* not enough data left to hold the end of the comment */
            if (base + 3 > len)
                return (-1);
            if ((buf[base] == '-') && (buf[base + 1] == '-') &&
                (buf[base + 2] == '>')) {
                incomment = 0;
                base += 2;
            }
            continue;
        }
        for (i = 0; i < stopLen; ++i) {
            if (buf[base] == stop[i]) {
                /* found: the next lookup starts from the parsing point */
                ctxt->checkIndex = 0;
                return (base - (in->cur - in->base));
            }
        }
    }
    /* Not found: remember how far we scanned */
    ctxt->checkIndex = base;
    return (-1);
}

/**
 * htmlParseTryOrFinish:
 * @ctxt:  an HTML parser context
 * @terminate:  last chunk indicator
 *
 * Try to progress on parsing
 *
 * Returns zero if no parsing was possible
 */
static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
    int ret = 0;
    htmlParserInputPtr in;
    int avail = 0;
    xmlChar cur, next;

#ifdef DEBUG_PUSH
    switch (ctxt->instate) {
        case XML_PARSER_EOF:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EOF\n"); break;
        case XML_PARSER_START:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START\n"); break;
        case XML_PARSER_MISC:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try MISC\n");break;
        case XML_PARSER_COMMENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try COMMENT\n");break;
        case XML_PARSER_PROLOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PROLOG\n");break;
        case XML_PARSER_START_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START_TAG\n");break;
        case XML_PARSER_CONTENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CONTENT\n");break;
        case XML_PARSER_CDATA_SECTION:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CDATA_SECTION\n");break;
        case XML_PARSER_END_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try END_TAG\n");break;
        case XML_PARSER_ENTITY_DECL:
xmlGenericError(xmlGenericErrorContext, 5210 "HPP: try ENTITY_DECL\n");break; 5211 case XML_PARSER_ENTITY_VALUE: 5212 xmlGenericError(xmlGenericErrorContext, 5213 "HPP: try ENTITY_VALUE\n");break; 5214 case XML_PARSER_ATTRIBUTE_VALUE: 5215 xmlGenericError(xmlGenericErrorContext, 5216 "HPP: try ATTRIBUTE_VALUE\n");break; 5217 case XML_PARSER_DTD: 5218 xmlGenericError(xmlGenericErrorContext, 5219 "HPP: try DTD\n");break; 5220 case XML_PARSER_EPILOG: 5221 xmlGenericError(xmlGenericErrorContext, 5222 "HPP: try EPILOG\n");break; 5223 case XML_PARSER_PI: 5224 xmlGenericError(xmlGenericErrorContext, 5225 "HPP: try PI\n");break; 5226 case XML_PARSER_SYSTEM_LITERAL: 5227 xmlGenericError(xmlGenericErrorContext, 5228 "HPP: try SYSTEM_LITERAL\n");break; 5229 } 5230 #endif 5231 5232 while (1) { 5233 5234 in = ctxt->input; 5235 if (in == NULL) break; 5236 if (in->buf == NULL) 5237 avail = in->length - (in->cur - in->base); 5238 else 5239 avail = in->buf->buffer->use - (in->cur - in->base); 5240 if ((avail == 0) && (terminate)) { 5241 htmlAutoCloseOnEnd(ctxt); 5242 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { 5243 /* 5244 * SAX: end of the document processing. 5245 */ 5246 ctxt->instate = XML_PARSER_EOF; 5247 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5248 ctxt->sax->endDocument(ctxt->userData); 5249 } 5250 } 5251 if (avail < 1) 5252 goto done; 5253 cur = in->cur[0]; 5254 if (cur == 0) { 5255 SKIP(1); 5256 continue; 5257 } 5258 5259 switch (ctxt->instate) { 5260 case XML_PARSER_EOF: 5261 /* 5262 * Document parsing is done ! 5263 */ 5264 goto done; 5265 case XML_PARSER_START: 5266 /* 5267 * Very first chars read from the document flow. 
5268 */ 5269 cur = in->cur[0]; 5270 if (IS_BLANK_CH(cur)) { 5271 SKIP_BLANKS; 5272 if (in->buf == NULL) 5273 avail = in->length - (in->cur - in->base); 5274 else 5275 avail = in->buf->buffer->use - (in->cur - in->base); 5276 } 5277 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) 5278 ctxt->sax->setDocumentLocator(ctxt->userData, 5279 &xmlDefaultSAXLocator); 5280 if ((ctxt->sax) && (ctxt->sax->startDocument) && 5281 (!ctxt->disableSAX)) 5282 ctxt->sax->startDocument(ctxt->userData); 5283 5284 cur = in->cur[0]; 5285 next = in->cur[1]; 5286 if ((cur == '<') && (next == '!') && 5287 (UPP(2) == 'D') && (UPP(3) == 'O') && 5288 (UPP(4) == 'C') && (UPP(5) == 'T') && 5289 (UPP(6) == 'Y') && (UPP(7) == 'P') && 5290 (UPP(8) == 'E')) { 5291 if ((!terminate) && 5292 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5293 goto done; 5294 #ifdef DEBUG_PUSH 5295 xmlGenericError(xmlGenericErrorContext, 5296 "HPP: Parsing internal subset\n"); 5297 #endif 5298 htmlParseDocTypeDecl(ctxt); 5299 ctxt->instate = XML_PARSER_PROLOG; 5300 #ifdef DEBUG_PUSH 5301 xmlGenericError(xmlGenericErrorContext, 5302 "HPP: entering PROLOG\n"); 5303 #endif 5304 } else { 5305 ctxt->instate = XML_PARSER_MISC; 5306 #ifdef DEBUG_PUSH 5307 xmlGenericError(xmlGenericErrorContext, 5308 "HPP: entering MISC\n"); 5309 #endif 5310 } 5311 break; 5312 case XML_PARSER_MISC: 5313 SKIP_BLANKS; 5314 if (in->buf == NULL) 5315 avail = in->length - (in->cur - in->base); 5316 else 5317 avail = in->buf->buffer->use - (in->cur - in->base); 5318 if (avail < 2) 5319 goto done; 5320 cur = in->cur[0]; 5321 next = in->cur[1]; 5322 if ((cur == '<') && (next == '!') && 5323 (in->cur[2] == '-') && (in->cur[3] == '-')) { 5324 if ((!terminate) && 5325 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 5326 goto done; 5327 #ifdef DEBUG_PUSH 5328 xmlGenericError(xmlGenericErrorContext, 5329 "HPP: Parsing Comment\n"); 5330 #endif 5331 htmlParseComment(ctxt); 5332 ctxt->instate = XML_PARSER_MISC; 5333 } else if ((cur == '<') 
&& (next == '?')) { 5334 if ((!terminate) && 5335 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5336 goto done; 5337 #ifdef DEBUG_PUSH 5338 xmlGenericError(xmlGenericErrorContext, 5339 "HPP: Parsing PI\n"); 5340 #endif 5341 htmlParsePI(ctxt); 5342 ctxt->instate = XML_PARSER_MISC; 5343 } else if ((cur == '<') && (next == '!') && 5344 (UPP(2) == 'D') && (UPP(3) == 'O') && 5345 (UPP(4) == 'C') && (UPP(5) == 'T') && 5346 (UPP(6) == 'Y') && (UPP(7) == 'P') && 5347 (UPP(8) == 'E')) { 5348 if ((!terminate) && 5349 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5350 goto done; 5351 #ifdef DEBUG_PUSH 5352 xmlGenericError(xmlGenericErrorContext, 5353 "HPP: Parsing internal subset\n"); 5354 #endif 5355 htmlParseDocTypeDecl(ctxt); 5356 ctxt->instate = XML_PARSER_PROLOG; 5357 #ifdef DEBUG_PUSH 5358 xmlGenericError(xmlGenericErrorContext, 5359 "HPP: entering PROLOG\n"); 5360 #endif 5361 } else if ((cur == '<') && (next == '!') && 5362 (avail < 9)) { 5363 goto done; 5364 } else { 5365 ctxt->instate = XML_PARSER_START_TAG; 5366 #ifdef DEBUG_PUSH 5367 xmlGenericError(xmlGenericErrorContext, 5368 "HPP: entering START_TAG\n"); 5369 #endif 5370 } 5371 break; 5372 case XML_PARSER_PROLOG: 5373 SKIP_BLANKS; 5374 if (in->buf == NULL) 5375 avail = in->length - (in->cur - in->base); 5376 else 5377 avail = in->buf->buffer->use - (in->cur - in->base); 5378 if (avail < 2) 5379 goto done; 5380 cur = in->cur[0]; 5381 next = in->cur[1]; 5382 if ((cur == '<') && (next == '!') && 5383 (in->cur[2] == '-') && (in->cur[3] == '-')) { 5384 if ((!terminate) && 5385 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 5386 goto done; 5387 #ifdef DEBUG_PUSH 5388 xmlGenericError(xmlGenericErrorContext, 5389 "HPP: Parsing Comment\n"); 5390 #endif 5391 htmlParseComment(ctxt); 5392 ctxt->instate = XML_PARSER_PROLOG; 5393 } else if ((cur == '<') && (next == '?')) { 5394 if ((!terminate) && 5395 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5396 goto done; 5397 #ifdef DEBUG_PUSH 5398 
xmlGenericError(xmlGenericErrorContext, 5399 "HPP: Parsing PI\n"); 5400 #endif 5401 htmlParsePI(ctxt); 5402 ctxt->instate = XML_PARSER_PROLOG; 5403 } else if ((cur == '<') && (next == '!') && 5404 (avail < 4)) { 5405 goto done; 5406 } else { 5407 ctxt->instate = XML_PARSER_START_TAG; 5408 #ifdef DEBUG_PUSH 5409 xmlGenericError(xmlGenericErrorContext, 5410 "HPP: entering START_TAG\n"); 5411 #endif 5412 } 5413 break; 5414 case XML_PARSER_EPILOG: 5415 if (in->buf == NULL) 5416 avail = in->length - (in->cur - in->base); 5417 else 5418 avail = in->buf->buffer->use - (in->cur - in->base); 5419 if (avail < 1) 5420 goto done; 5421 cur = in->cur[0]; 5422 if (IS_BLANK_CH(cur)) { 5423 htmlParseCharData(ctxt); 5424 goto done; 5425 } 5426 if (avail < 2) 5427 goto done; 5428 next = in->cur[1]; 5429 if ((cur == '<') && (next == '!') && 5430 (in->cur[2] == '-') && (in->cur[3] == '-')) { 5431 if ((!terminate) && 5432 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 5433 goto done; 5434 #ifdef DEBUG_PUSH 5435 xmlGenericError(xmlGenericErrorContext, 5436 "HPP: Parsing Comment\n"); 5437 #endif 5438 htmlParseComment(ctxt); 5439 ctxt->instate = XML_PARSER_EPILOG; 5440 } else if ((cur == '<') && (next == '?')) { 5441 if ((!terminate) && 5442 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5443 goto done; 5444 #ifdef DEBUG_PUSH 5445 xmlGenericError(xmlGenericErrorContext, 5446 "HPP: Parsing PI\n"); 5447 #endif 5448 htmlParsePI(ctxt); 5449 ctxt->instate = XML_PARSER_EPILOG; 5450 } else if ((cur == '<') && (next == '!') && 5451 (avail < 4)) { 5452 goto done; 5453 } else { 5454 ctxt->errNo = XML_ERR_DOCUMENT_END; 5455 ctxt->wellFormed = 0; 5456 ctxt->instate = XML_PARSER_EOF; 5457 #ifdef DEBUG_PUSH 5458 xmlGenericError(xmlGenericErrorContext, 5459 "HPP: entering EOF\n"); 5460 #endif 5461 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5462 ctxt->sax->endDocument(ctxt->userData); 5463 goto done; 5464 } 5465 break; 5466 case XML_PARSER_START_TAG: { 5467 const xmlChar *name; 
5468 int failed; 5469 const htmlElemDesc * info; 5470 5471 if (avail < 2) 5472 goto done; 5473 cur = in->cur[0]; 5474 if (cur != '<') { 5475 ctxt->instate = XML_PARSER_CONTENT; 5476 #ifdef DEBUG_PUSH 5477 xmlGenericError(xmlGenericErrorContext, 5478 "HPP: entering CONTENT\n"); 5479 #endif 5480 break; 5481 } 5482 if (in->cur[1] == '/') { 5483 ctxt->instate = XML_PARSER_END_TAG; 5484 ctxt->checkIndex = 0; 5485 #ifdef DEBUG_PUSH 5486 xmlGenericError(xmlGenericErrorContext, 5487 "HPP: entering END_TAG\n"); 5488 #endif 5489 break; 5490 } 5491 if ((!terminate) && 5492 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5493 goto done; 5494 5495 failed = htmlParseStartTag(ctxt); 5496 name = ctxt->name; 5497 if ((failed == -1) || 5498 (name == NULL)) { 5499 if (CUR == '>') 5500 NEXT; 5501 break; 5502 } 5503 5504 /* 5505 * Lookup the info for that element. 5506 */ 5507 info = htmlTagLookup(name); 5508 if (info == NULL) { 5509 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 5510 "Tag %s invalid\n", name, NULL); 5511 } 5512 5513 /* 5514 * Check for an Empty Element labeled the XML/SGML way 5515 */ 5516 if ((CUR == '/') && (NXT(1) == '>')) { 5517 SKIP(2); 5518 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 5519 ctxt->sax->endElement(ctxt->userData, name); 5520 htmlnamePop(ctxt); 5521 ctxt->instate = XML_PARSER_CONTENT; 5522 #ifdef DEBUG_PUSH 5523 xmlGenericError(xmlGenericErrorContext, 5524 "HPP: entering CONTENT\n"); 5525 #endif 5526 break; 5527 } 5528 5529 if (CUR == '>') { 5530 NEXT; 5531 } else { 5532 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 5533 "Couldn't find end of Start Tag %s\n", 5534 name, NULL); 5535 5536 /* 5537 * end of parsing of this node. 
5538 */ 5539 if (xmlStrEqual(name, ctxt->name)) { 5540 nodePop(ctxt); 5541 htmlnamePop(ctxt); 5542 } 5543 5544 ctxt->instate = XML_PARSER_CONTENT; 5545 #ifdef DEBUG_PUSH 5546 xmlGenericError(xmlGenericErrorContext, 5547 "HPP: entering CONTENT\n"); 5548 #endif 5549 break; 5550 } 5551 5552 /* 5553 * Check for an Empty Element from DTD definition 5554 */ 5555 if ((info != NULL) && (info->empty)) { 5556 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 5557 ctxt->sax->endElement(ctxt->userData, name); 5558 htmlnamePop(ctxt); 5559 } 5560 ctxt->instate = XML_PARSER_CONTENT; 5561 #ifdef DEBUG_PUSH 5562 xmlGenericError(xmlGenericErrorContext, 5563 "HPP: entering CONTENT\n"); 5564 #endif 5565 break; 5566 } 5567 case XML_PARSER_CONTENT: { 5568 long cons; 5569 /* 5570 * Handle preparsed entities and charRef 5571 */ 5572 if (ctxt->token != 0) { 5573 xmlChar chr[2] = { 0 , 0 } ; 5574 5575 chr[0] = (xmlChar) ctxt->token; 5576 htmlCheckParagraph(ctxt); 5577 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 5578 ctxt->sax->characters(ctxt->userData, chr, 1); 5579 ctxt->token = 0; 5580 ctxt->checkIndex = 0; 5581 } 5582 if ((avail == 1) && (terminate)) { 5583 cur = in->cur[0]; 5584 if ((cur != '<') && (cur != '&')) { 5585 if (ctxt->sax != NULL) { 5586 if (IS_BLANK_CH(cur)) { 5587 if (ctxt->sax->ignorableWhitespace != NULL) 5588 ctxt->sax->ignorableWhitespace( 5589 ctxt->userData, &cur, 1); 5590 } else { 5591 htmlCheckParagraph(ctxt); 5592 if (ctxt->sax->characters != NULL) 5593 ctxt->sax->characters( 5594 ctxt->userData, &cur, 1); 5595 } 5596 } 5597 ctxt->token = 0; 5598 ctxt->checkIndex = 0; 5599 in->cur++; 5600 break; 5601 } 5602 } 5603 if (avail < 2) 5604 goto done; 5605 cur = in->cur[0]; 5606 next = in->cur[1]; 5607 cons = ctxt->nbChars; 5608 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) || 5609 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) { 5610 /* 5611 * Handle SCRIPT/STYLE separately 5612 */ 5613 if (!terminate) { 5614 int idx; 5615 xmlChar val; 5616 5617 
idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0); 5618 if (idx < 0) 5619 goto done; 5620 val = in->cur[idx + 2]; 5621 if (val == 0) /* bad cut of input */ 5622 goto done; 5623 } 5624 htmlParseScript(ctxt); 5625 if ((cur == '<') && (next == '/')) { 5626 ctxt->instate = XML_PARSER_END_TAG; 5627 ctxt->checkIndex = 0; 5628 #ifdef DEBUG_PUSH 5629 xmlGenericError(xmlGenericErrorContext, 5630 "HPP: entering END_TAG\n"); 5631 #endif 5632 break; 5633 } 5634 } else { 5635 /* 5636 * Sometimes DOCTYPE arrives in the middle of the document 5637 */ 5638 if ((cur == '<') && (next == '!') && 5639 (UPP(2) == 'D') && (UPP(3) == 'O') && 5640 (UPP(4) == 'C') && (UPP(5) == 'T') && 5641 (UPP(6) == 'Y') && (UPP(7) == 'P') && 5642 (UPP(8) == 'E')) { 5643 if ((!terminate) && 5644 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5645 goto done; 5646 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 5647 "Misplaced DOCTYPE declaration\n", 5648 BAD_CAST "DOCTYPE" , NULL); 5649 htmlParseDocTypeDecl(ctxt); 5650 } else if ((cur == '<') && (next == '!') && 5651 (in->cur[2] == '-') && (in->cur[3] == '-')) { 5652 if ((!terminate) && 5653 (htmlParseLookupSequence( 5654 ctxt, '-', '-', '>', 1, 1) < 0)) 5655 goto done; 5656 #ifdef DEBUG_PUSH 5657 xmlGenericError(xmlGenericErrorContext, 5658 "HPP: Parsing Comment\n"); 5659 #endif 5660 htmlParseComment(ctxt); 5661 ctxt->instate = XML_PARSER_CONTENT; 5662 } else if ((cur == '<') && (next == '?')) { 5663 if ((!terminate) && 5664 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5665 goto done; 5666 #ifdef DEBUG_PUSH 5667 xmlGenericError(xmlGenericErrorContext, 5668 "HPP: Parsing PI\n"); 5669 #endif 5670 htmlParsePI(ctxt); 5671 ctxt->instate = XML_PARSER_CONTENT; 5672 } else if ((cur == '<') && (next == '!') && (avail < 4)) { 5673 goto done; 5674 } else if ((cur == '<') && (next == '/')) { 5675 ctxt->instate = XML_PARSER_END_TAG; 5676 ctxt->checkIndex = 0; 5677 #ifdef DEBUG_PUSH 5678 xmlGenericError(xmlGenericErrorContext, 5679 "HPP: entering 
END_TAG\n"); 5680 #endif 5681 break; 5682 } else if (cur == '<') { 5683 ctxt->instate = XML_PARSER_START_TAG; 5684 ctxt->checkIndex = 0; 5685 #ifdef DEBUG_PUSH 5686 xmlGenericError(xmlGenericErrorContext, 5687 "HPP: entering START_TAG\n"); 5688 #endif 5689 break; 5690 } else if (cur == '&') { 5691 if ((!terminate) && 5692 (htmlParseLookupChars(ctxt, 5693 BAD_CAST "; >/", 4) < 0)) 5694 goto done; 5695 #ifdef DEBUG_PUSH 5696 xmlGenericError(xmlGenericErrorContext, 5697 "HPP: Parsing Reference\n"); 5698 #endif 5699 /* TODO: check generation of subtrees if noent !!! */ 5700 htmlParseReference(ctxt); 5701 } else { 5702 /* 5703 * check that the text sequence is complete 5704 * before handing out the data to the parser 5705 * to avoid problems with erroneous end of 5706 * data detection. 5707 */ 5708 if ((!terminate) && 5709 (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0)) 5710 goto done; 5711 ctxt->checkIndex = 0; 5712 #ifdef DEBUG_PUSH 5713 xmlGenericError(xmlGenericErrorContext, 5714 "HPP: Parsing char data\n"); 5715 #endif 5716 htmlParseCharData(ctxt); 5717 } 5718 } 5719 if (cons == ctxt->nbChars) { 5720 if (ctxt->node != NULL) { 5721 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5722 "detected an error in element content\n", 5723 NULL, NULL); 5724 } 5725 NEXT; 5726 break; 5727 } 5728 5729 break; 5730 } 5731 case XML_PARSER_END_TAG: 5732 if (avail < 2) 5733 goto done; 5734 if ((!terminate) && 5735 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5736 goto done; 5737 htmlParseEndTag(ctxt); 5738 if (ctxt->nameNr == 0) { 5739 ctxt->instate = XML_PARSER_EPILOG; 5740 } else { 5741 ctxt->instate = XML_PARSER_CONTENT; 5742 } 5743 ctxt->checkIndex = 0; 5744 #ifdef DEBUG_PUSH 5745 xmlGenericError(xmlGenericErrorContext, 5746 "HPP: entering CONTENT\n"); 5747 #endif 5748 break; 5749 case XML_PARSER_CDATA_SECTION: 5750 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5751 "HPP: internal error, state == CDATA\n", 5752 NULL, NULL); 5753 ctxt->instate = XML_PARSER_CONTENT; 5754 
ctxt->checkIndex = 0; 5755 #ifdef DEBUG_PUSH 5756 xmlGenericError(xmlGenericErrorContext, 5757 "HPP: entering CONTENT\n"); 5758 #endif 5759 break; 5760 case XML_PARSER_DTD: 5761 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5762 "HPP: internal error, state == DTD\n", 5763 NULL, NULL); 5764 ctxt->instate = XML_PARSER_CONTENT; 5765 ctxt->checkIndex = 0; 5766 #ifdef DEBUG_PUSH 5767 xmlGenericError(xmlGenericErrorContext, 5768 "HPP: entering CONTENT\n"); 5769 #endif 5770 break; 5771 case XML_PARSER_COMMENT: 5772 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5773 "HPP: internal error, state == COMMENT\n", 5774 NULL, NULL); 5775 ctxt->instate = XML_PARSER_CONTENT; 5776 ctxt->checkIndex = 0; 5777 #ifdef DEBUG_PUSH 5778 xmlGenericError(xmlGenericErrorContext, 5779 "HPP: entering CONTENT\n"); 5780 #endif 5781 break; 5782 case XML_PARSER_PI: 5783 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5784 "HPP: internal error, state == PI\n", 5785 NULL, NULL); 5786 ctxt->instate = XML_PARSER_CONTENT; 5787 ctxt->checkIndex = 0; 5788 #ifdef DEBUG_PUSH 5789 xmlGenericError(xmlGenericErrorContext, 5790 "HPP: entering CONTENT\n"); 5791 #endif 5792 break; 5793 case XML_PARSER_ENTITY_DECL: 5794 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5795 "HPP: internal error, state == ENTITY_DECL\n", 5796 NULL, NULL); 5797 ctxt->instate = XML_PARSER_CONTENT; 5798 ctxt->checkIndex = 0; 5799 #ifdef DEBUG_PUSH 5800 xmlGenericError(xmlGenericErrorContext, 5801 "HPP: entering CONTENT\n"); 5802 #endif 5803 break; 5804 case XML_PARSER_ENTITY_VALUE: 5805 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5806 "HPP: internal error, state == ENTITY_VALUE\n", 5807 NULL, NULL); 5808 ctxt->instate = XML_PARSER_CONTENT; 5809 ctxt->checkIndex = 0; 5810 #ifdef DEBUG_PUSH 5811 xmlGenericError(xmlGenericErrorContext, 5812 "HPP: entering DTD\n"); 5813 #endif 5814 break; 5815 case XML_PARSER_ATTRIBUTE_VALUE: 5816 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5817 "HPP: internal error, state == ATTRIBUTE_VALUE\n", 5818 NULL, NULL); 5819 
ctxt->instate = XML_PARSER_START_TAG; 5820 ctxt->checkIndex = 0; 5821 #ifdef DEBUG_PUSH 5822 xmlGenericError(xmlGenericErrorContext, 5823 "HPP: entering START_TAG\n"); 5824 #endif 5825 break; 5826 case XML_PARSER_SYSTEM_LITERAL: 5827 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5828 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n", 5829 NULL, NULL); 5830 ctxt->instate = XML_PARSER_CONTENT; 5831 ctxt->checkIndex = 0; 5832 #ifdef DEBUG_PUSH 5833 xmlGenericError(xmlGenericErrorContext, 5834 "HPP: entering CONTENT\n"); 5835 #endif 5836 break; 5837 case XML_PARSER_IGNORE: 5838 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5839 "HPP: internal error, state == XML_PARSER_IGNORE\n", 5840 NULL, NULL); 5841 ctxt->instate = XML_PARSER_CONTENT; 5842 ctxt->checkIndex = 0; 5843 #ifdef DEBUG_PUSH 5844 xmlGenericError(xmlGenericErrorContext, 5845 "HPP: entering CONTENT\n"); 5846 #endif 5847 break; 5848 case XML_PARSER_PUBLIC_LITERAL: 5849 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5850 "HPP: internal error, state == XML_PARSER_LITERAL\n", 5851 NULL, NULL); 5852 ctxt->instate = XML_PARSER_CONTENT; 5853 ctxt->checkIndex = 0; 5854 #ifdef DEBUG_PUSH 5855 xmlGenericError(xmlGenericErrorContext, 5856 "HPP: entering CONTENT\n"); 5857 #endif 5858 break; 5859 5860 } 5861 } 5862 done: 5863 if ((avail == 0) && (terminate)) { 5864 htmlAutoCloseOnEnd(ctxt); 5865 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { 5866 /* 5867 * SAX: end of the document processing. 
5868 */ 5869 ctxt->instate = XML_PARSER_EOF; 5870 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5871 ctxt->sax->endDocument(ctxt->userData); 5872 } 5873 } 5874 if ((ctxt->myDoc != NULL) && 5875 ((terminate) || (ctxt->instate == XML_PARSER_EOF) || 5876 (ctxt->instate == XML_PARSER_EPILOG))) { 5877 xmlDtdPtr dtd; 5878 dtd = xmlGetIntSubset(ctxt->myDoc); 5879 if (dtd == NULL) 5880 ctxt->myDoc->intSubset = 5881 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 5882 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", 5883 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); 5884 } 5885 #ifdef DEBUG_PUSH 5886 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret); 5887 #endif 5888 return(ret); 5889 } 5890 5891 /** 5892 * htmlParseChunk: 5893 * @ctxt: an HTML parser context 5894 * @chunk: an char array 5895 * @size: the size in byte of the chunk 5896 * @terminate: last chunk indicator 5897 * 5898 * Parse a Chunk of memory 5899 * 5900 * Returns zero if no error, the xmlParserErrors otherwise. 
5901 */ 5902 int 5903 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, 5904 int terminate) { 5905 if ((ctxt == NULL) || (ctxt->input == NULL)) { 5906 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5907 "htmlParseChunk: context error\n", NULL, NULL); 5908 return(XML_ERR_INTERNAL_ERROR); 5909 } 5910 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 5911 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) { 5912 int base = ctxt->input->base - ctxt->input->buf->buffer->content; 5913 int cur = ctxt->input->cur - ctxt->input->base; 5914 int res; 5915 5916 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 5917 if (res < 0) { 5918 ctxt->errNo = XML_PARSER_EOF; 5919 ctxt->disableSAX = 1; 5920 return (XML_PARSER_EOF); 5921 } 5922 ctxt->input->base = ctxt->input->buf->buffer->content + base; 5923 ctxt->input->cur = ctxt->input->base + cur; 5924 ctxt->input->end = 5925 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; 5926 #ifdef DEBUG_PUSH 5927 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 5928 #endif 5929 5930 #if 0 5931 if ((terminate) || (ctxt->input->buf->buffer->use > 80)) 5932 htmlParseTryOrFinish(ctxt, terminate); 5933 #endif 5934 } else if (ctxt->instate != XML_PARSER_EOF) { 5935 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) { 5936 xmlParserInputBufferPtr in = ctxt->input->buf; 5937 if ((in->encoder != NULL) && (in->buffer != NULL) && 5938 (in->raw != NULL)) { 5939 int nbchars; 5940 5941 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw); 5942 if (nbchars < 0) { 5943 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 5944 "encoder error\n", NULL, NULL); 5945 return(XML_ERR_INVALID_ENCODING); 5946 } 5947 } 5948 } 5949 } 5950 htmlParseTryOrFinish(ctxt, terminate); 5951 if (terminate) { 5952 if ((ctxt->instate != XML_PARSER_EOF) && 5953 (ctxt->instate != XML_PARSER_EPILOG) && 5954 (ctxt->instate != XML_PARSER_MISC)) { 5955 ctxt->errNo = XML_ERR_DOCUMENT_END; 5956 
ctxt->wellFormed = 0; 5957 } 5958 if (ctxt->instate != XML_PARSER_EOF) { 5959 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5960 ctxt->sax->endDocument(ctxt->userData); 5961 } 5962 ctxt->instate = XML_PARSER_EOF; 5963 } 5964 return((xmlParserErrors) ctxt->errNo); 5965 } 5966 5967 /************************************************************************ 5968 * * 5969 * User entry points * 5970 * * 5971 ************************************************************************/ 5972 5973 /** 5974 * htmlCreatePushParserCtxt: 5975 * @sax: a SAX handler 5976 * @user_data: The user data returned on SAX callbacks 5977 * @chunk: a pointer to an array of chars 5978 * @size: number of chars in the array 5979 * @filename: an optional file name or URI 5980 * @enc: an optional encoding 5981 * 5982 * Create a parser context for using the HTML parser in push mode 5983 * The value of @filename is used for fetching external entities 5984 * and error/warning reports. 5985 * 5986 * Returns the new parser context or NULL 5987 */ 5988 htmlParserCtxtPtr 5989 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, 5990 const char *chunk, int size, const char *filename, 5991 xmlCharEncoding enc) { 5992 htmlParserCtxtPtr ctxt; 5993 htmlParserInputPtr inputStream; 5994 xmlParserInputBufferPtr buf; 5995 5996 xmlInitParser(); 5997 5998 buf = xmlAllocParserInputBuffer(enc); 5999 if (buf == NULL) return(NULL); 6000 6001 ctxt = htmlNewParserCtxt(); 6002 if (ctxt == NULL) { 6003 xmlFreeParserInputBuffer(buf); 6004 return(NULL); 6005 } 6006 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder) 6007 ctxt->charset=XML_CHAR_ENCODING_UTF8; 6008 if (sax != NULL) { 6009 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler) 6010 xmlFree(ctxt->sax); 6011 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler)); 6012 if (ctxt->sax == NULL) { 6013 xmlFree(buf); 6014 xmlFree(ctxt); 6015 return(NULL); 6016 } 6017 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler)); 6018 if (user_data != 
NULL) 6019 ctxt->userData = user_data; 6020 } 6021 if (filename == NULL) { 6022 ctxt->directory = NULL; 6023 } else { 6024 ctxt->directory = xmlParserGetDirectory(filename); 6025 } 6026 6027 inputStream = htmlNewInputStream(ctxt); 6028 if (inputStream == NULL) { 6029 xmlFreeParserCtxt(ctxt); 6030 xmlFree(buf); 6031 return(NULL); 6032 } 6033 6034 if (filename == NULL) 6035 inputStream->filename = NULL; 6036 else 6037 inputStream->filename = (char *) 6038 xmlCanonicPath((const xmlChar *) filename); 6039 inputStream->buf = buf; 6040 inputStream->base = inputStream->buf->buffer->content; 6041 inputStream->cur = inputStream->buf->buffer->content; 6042 inputStream->end = 6043 &inputStream->buf->buffer->content[inputStream->buf->buffer->use]; 6044 6045 inputPush(ctxt, inputStream); 6046 6047 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 6048 (ctxt->input->buf != NULL)) { 6049 int base = ctxt->input->base - ctxt->input->buf->buffer->content; 6050 int cur = ctxt->input->cur - ctxt->input->base; 6051 6052 xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 6053 6054 ctxt->input->base = ctxt->input->buf->buffer->content + base; 6055 ctxt->input->cur = ctxt->input->base + cur; 6056 ctxt->input->end = 6057 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; 6058 #ifdef DEBUG_PUSH 6059 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 6060 #endif 6061 } 6062 ctxt->progressive = 1; 6063 6064 return(ctxt); 6065 } 6066 #endif /* LIBXML_PUSH_ENABLED */ 6067 6068 /** 6069 * htmlSAXParseDoc: 6070 * @cur: a pointer to an array of xmlChar 6071 * @encoding: a free form C string describing the HTML document encoding, or NULL 6072 * @sax: the SAX handler block 6073 * @userData: if using SAX, this pointer will be provided on callbacks. 6074 * 6075 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks 6076 * to handle parse events. If sax is NULL, fallback to the default DOM 6077 * behavior and return a tree. 
6078 * 6079 * Returns the resulting document tree unless SAX is NULL or the document is 6080 * not well formed. 6081 */ 6082 6083 htmlDocPtr 6084 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) { 6085 htmlDocPtr ret; 6086 htmlParserCtxtPtr ctxt; 6087 6088 xmlInitParser(); 6089 6090 if (cur == NULL) return(NULL); 6091 6092 6093 ctxt = htmlCreateDocParserCtxt(cur, encoding); 6094 if (ctxt == NULL) return(NULL); 6095 if (sax != NULL) { 6096 if (ctxt->sax != NULL) xmlFree (ctxt->sax); 6097 ctxt->sax = sax; 6098 ctxt->userData = userData; 6099 } 6100 6101 htmlParseDocument(ctxt); 6102 ret = ctxt->myDoc; 6103 if (sax != NULL) { 6104 ctxt->sax = NULL; 6105 ctxt->userData = NULL; 6106 } 6107 htmlFreeParserCtxt(ctxt); 6108 6109 return(ret); 6110 } 6111 6112 /** 6113 * htmlParseDoc: 6114 * @cur: a pointer to an array of xmlChar 6115 * @encoding: a free form C string describing the HTML document encoding, or NULL 6116 * 6117 * parse an HTML in-memory document and build a tree. 6118 * 6119 * Returns the resulting document tree 6120 */ 6121 6122 htmlDocPtr 6123 htmlParseDoc(xmlChar *cur, const char *encoding) { 6124 return(htmlSAXParseDoc(cur, encoding, NULL, NULL)); 6125 } 6126 6127 6128 /** 6129 * htmlCreateFileParserCtxt: 6130 * @filename: the filename 6131 * @encoding: a free form C string describing the HTML document encoding, or NULL 6132 * 6133 * Create a parser context for a file content. 6134 * Automatic support for ZLIB/Compress compressed document is provided 6135 * by default if found at compile-time. 
6136 * 6137 * Returns the new parser context or NULL 6138 */ 6139 htmlParserCtxtPtr 6140 htmlCreateFileParserCtxt(const char *filename, const char *encoding) 6141 { 6142 htmlParserCtxtPtr ctxt; 6143 htmlParserInputPtr inputStream; 6144 char *canonicFilename; 6145 /* htmlCharEncoding enc; */ 6146 xmlChar *content, *content_line = (xmlChar *) "charset="; 6147 6148 if (filename == NULL) 6149 return(NULL); 6150 6151 ctxt = htmlNewParserCtxt(); 6152 if (ctxt == NULL) { 6153 return(NULL); 6154 } 6155 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename); 6156 if (canonicFilename == NULL) { 6157 #ifdef LIBXML_SAX1_ENABLED 6158 if (xmlDefaultSAXHandler.error != NULL) { 6159 xmlDefaultSAXHandler.error(NULL, "out of memory\n"); 6160 } 6161 #endif 6162 xmlFreeParserCtxt(ctxt); 6163 return(NULL); 6164 } 6165 6166 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt); 6167 xmlFree(canonicFilename); 6168 if (inputStream == NULL) { 6169 xmlFreeParserCtxt(ctxt); 6170 return(NULL); 6171 } 6172 6173 inputPush(ctxt, inputStream); 6174 6175 /* set encoding */ 6176 if (encoding) { 6177 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1); 6178 if (content) { 6179 strcpy ((char *)content, (char *)content_line); 6180 strcat ((char *)content, (char *)encoding); 6181 htmlCheckEncoding (ctxt, content); 6182 xmlFree (content); 6183 } 6184 } 6185 6186 return(ctxt); 6187 } 6188 6189 /** 6190 * htmlSAXParseFile: 6191 * @filename: the filename 6192 * @encoding: a free form C string describing the HTML document encoding, or NULL 6193 * @sax: the SAX handler block 6194 * @userData: if using SAX, this pointer will be provided on callbacks. 6195 * 6196 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 6197 * compressed document is provided by default if found at compile-time. 6198 * It use the given SAX function block to handle the parsing callback. 6199 * If sax is NULL, fallback to the default DOM tree building routines. 
6200 * 6201 * Returns the resulting document tree unless SAX is NULL or the document is 6202 * not well formed. 6203 */ 6204 6205 htmlDocPtr 6206 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax, 6207 void *userData) { 6208 htmlDocPtr ret; 6209 htmlParserCtxtPtr ctxt; 6210 htmlSAXHandlerPtr oldsax = NULL; 6211 6212 xmlInitParser(); 6213 6214 ctxt = htmlCreateFileParserCtxt(filename, encoding); 6215 if (ctxt == NULL) return(NULL); 6216 if (sax != NULL) { 6217 oldsax = ctxt->sax; 6218 ctxt->sax = sax; 6219 ctxt->userData = userData; 6220 } 6221 6222 htmlParseDocument(ctxt); 6223 6224 ret = ctxt->myDoc; 6225 if (sax != NULL) { 6226 ctxt->sax = oldsax; 6227 ctxt->userData = NULL; 6228 } 6229 htmlFreeParserCtxt(ctxt); 6230 6231 return(ret); 6232 } 6233 6234 /** 6235 * htmlParseFile: 6236 * @filename: the filename 6237 * @encoding: a free form C string describing the HTML document encoding, or NULL 6238 * 6239 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 6240 * compressed document is provided by default if found at compile-time. 6241 * 6242 * Returns the resulting document tree 6243 */ 6244 6245 htmlDocPtr 6246 htmlParseFile(const char *filename, const char *encoding) { 6247 return(htmlSAXParseFile(filename, encoding, NULL, NULL)); 6248 } 6249 6250 /** 6251 * htmlHandleOmittedElem: 6252 * @val: int 0 or 1 6253 * 6254 * Set and return the previous value for handling HTML omitted tags. 6255 * 6256 * Returns the last value for 0 for no handling, 1 for auto insertion. 6257 */ 6258 6259 int 6260 htmlHandleOmittedElem(int val) { 6261 int old = htmlOmittedDefaultValue; 6262 6263 htmlOmittedDefaultValue = val; 6264 return(old); 6265 } 6266 6267 /** 6268 * htmlElementAllowedHere: 6269 * @parent: HTML parent element 6270 * @elt: HTML element 6271 * 6272 * Checks whether an HTML element may be a direct child of a parent element. 
6273 * Note - doesn't check for deprecated elements 6274 * 6275 * Returns 1 if allowed; 0 otherwise. 6276 */ 6277 int 6278 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) { 6279 const char** p ; 6280 6281 if ( ! elt || ! parent || ! parent->subelts ) 6282 return 0 ; 6283 6284 for ( p = parent->subelts; *p; ++p ) 6285 if ( !xmlStrcmp((const xmlChar *)*p, elt) ) 6286 return 1 ; 6287 6288 return 0 ; 6289 } 6290 /** 6291 * htmlElementStatusHere: 6292 * @parent: HTML parent element 6293 * @elt: HTML element 6294 * 6295 * Checks whether an HTML element may be a direct child of a parent element. 6296 * and if so whether it is valid or deprecated. 6297 * 6298 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID 6299 */ 6300 htmlStatus 6301 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) { 6302 if ( ! parent || ! elt ) 6303 return HTML_INVALID ; 6304 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) ) 6305 return HTML_INVALID ; 6306 6307 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ; 6308 } 6309 /** 6310 * htmlAttrAllowed: 6311 * @elt: HTML element 6312 * @attr: HTML attribute 6313 * @legacy: whether to allow deprecated attributes 6314 * 6315 * Checks whether an attribute is valid for an element 6316 * Has full knowledge of Required and Deprecated attributes 6317 * 6318 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID 6319 */ 6320 htmlStatus 6321 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) { 6322 const char** p ; 6323 6324 if ( !elt || ! 
attr ) 6325 return HTML_INVALID ; 6326 6327 if ( elt->attrs_req ) 6328 for ( p = elt->attrs_req; *p; ++p) 6329 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6330 return HTML_REQUIRED ; 6331 6332 if ( elt->attrs_opt ) 6333 for ( p = elt->attrs_opt; *p; ++p) 6334 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6335 return HTML_VALID ; 6336 6337 if ( legacy && elt->attrs_depr ) 6338 for ( p = elt->attrs_depr; *p; ++p) 6339 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6340 return HTML_DEPRECATED ; 6341 6342 return HTML_INVALID ; 6343 } 6344 /** 6345 * htmlNodeStatus: 6346 * @node: an htmlNodePtr in a tree 6347 * @legacy: whether to allow deprecated elements (YES is faster here 6348 * for Element nodes) 6349 * 6350 * Checks whether the tree node is valid. Experimental (the author 6351 * only uses the HTML enhancements in a SAX parser) 6352 * 6353 * Return: for Element nodes, a return from htmlElementAllowedHere (if 6354 * legacy allowed) or htmlElementStatusHere (otherwise). 6355 * for Attribute nodes, a return from htmlAttrAllowed 6356 * for other nodes, HTML_NA (no checks performed) 6357 */ 6358 htmlStatus 6359 htmlNodeStatus(const htmlNodePtr node, int legacy) { 6360 if ( ! node ) 6361 return HTML_INVALID ; 6362 6363 switch ( node->type ) { 6364 case XML_ELEMENT_NODE: 6365 return legacy 6366 ? ( htmlElementAllowedHere ( 6367 htmlTagLookup(node->parent->name) , node->name 6368 ) ? 
HTML_VALID : HTML_INVALID ) 6369 : htmlElementStatusHere( 6370 htmlTagLookup(node->parent->name) , 6371 htmlTagLookup(node->name) ) 6372 ; 6373 case XML_ATTRIBUTE_NODE: 6374 return htmlAttrAllowed( 6375 htmlTagLookup(node->parent->name) , node->name, legacy) ; 6376 default: return HTML_NA ; 6377 } 6378 } 6379 /************************************************************************ 6380 * * 6381 * New set (2.6.0) of simpler and more flexible APIs * 6382 * * 6383 ************************************************************************/ 6384 /** 6385 * DICT_FREE: 6386 * @str: a string 6387 * 6388 * Free a string if it is not owned by the "dict" dictionnary in the 6389 * current scope 6390 */ 6391 #define DICT_FREE(str) \ 6392 if ((str) && ((!dict) || \ 6393 (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \ 6394 xmlFree((char *)(str)); 6395 6396 /** 6397 * htmlCtxtReset: 6398 * @ctxt: an HTML parser context 6399 * 6400 * Reset a parser context 6401 */ 6402 void 6403 htmlCtxtReset(htmlParserCtxtPtr ctxt) 6404 { 6405 xmlParserInputPtr input; 6406 xmlDictPtr dict; 6407 6408 if (ctxt == NULL) 6409 return; 6410 6411 xmlInitParser(); 6412 dict = ctxt->dict; 6413 6414 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */ 6415 xmlFreeInputStream(input); 6416 } 6417 ctxt->inputNr = 0; 6418 ctxt->input = NULL; 6419 6420 ctxt->spaceNr = 0; 6421 if (ctxt->spaceTab != NULL) { 6422 ctxt->spaceTab[0] = -1; 6423 ctxt->space = &ctxt->spaceTab[0]; 6424 } else { 6425 ctxt->space = NULL; 6426 } 6427 6428 6429 ctxt->nodeNr = 0; 6430 ctxt->node = NULL; 6431 6432 ctxt->nameNr = 0; 6433 ctxt->name = NULL; 6434 6435 DICT_FREE(ctxt->version); 6436 ctxt->version = NULL; 6437 DICT_FREE(ctxt->encoding); 6438 ctxt->encoding = NULL; 6439 DICT_FREE(ctxt->directory); 6440 ctxt->directory = NULL; 6441 DICT_FREE(ctxt->extSubURI); 6442 ctxt->extSubURI = NULL; 6443 DICT_FREE(ctxt->extSubSystem); 6444 ctxt->extSubSystem = NULL; 6445 if (ctxt->myDoc != NULL) 6446 xmlFreeDoc(ctxt->myDoc); 6447 
ctxt->myDoc = NULL; 6448 6449 ctxt->standalone = -1; 6450 ctxt->hasExternalSubset = 0; 6451 ctxt->hasPErefs = 0; 6452 ctxt->html = 1; 6453 ctxt->external = 0; 6454 ctxt->instate = XML_PARSER_START; 6455 ctxt->token = 0; 6456 6457 ctxt->wellFormed = 1; 6458 ctxt->nsWellFormed = 1; 6459 ctxt->disableSAX = 0; 6460 ctxt->valid = 1; 6461 ctxt->vctxt.userData = ctxt; 6462 ctxt->vctxt.error = xmlParserValidityError; 6463 ctxt->vctxt.warning = xmlParserValidityWarning; 6464 ctxt->record_info = 0; 6465 ctxt->nbChars = 0; 6466 ctxt->checkIndex = 0; 6467 ctxt->inSubset = 0; 6468 ctxt->errNo = XML_ERR_OK; 6469 ctxt->depth = 0; 6470 ctxt->charset = XML_CHAR_ENCODING_NONE; 6471 ctxt->catalogs = NULL; 6472 xmlInitNodeInfoSeq(&ctxt->node_seq); 6473 6474 if (ctxt->attsDefault != NULL) { 6475 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree); 6476 ctxt->attsDefault = NULL; 6477 } 6478 if (ctxt->attsSpecial != NULL) { 6479 xmlHashFree(ctxt->attsSpecial, NULL); 6480 ctxt->attsSpecial = NULL; 6481 } 6482 } 6483 6484 /** 6485 * htmlCtxtUseOptions: 6486 * @ctxt: an HTML parser context 6487 * @options: a combination of htmlParserOption(s) 6488 * 6489 * Applies the options to the parser context 6490 * 6491 * Returns 0 in case of success, the set of unknown or unimplemented options 6492 * in case of error. 
6493 */ 6494 int 6495 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) 6496 { 6497 if (ctxt == NULL) 6498 return(-1); 6499 6500 if (options & HTML_PARSE_NOWARNING) { 6501 ctxt->sax->warning = NULL; 6502 ctxt->vctxt.warning = NULL; 6503 options -= XML_PARSE_NOWARNING; 6504 ctxt->options |= XML_PARSE_NOWARNING; 6505 } 6506 if (options & HTML_PARSE_NOERROR) { 6507 ctxt->sax->error = NULL; 6508 ctxt->vctxt.error = NULL; 6509 ctxt->sax->fatalError = NULL; 6510 options -= XML_PARSE_NOERROR; 6511 ctxt->options |= XML_PARSE_NOERROR; 6512 } 6513 if (options & HTML_PARSE_PEDANTIC) { 6514 ctxt->pedantic = 1; 6515 options -= XML_PARSE_PEDANTIC; 6516 ctxt->options |= XML_PARSE_PEDANTIC; 6517 } else 6518 ctxt->pedantic = 0; 6519 if (options & XML_PARSE_NOBLANKS) { 6520 ctxt->keepBlanks = 0; 6521 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace; 6522 options -= XML_PARSE_NOBLANKS; 6523 ctxt->options |= XML_PARSE_NOBLANKS; 6524 } else 6525 ctxt->keepBlanks = 1; 6526 if (options & HTML_PARSE_RECOVER) { 6527 ctxt->recovery = 1; 6528 options -= HTML_PARSE_RECOVER; 6529 } else 6530 ctxt->recovery = 0; 6531 if (options & HTML_PARSE_COMPACT) { 6532 ctxt->options |= HTML_PARSE_COMPACT; 6533 options -= HTML_PARSE_COMPACT; 6534 } 6535 if (options & XML_PARSE_HUGE) { 6536 ctxt->options |= XML_PARSE_HUGE; 6537 options -= XML_PARSE_HUGE; 6538 } 6539 if (options & HTML_PARSE_NODEFDTD) { 6540 ctxt->options |= HTML_PARSE_NODEFDTD; 6541 options -= HTML_PARSE_NODEFDTD; 6542 } 6543 if (options & HTML_PARSE_IGNORE_ENC) { 6544 ctxt->options |= HTML_PARSE_IGNORE_ENC; 6545 options -= HTML_PARSE_IGNORE_ENC; 6546 } 6547 ctxt->dictNames = 0; 6548 return (options); 6549 } 6550 6551 /** 6552 * htmlDoRead: 6553 * @ctxt: an HTML parser context 6554 * @URL: the base URL to use for the document 6555 * @encoding: the document encoding, or NULL 6556 * @options: a combination of htmlParserOption(s) 6557 * @reuse: keep the context for reuse 6558 * 6559 * Common front-end for the htmlRead functions 
6560 * 6561 * Returns the resulting document tree or NULL 6562 */ 6563 static htmlDocPtr 6564 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding, 6565 int options, int reuse) 6566 { 6567 htmlDocPtr ret; 6568 6569 htmlCtxtUseOptions(ctxt, options); 6570 ctxt->html = 1; 6571 if (encoding != NULL) { 6572 xmlCharEncodingHandlerPtr hdlr; 6573 6574 hdlr = xmlFindCharEncodingHandler(encoding); 6575 if (hdlr != NULL) { 6576 xmlSwitchToEncoding(ctxt, hdlr); 6577 if (ctxt->input->encoding != NULL) 6578 xmlFree((xmlChar *) ctxt->input->encoding); 6579 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding); 6580 } 6581 } 6582 if ((URL != NULL) && (ctxt->input != NULL) && 6583 (ctxt->input->filename == NULL)) 6584 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL); 6585 htmlParseDocument(ctxt); 6586 ret = ctxt->myDoc; 6587 ctxt->myDoc = NULL; 6588 if (!reuse) { 6589 if ((ctxt->dictNames) && 6590 (ret != NULL) && 6591 (ret->dict == ctxt->dict)) 6592 ctxt->dict = NULL; 6593 xmlFreeParserCtxt(ctxt); 6594 } 6595 return (ret); 6596 } 6597 6598 /** 6599 * htmlReadDoc: 6600 * @cur: a pointer to a zero terminated string 6601 * @URL: the base URL to use for the document 6602 * @encoding: the document encoding, or NULL 6603 * @options: a combination of htmlParserOption(s) 6604 * 6605 * parse an XML in-memory document and build a tree. 
6606 * 6607 * Returns the resulting document tree 6608 */ 6609 htmlDocPtr 6610 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options) 6611 { 6612 htmlParserCtxtPtr ctxt; 6613 6614 if (cur == NULL) 6615 return (NULL); 6616 6617 xmlInitParser(); 6618 ctxt = htmlCreateDocParserCtxt(cur, NULL); 6619 if (ctxt == NULL) 6620 return (NULL); 6621 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6622 } 6623 6624 /** 6625 * htmlReadFile: 6626 * @filename: a file or URL 6627 * @encoding: the document encoding, or NULL 6628 * @options: a combination of htmlParserOption(s) 6629 * 6630 * parse an XML file from the filesystem or the network. 6631 * 6632 * Returns the resulting document tree 6633 */ 6634 htmlDocPtr 6635 htmlReadFile(const char *filename, const char *encoding, int options) 6636 { 6637 htmlParserCtxtPtr ctxt; 6638 6639 xmlInitParser(); 6640 ctxt = htmlCreateFileParserCtxt(filename, encoding); 6641 if (ctxt == NULL) 6642 return (NULL); 6643 return (htmlDoRead(ctxt, NULL, NULL, options, 0)); 6644 } 6645 6646 /** 6647 * htmlReadMemory: 6648 * @buffer: a pointer to a char array 6649 * @size: the size of the array 6650 * @URL: the base URL to use for the document 6651 * @encoding: the document encoding, or NULL 6652 * @options: a combination of htmlParserOption(s) 6653 * 6654 * parse an XML in-memory document and build a tree. 
6655 * 6656 * Returns the resulting document tree 6657 */ 6658 htmlDocPtr 6659 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options) 6660 { 6661 htmlParserCtxtPtr ctxt; 6662 6663 xmlInitParser(); 6664 ctxt = xmlCreateMemoryParserCtxt(buffer, size); 6665 if (ctxt == NULL) 6666 return (NULL); 6667 htmlDefaultSAXHandlerInit(); 6668 if (ctxt->sax != NULL) 6669 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 6670 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6671 } 6672 6673 /** 6674 * htmlReadFd: 6675 * @fd: an open file descriptor 6676 * @URL: the base URL to use for the document 6677 * @encoding: the document encoding, or NULL 6678 * @options: a combination of htmlParserOption(s) 6679 * 6680 * parse an XML from a file descriptor and build a tree. 6681 * 6682 * Returns the resulting document tree 6683 */ 6684 htmlDocPtr 6685 htmlReadFd(int fd, const char *URL, const char *encoding, int options) 6686 { 6687 htmlParserCtxtPtr ctxt; 6688 xmlParserInputBufferPtr input; 6689 xmlParserInputPtr stream; 6690 6691 if (fd < 0) 6692 return (NULL); 6693 6694 xmlInitParser(); 6695 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 6696 if (input == NULL) 6697 return (NULL); 6698 ctxt = xmlNewParserCtxt(); 6699 if (ctxt == NULL) { 6700 xmlFreeParserInputBuffer(input); 6701 return (NULL); 6702 } 6703 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6704 if (stream == NULL) { 6705 xmlFreeParserInputBuffer(input); 6706 xmlFreeParserCtxt(ctxt); 6707 return (NULL); 6708 } 6709 inputPush(ctxt, stream); 6710 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6711 } 6712 6713 /** 6714 * htmlReadIO: 6715 * @ioread: an I/O read function 6716 * @ioclose: an I/O close function 6717 * @ioctx: an I/O handler 6718 * @URL: the base URL to use for the document 6719 * @encoding: the document encoding, or NULL 6720 * @options: a combination of htmlParserOption(s) 6721 * 6722 * parse an HTML 
document from I/O functions and source and build a tree. 6723 * 6724 * Returns the resulting document tree 6725 */ 6726 htmlDocPtr 6727 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, 6728 void *ioctx, const char *URL, const char *encoding, int options) 6729 { 6730 htmlParserCtxtPtr ctxt; 6731 xmlParserInputBufferPtr input; 6732 xmlParserInputPtr stream; 6733 6734 if (ioread == NULL) 6735 return (NULL); 6736 xmlInitParser(); 6737 6738 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 6739 XML_CHAR_ENCODING_NONE); 6740 if (input == NULL) 6741 return (NULL); 6742 ctxt = htmlNewParserCtxt(); 6743 if (ctxt == NULL) { 6744 xmlFreeParserInputBuffer(input); 6745 return (NULL); 6746 } 6747 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6748 if (stream == NULL) { 6749 xmlFreeParserInputBuffer(input); 6750 xmlFreeParserCtxt(ctxt); 6751 return (NULL); 6752 } 6753 inputPush(ctxt, stream); 6754 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6755 } 6756 6757 /** 6758 * htmlCtxtReadDoc: 6759 * @ctxt: an HTML parser context 6760 * @cur: a pointer to a zero terminated string 6761 * @URL: the base URL to use for the document 6762 * @encoding: the document encoding, or NULL 6763 * @options: a combination of htmlParserOption(s) 6764 * 6765 * parse an XML in-memory document and build a tree. 
6766 * This reuses the existing @ctxt parser context 6767 * 6768 * Returns the resulting document tree 6769 */ 6770 htmlDocPtr 6771 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur, 6772 const char *URL, const char *encoding, int options) 6773 { 6774 xmlParserInputPtr stream; 6775 6776 if (cur == NULL) 6777 return (NULL); 6778 if (ctxt == NULL) 6779 return (NULL); 6780 6781 htmlCtxtReset(ctxt); 6782 6783 stream = xmlNewStringInputStream(ctxt, cur); 6784 if (stream == NULL) { 6785 return (NULL); 6786 } 6787 inputPush(ctxt, stream); 6788 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6789 } 6790 6791 /** 6792 * htmlCtxtReadFile: 6793 * @ctxt: an HTML parser context 6794 * @filename: a file or URL 6795 * @encoding: the document encoding, or NULL 6796 * @options: a combination of htmlParserOption(s) 6797 * 6798 * parse an XML file from the filesystem or the network. 6799 * This reuses the existing @ctxt parser context 6800 * 6801 * Returns the resulting document tree 6802 */ 6803 htmlDocPtr 6804 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename, 6805 const char *encoding, int options) 6806 { 6807 xmlParserInputPtr stream; 6808 6809 if (filename == NULL) 6810 return (NULL); 6811 if (ctxt == NULL) 6812 return (NULL); 6813 6814 htmlCtxtReset(ctxt); 6815 6816 stream = xmlLoadExternalEntity(filename, NULL, ctxt); 6817 if (stream == NULL) { 6818 return (NULL); 6819 } 6820 inputPush(ctxt, stream); 6821 return (htmlDoRead(ctxt, NULL, encoding, options, 1)); 6822 } 6823 6824 /** 6825 * htmlCtxtReadMemory: 6826 * @ctxt: an HTML parser context 6827 * @buffer: a pointer to a char array 6828 * @size: the size of the array 6829 * @URL: the base URL to use for the document 6830 * @encoding: the document encoding, or NULL 6831 * @options: a combination of htmlParserOption(s) 6832 * 6833 * parse an XML in-memory document and build a tree. 
6834 * This reuses the existing @ctxt parser context 6835 * 6836 * Returns the resulting document tree 6837 */ 6838 htmlDocPtr 6839 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size, 6840 const char *URL, const char *encoding, int options) 6841 { 6842 xmlParserInputBufferPtr input; 6843 xmlParserInputPtr stream; 6844 6845 if (ctxt == NULL) 6846 return (NULL); 6847 if (buffer == NULL) 6848 return (NULL); 6849 6850 htmlCtxtReset(ctxt); 6851 6852 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); 6853 if (input == NULL) { 6854 return(NULL); 6855 } 6856 6857 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6858 if (stream == NULL) { 6859 xmlFreeParserInputBuffer(input); 6860 return(NULL); 6861 } 6862 6863 inputPush(ctxt, stream); 6864 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6865 } 6866 6867 /** 6868 * htmlCtxtReadFd: 6869 * @ctxt: an HTML parser context 6870 * @fd: an open file descriptor 6871 * @URL: the base URL to use for the document 6872 * @encoding: the document encoding, or NULL 6873 * @options: a combination of htmlParserOption(s) 6874 * 6875 * parse an XML from a file descriptor and build a tree. 
6876 * This reuses the existing @ctxt parser context 6877 * 6878 * Returns the resulting document tree 6879 */ 6880 htmlDocPtr 6881 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd, 6882 const char *URL, const char *encoding, int options) 6883 { 6884 xmlParserInputBufferPtr input; 6885 xmlParserInputPtr stream; 6886 6887 if (fd < 0) 6888 return (NULL); 6889 if (ctxt == NULL) 6890 return (NULL); 6891 6892 htmlCtxtReset(ctxt); 6893 6894 6895 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 6896 if (input == NULL) 6897 return (NULL); 6898 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6899 if (stream == NULL) { 6900 xmlFreeParserInputBuffer(input); 6901 return (NULL); 6902 } 6903 inputPush(ctxt, stream); 6904 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6905 } 6906 6907 /** 6908 * htmlCtxtReadIO: 6909 * @ctxt: an HTML parser context 6910 * @ioread: an I/O read function 6911 * @ioclose: an I/O close function 6912 * @ioctx: an I/O handler 6913 * @URL: the base URL to use for the document 6914 * @encoding: the document encoding, or NULL 6915 * @options: a combination of htmlParserOption(s) 6916 * 6917 * parse an HTML document from I/O functions and source and build a tree. 
6918 * This reuses the existing @ctxt parser context 6919 * 6920 * Returns the resulting document tree 6921 */ 6922 htmlDocPtr 6923 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, 6924 xmlInputCloseCallback ioclose, void *ioctx, 6925 const char *URL, 6926 const char *encoding, int options) 6927 { 6928 xmlParserInputBufferPtr input; 6929 xmlParserInputPtr stream; 6930 6931 if (ioread == NULL) 6932 return (NULL); 6933 if (ctxt == NULL) 6934 return (NULL); 6935 6936 htmlCtxtReset(ctxt); 6937 6938 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 6939 XML_CHAR_ENCODING_NONE); 6940 if (input == NULL) 6941 return (NULL); 6942 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6943 if (stream == NULL) { 6944 xmlFreeParserInputBuffer(input); 6945 return (NULL); 6946 } 6947 inputPush(ctxt, stream); 6948 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6949 } 6950 6951 #define bottom_HTMLparser 6952 #include "elfgcchack.h" 6953 #endif /* LIBXML_HTML_ENABLED */ 6954