1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd 2 See the file COPYING for copying permission. 3 */ 4 5 #include <stddef.h> 6 7 #ifdef COMPILED_FROM_DSP 8 #include "winconfig.h" 9 #elif defined(MACOS_CLASSIC) 10 #include "macconfig.h" 11 #elif defined(__amigaos__) 12 #include "amigaconfig.h" 13 #elif defined(__WATCOMC__) 14 #include "watcomconfig.h" 15 #else 16 #ifdef HAVE_EXPAT_CONFIG_H 17 #include <expat_config.h> 18 #endif 19 #endif /* ndef COMPILED_FROM_DSP */ 20 21 #include "expat_external.h" 22 #include "internal.h" 23 #include "xmltok.h" 24 #include "nametab.h" 25 26 #ifdef XML_DTD 27 #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok) 28 #else 29 #define IGNORE_SECTION_TOK_VTABLE /* as nothing */ 30 #endif 31 32 #define VTABLE1 \ 33 { PREFIX(prologTok), PREFIX(contentTok), \ 34 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \ 35 { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \ 36 PREFIX(sameName), \ 37 PREFIX(nameMatchesAscii), \ 38 PREFIX(nameLength), \ 39 PREFIX(skipS), \ 40 PREFIX(getAtts), \ 41 PREFIX(charRefNumber), \ 42 PREFIX(predefinedEntityName), \ 43 PREFIX(updatePosition), \ 44 PREFIX(isPublicId) 45 46 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16) 47 48 #define UCS2_GET_NAMING(pages, hi, lo) \ 49 (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F))) 50 51 /* A 2 byte UTF-8 representation splits the characters 11 bits between 52 the bottom 5 and 6 bits of the bytes. We need 8 bits to index into 53 pages, 3 bits to add to that index and 5 bits to generate the mask. 54 */ 55 #define UTF8_GET_NAMING2(pages, byte) \ 56 (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \ 57 + ((((byte)[0]) & 3) << 1) \ 58 + ((((byte)[1]) >> 5) & 1)] \ 59 & (1 << (((byte)[1]) & 0x1F))) 60 61 /* A 3 byte UTF-8 representation splits the characters 16 bits between 62 the bottom 4, 6 and 6 bits of the bytes. 
We need 8 bits to index 63 into pages, 3 bits to add to that index and 5 bits to generate the 64 mask. 65 */ 66 #define UTF8_GET_NAMING3(pages, byte) \ 67 (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \ 68 + ((((byte)[1]) >> 2) & 0xF)] \ 69 << 3) \ 70 + ((((byte)[1]) & 3) << 1) \ 71 + ((((byte)[2]) >> 5) & 1)] \ 72 & (1 << (((byte)[2]) & 0x1F))) 73 74 #define UTF8_GET_NAMING(pages, p, n) \ 75 ((n) == 2 \ 76 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \ 77 : ((n) == 3 \ 78 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \ 79 : 0)) 80 81 /* Detection of invalid UTF-8 sequences is based on Table 3.1B 82 of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/ 83 with the additional restriction of not allowing the Unicode 84 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE). 85 Implementation details: 86 (A & 0x80) == 0 means A < 0x80 87 and 88 (A & 0xC0) == 0xC0 means A > 0xBF 89 */ 90 91 #define UTF8_INVALID2(p) \ 92 ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0) 93 94 #define UTF8_INVALID3(p) \ 95 (((p)[2] & 0x80) == 0 \ 96 || \ 97 ((*p) == 0xEF && (p)[1] == 0xBF \ 98 ? \ 99 (p)[2] > 0xBD \ 100 : \ 101 ((p)[2] & 0xC0) == 0xC0) \ 102 || \ 103 ((*p) == 0xE0 \ 104 ? \ 105 (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \ 106 : \ 107 ((p)[1] & 0x80) == 0 \ 108 || \ 109 ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0))) 110 111 #define UTF8_INVALID4(p) \ 112 (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \ 113 || \ 114 ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \ 115 || \ 116 ((*p) == 0xF0 \ 117 ? \ 118 (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \ 119 : \ 120 ((p)[1] & 0x80) == 0 \ 121 || \ 122 ((*p) == 0xF4 ? 
(p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0))) 123 124 static int PTRFASTCALL 125 isNever(const ENCODING *enc, const char *p) 126 { 127 return 0; 128 } 129 130 static int PTRFASTCALL 131 utf8_isName2(const ENCODING *enc, const char *p) 132 { 133 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p); 134 } 135 136 static int PTRFASTCALL 137 utf8_isName3(const ENCODING *enc, const char *p) 138 { 139 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p); 140 } 141 142 #define utf8_isName4 isNever 143 144 static int PTRFASTCALL 145 utf8_isNmstrt2(const ENCODING *enc, const char *p) 146 { 147 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p); 148 } 149 150 static int PTRFASTCALL 151 utf8_isNmstrt3(const ENCODING *enc, const char *p) 152 { 153 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p); 154 } 155 156 #define utf8_isNmstrt4 isNever 157 158 static int PTRFASTCALL 159 utf8_isInvalid2(const ENCODING *enc, const char *p) 160 { 161 return UTF8_INVALID2((const unsigned char *)p); 162 } 163 164 static int PTRFASTCALL 165 utf8_isInvalid3(const ENCODING *enc, const char *p) 166 { 167 return UTF8_INVALID3((const unsigned char *)p); 168 } 169 170 static int PTRFASTCALL 171 utf8_isInvalid4(const ENCODING *enc, const char *p) 172 { 173 return UTF8_INVALID4((const unsigned char *)p); 174 } 175 176 struct normal_encoding { 177 ENCODING enc; 178 unsigned char type[256]; 179 #ifdef XML_MIN_SIZE 180 int (PTRFASTCALL *byteType)(const ENCODING *, const char *); 181 int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *); 182 int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *); 183 int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *); 184 int (PTRCALL *charMatches)(const ENCODING *, const char *, int); 185 #endif /* XML_MIN_SIZE */ 186 int (PTRFASTCALL *isName2)(const ENCODING *, const char *); 187 int (PTRFASTCALL *isName3)(const ENCODING *, const char *); 188 int (PTRFASTCALL *isName4)(const ENCODING *, const char *); 189 
int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *); 190 int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *); 191 int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *); 192 int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *); 193 int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *); 194 int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *); 195 }; 196 197 #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc)) 198 199 #ifdef XML_MIN_SIZE 200 201 #define STANDARD_VTABLE(E) \ 202 E ## byteType, \ 203 E ## isNameMin, \ 204 E ## isNmstrtMin, \ 205 E ## byteToAscii, \ 206 E ## charMatches, 207 208 #else 209 210 #define STANDARD_VTABLE(E) /* as nothing */ 211 212 #endif 213 214 #define NORMAL_VTABLE(E) \ 215 E ## isName2, \ 216 E ## isName3, \ 217 E ## isName4, \ 218 E ## isNmstrt2, \ 219 E ## isNmstrt3, \ 220 E ## isNmstrt4, \ 221 E ## isInvalid2, \ 222 E ## isInvalid3, \ 223 E ## isInvalid4 224 225 static int FASTCALL checkCharRefNumber(int); 226 227 #include "xmltok_impl.h" 228 #include "ascii.h" 229 230 #ifdef XML_MIN_SIZE 231 #define sb_isNameMin isNever 232 #define sb_isNmstrtMin isNever 233 #endif 234 235 #ifdef XML_MIN_SIZE 236 #define MINBPC(enc) ((enc)->minBytesPerChar) 237 #else 238 /* minimum bytes per character */ 239 #define MINBPC(enc) 1 240 #endif 241 242 #define SB_BYTE_TYPE(enc, p) \ 243 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]) 244 245 #ifdef XML_MIN_SIZE 246 static int PTRFASTCALL 247 sb_byteType(const ENCODING *enc, const char *p) 248 { 249 return SB_BYTE_TYPE(enc, p); 250 } 251 #define BYTE_TYPE(enc, p) \ 252 (AS_NORMAL_ENCODING(enc)->byteType(enc, p)) 253 #else 254 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p) 255 #endif 256 257 #ifdef XML_MIN_SIZE 258 #define BYTE_TO_ASCII(enc, p) \ 259 (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p)) 260 static int PTRFASTCALL 261 sb_byteToAscii(const ENCODING *enc, const char *p) 262 { 263 return *p; 264 } 265 #else 266 
#define BYTE_TO_ASCII(enc, p) (*(p)) 267 #endif 268 269 #define IS_NAME_CHAR(enc, p, n) \ 270 (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p)) 271 #define IS_NMSTRT_CHAR(enc, p, n) \ 272 (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p)) 273 #define IS_INVALID_CHAR(enc, p, n) \ 274 (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p)) 275 276 #ifdef XML_MIN_SIZE 277 #define IS_NAME_CHAR_MINBPC(enc, p) \ 278 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p)) 279 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \ 280 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p)) 281 #else 282 #define IS_NAME_CHAR_MINBPC(enc, p) (0) 283 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0) 284 #endif 285 286 #ifdef XML_MIN_SIZE 287 #define CHAR_MATCHES(enc, p, c) \ 288 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c)) 289 static int PTRCALL 290 sb_charMatches(const ENCODING *enc, const char *p, int c) 291 { 292 return *p == c; 293 } 294 #else 295 /* c is an ASCII character */ 296 #define CHAR_MATCHES(enc, p, c) (*(p) == c) 297 #endif 298 299 #define PREFIX(ident) normal_ ## ident 300 #define XML_TOK_IMPL_C 301 #include "xmltok_impl.c" 302 #undef XML_TOK_IMPL_C 303 304 #undef MINBPC 305 #undef BYTE_TYPE 306 #undef BYTE_TO_ASCII 307 #undef CHAR_MATCHES 308 #undef IS_NAME_CHAR 309 #undef IS_NAME_CHAR_MINBPC 310 #undef IS_NMSTRT_CHAR 311 #undef IS_NMSTRT_CHAR_MINBPC 312 #undef IS_INVALID_CHAR 313 314 enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */ 315 UTF8_cval1 = 0x00, 316 UTF8_cval2 = 0xc0, 317 UTF8_cval3 = 0xe0, 318 UTF8_cval4 = 0xf0 319 }; 320 321 static enum XML_Convert_Result PTRCALL 322 utf8_toUtf8(const ENCODING *enc, 323 const char **fromP, const char *fromLim, 324 char **toP, const char *toLim) 325 { 326 enum XML_Convert_Result res = XML_CONVERT_COMPLETED; 327 char *to; 328 const char *from; 329 if (fromLim - *fromP > toLim - *toP) { 330 /* Avoid copying partial characters. 
*/ 331 res = XML_CONVERT_OUTPUT_EXHAUSTED; 332 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--) 333 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80) 334 break; 335 } 336 for (to = *toP, from = *fromP; (from < fromLim) && (to < toLim); from++, to++) 337 *to = *from; 338 *fromP = from; 339 *toP = to; 340 341 if ((to == toLim) && (from < fromLim)) 342 return XML_CONVERT_OUTPUT_EXHAUSTED; 343 else 344 return res; 345 } 346 347 static enum XML_Convert_Result PTRCALL 348 utf8_toUtf16(const ENCODING *enc, 349 const char **fromP, const char *fromLim, 350 unsigned short **toP, const unsigned short *toLim) 351 { 352 enum XML_Convert_Result res = XML_CONVERT_COMPLETED; 353 unsigned short *to = *toP; 354 const char *from = *fromP; 355 while (from < fromLim && to < toLim) { 356 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) { 357 case BT_LEAD2: 358 if (fromLim - from < 2) { 359 res = XML_CONVERT_INPUT_INCOMPLETE; 360 break; 361 } 362 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f)); 363 from += 2; 364 break; 365 case BT_LEAD3: 366 if (fromLim - from < 3) { 367 res = XML_CONVERT_INPUT_INCOMPLETE; 368 break; 369 } 370 *to++ = (unsigned short)(((from[0] & 0xf) << 12) 371 | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f)); 372 from += 3; 373 break; 374 case BT_LEAD4: 375 { 376 unsigned long n; 377 if (toLim - to < 2) { 378 res = XML_CONVERT_OUTPUT_EXHAUSTED; 379 goto after; 380 } 381 if (fromLim - from < 4) { 382 res = XML_CONVERT_INPUT_INCOMPLETE; 383 goto after; 384 } 385 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) 386 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); 387 n -= 0x10000; 388 to[0] = (unsigned short)((n >> 10) | 0xD800); 389 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00); 390 to += 2; 391 from += 4; 392 } 393 break; 394 default: 395 *to++ = *from++; 396 break; 397 } 398 } 399 after: 400 *fromP = from; 401 *toP = to; 402 return res; 403 } 404 405 #ifdef XML_NS 406 static const struct normal_encoding 
utf8_encoding_ns = { 407 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 408 { 409 #include "asciitab.h" 410 #include "utf8tab.h" 411 }, 412 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 413 }; 414 #endif 415 416 static const struct normal_encoding utf8_encoding = { 417 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 418 { 419 #define BT_COLON BT_NMSTRT 420 #include "asciitab.h" 421 #undef BT_COLON 422 #include "utf8tab.h" 423 }, 424 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 425 }; 426 427 #ifdef XML_NS 428 429 static const struct normal_encoding internal_utf8_encoding_ns = { 430 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 431 { 432 #include "iasciitab.h" 433 #include "utf8tab.h" 434 }, 435 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 436 }; 437 438 #endif 439 440 static const struct normal_encoding internal_utf8_encoding = { 441 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 442 { 443 #define BT_COLON BT_NMSTRT 444 #include "iasciitab.h" 445 #undef BT_COLON 446 #include "utf8tab.h" 447 }, 448 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 449 }; 450 451 static enum XML_Convert_Result PTRCALL 452 latin1_toUtf8(const ENCODING *enc, 453 const char **fromP, const char *fromLim, 454 char **toP, const char *toLim) 455 { 456 for (;;) { 457 unsigned char c; 458 if (*fromP == fromLim) 459 return XML_CONVERT_COMPLETED; 460 c = (unsigned char)**fromP; 461 if (c & 0x80) { 462 if (toLim - *toP < 2) 463 return XML_CONVERT_OUTPUT_EXHAUSTED; 464 *(*toP)++ = (char)((c >> 6) | UTF8_cval2); 465 *(*toP)++ = (char)((c & 0x3f) | 0x80); 466 (*fromP)++; 467 } 468 else { 469 if (*toP == toLim) 470 return XML_CONVERT_OUTPUT_EXHAUSTED; 471 *(*toP)++ = *(*fromP)++; 472 } 473 } 474 } 475 476 static enum XML_Convert_Result PTRCALL 477 latin1_toUtf16(const ENCODING *enc, 478 const char **fromP, const char *fromLim, 479 unsigned short **toP, const unsigned short *toLim) 480 { 481 while (*fromP < fromLim && *toP < toLim) 482 *(*toP)++ = (unsigned char)*(*fromP)++; 483 484 if ((*toP == toLim) && 
(*fromP < fromLim)) 485 return XML_CONVERT_OUTPUT_EXHAUSTED; 486 else 487 return XML_CONVERT_COMPLETED; 488 } 489 490 #ifdef XML_NS 491 492 static const struct normal_encoding latin1_encoding_ns = { 493 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, 494 { 495 #include "asciitab.h" 496 #include "latin1tab.h" 497 }, 498 STANDARD_VTABLE(sb_) 499 }; 500 501 #endif 502 503 static const struct normal_encoding latin1_encoding = { 504 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, 505 { 506 #define BT_COLON BT_NMSTRT 507 #include "asciitab.h" 508 #undef BT_COLON 509 #include "latin1tab.h" 510 }, 511 STANDARD_VTABLE(sb_) 512 }; 513 514 static enum XML_Convert_Result PTRCALL 515 ascii_toUtf8(const ENCODING *enc, 516 const char **fromP, const char *fromLim, 517 char **toP, const char *toLim) 518 { 519 while (*fromP < fromLim && *toP < toLim) 520 *(*toP)++ = *(*fromP)++; 521 522 if ((*toP == toLim) && (*fromP < fromLim)) 523 return XML_CONVERT_OUTPUT_EXHAUSTED; 524 else 525 return XML_CONVERT_COMPLETED; 526 } 527 528 #ifdef XML_NS 529 530 static const struct normal_encoding ascii_encoding_ns = { 531 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, 532 { 533 #include "asciitab.h" 534 /* BT_NONXML == 0 */ 535 }, 536 STANDARD_VTABLE(sb_) 537 }; 538 539 #endif 540 541 static const struct normal_encoding ascii_encoding = { 542 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, 543 { 544 #define BT_COLON BT_NMSTRT 545 #include "asciitab.h" 546 #undef BT_COLON 547 /* BT_NONXML == 0 */ 548 }, 549 STANDARD_VTABLE(sb_) 550 }; 551 552 static int PTRFASTCALL 553 unicode_byte_type(char hi, char lo) 554 { 555 switch ((unsigned char)hi) { 556 case 0xD8: case 0xD9: case 0xDA: case 0xDB: 557 return BT_LEAD4; 558 case 0xDC: case 0xDD: case 0xDE: case 0xDF: 559 return BT_TRAIL; 560 case 0xFF: 561 switch ((unsigned char)lo) { 562 case 0xFF: 563 case 0xFE: 564 return BT_NONXML; 565 } 566 break; 567 } 568 return BT_NONASCII; 569 } 570 571 #define DEFINE_UTF16_TO_UTF8(E) \ 572 static 
enum XML_Convert_Result PTRCALL \ 573 E ## toUtf8(const ENCODING *enc, \ 574 const char **fromP, const char *fromLim, \ 575 char **toP, const char *toLim) \ 576 { \ 577 const char *from = *fromP; \ 578 fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */ \ 579 for (; from < fromLim; from += 2) { \ 580 int plane; \ 581 unsigned char lo2; \ 582 unsigned char lo = GET_LO(from); \ 583 unsigned char hi = GET_HI(from); \ 584 switch (hi) { \ 585 case 0: \ 586 if (lo < 0x80) { \ 587 if (*toP == toLim) { \ 588 *fromP = from; \ 589 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 590 } \ 591 *(*toP)++ = lo; \ 592 break; \ 593 } \ 594 /* fall through */ \ 595 case 0x1: case 0x2: case 0x3: \ 596 case 0x4: case 0x5: case 0x6: case 0x7: \ 597 if (toLim - *toP < 2) { \ 598 *fromP = from; \ 599 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 600 } \ 601 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \ 602 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 603 break; \ 604 default: \ 605 if (toLim - *toP < 3) { \ 606 *fromP = from; \ 607 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 608 } \ 609 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ 610 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \ 611 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \ 612 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 613 break; \ 614 case 0xD8: case 0xD9: case 0xDA: case 0xDB: \ 615 if (toLim - *toP < 4) { \ 616 *fromP = from; \ 617 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 618 } \ 619 if (fromLim - from < 4) { \ 620 *fromP = from; \ 621 return XML_CONVERT_INPUT_INCOMPLETE; \ 622 } \ 623 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ 624 *(*toP)++ = ((plane >> 2) | UTF8_cval4); \ 625 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \ 626 from += 2; \ 627 lo2 = GET_LO(from); \ 628 *(*toP)++ = (((lo & 0x3) << 4) \ 629 | ((GET_HI(from) & 0x3) << 2) \ 630 | (lo2 >> 6) \ 631 | 0x80); \ 632 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \ 633 break; \ 634 } \ 635 } \ 636 *fromP = from; \ 637 if (from < fromLim) \ 638 return 
XML_CONVERT_INPUT_INCOMPLETE; \ 639 else \ 640 return XML_CONVERT_COMPLETED; \ 641 } 642 643 #define DEFINE_UTF16_TO_UTF16(E) \ 644 static enum XML_Convert_Result PTRCALL \ 645 E ## toUtf16(const ENCODING *enc, \ 646 const char **fromP, const char *fromLim, \ 647 unsigned short **toP, const unsigned short *toLim) \ 648 { \ 649 enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \ 650 fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */ \ 651 /* Avoid copying first half only of surrogate */ \ 652 if (fromLim - *fromP > ((toLim - *toP) << 1) \ 653 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \ 654 fromLim -= 2; \ 655 res = XML_CONVERT_INPUT_INCOMPLETE; \ 656 } \ 657 for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \ 658 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \ 659 if ((*toP == toLim) && (*fromP < fromLim)) \ 660 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 661 else \ 662 return res; \ 663 } 664 665 #define SET2(ptr, ch) \ 666 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8))) 667 #define GET_LO(ptr) ((unsigned char)(ptr)[0]) 668 #define GET_HI(ptr) ((unsigned char)(ptr)[1]) 669 670 DEFINE_UTF16_TO_UTF8(little2_) 671 DEFINE_UTF16_TO_UTF16(little2_) 672 673 #undef SET2 674 #undef GET_LO 675 #undef GET_HI 676 677 #define SET2(ptr, ch) \ 678 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF))) 679 #define GET_LO(ptr) ((unsigned char)(ptr)[1]) 680 #define GET_HI(ptr) ((unsigned char)(ptr)[0]) 681 682 DEFINE_UTF16_TO_UTF8(big2_) 683 DEFINE_UTF16_TO_UTF16(big2_) 684 685 #undef SET2 686 #undef GET_LO 687 #undef GET_HI 688 689 #define LITTLE2_BYTE_TYPE(enc, p) \ 690 ((p)[1] == 0 \ 691 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \ 692 : unicode_byte_type((p)[1], (p)[0])) 693 #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? 
(p)[0] : -1) 694 #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c) 695 #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \ 696 UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0]) 697 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ 698 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0]) 699 700 #ifdef XML_MIN_SIZE 701 702 static int PTRFASTCALL 703 little2_byteType(const ENCODING *enc, const char *p) 704 { 705 return LITTLE2_BYTE_TYPE(enc, p); 706 } 707 708 static int PTRFASTCALL 709 little2_byteToAscii(const ENCODING *enc, const char *p) 710 { 711 return LITTLE2_BYTE_TO_ASCII(enc, p); 712 } 713 714 static int PTRCALL 715 little2_charMatches(const ENCODING *enc, const char *p, int c) 716 { 717 return LITTLE2_CHAR_MATCHES(enc, p, c); 718 } 719 720 static int PTRFASTCALL 721 little2_isNameMin(const ENCODING *enc, const char *p) 722 { 723 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p); 724 } 725 726 static int PTRFASTCALL 727 little2_isNmstrtMin(const ENCODING *enc, const char *p) 728 { 729 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p); 730 } 731 732 #undef VTABLE 733 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16 734 735 #else /* not XML_MIN_SIZE */ 736 737 #undef PREFIX 738 #define PREFIX(ident) little2_ ## ident 739 #define MINBPC(enc) 2 740 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. 
*/ 741 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p) 742 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p) 743 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c) 744 #define IS_NAME_CHAR(enc, p, n) 0 745 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) 746 #define IS_NMSTRT_CHAR(enc, p, n) (0) 747 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) 748 749 #define XML_TOK_IMPL_C 750 #include "xmltok_impl.c" 751 #undef XML_TOK_IMPL_C 752 753 #undef MINBPC 754 #undef BYTE_TYPE 755 #undef BYTE_TO_ASCII 756 #undef CHAR_MATCHES 757 #undef IS_NAME_CHAR 758 #undef IS_NAME_CHAR_MINBPC 759 #undef IS_NMSTRT_CHAR 760 #undef IS_NMSTRT_CHAR_MINBPC 761 #undef IS_INVALID_CHAR 762 763 #endif /* not XML_MIN_SIZE */ 764 765 #ifdef XML_NS 766 767 static const struct normal_encoding little2_encoding_ns = { 768 { VTABLE, 2, 0, 769 #if BYTEORDER == 1234 770 1 771 #else 772 0 773 #endif 774 }, 775 { 776 #include "asciitab.h" 777 #include "latin1tab.h" 778 }, 779 STANDARD_VTABLE(little2_) 780 }; 781 782 #endif 783 784 static const struct normal_encoding little2_encoding = { 785 { VTABLE, 2, 0, 786 #if BYTEORDER == 1234 787 1 788 #else 789 0 790 #endif 791 }, 792 { 793 #define BT_COLON BT_NMSTRT 794 #include "asciitab.h" 795 #undef BT_COLON 796 #include "latin1tab.h" 797 }, 798 STANDARD_VTABLE(little2_) 799 }; 800 801 #if BYTEORDER != 4321 802 803 #ifdef XML_NS 804 805 static const struct normal_encoding internal_little2_encoding_ns = { 806 { VTABLE, 2, 0, 1 }, 807 { 808 #include "iasciitab.h" 809 #include "latin1tab.h" 810 }, 811 STANDARD_VTABLE(little2_) 812 }; 813 814 #endif 815 816 static const struct normal_encoding internal_little2_encoding = { 817 { VTABLE, 2, 0, 1 }, 818 { 819 #define BT_COLON BT_NMSTRT 820 #include "iasciitab.h" 821 #undef BT_COLON 822 #include "latin1tab.h" 823 }, 824 STANDARD_VTABLE(little2_) 825 }; 826 827 #endif 828 829 830 #define BIG2_BYTE_TYPE(enc, p) \ 831 ((p)[0] == 0 \ 832 ? 
((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \ 833 : unicode_byte_type((p)[0], (p)[1])) 834 #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1) 835 #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c) 836 #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \ 837 UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1]) 838 #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ 839 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1]) 840 841 #ifdef XML_MIN_SIZE 842 843 static int PTRFASTCALL 844 big2_byteType(const ENCODING *enc, const char *p) 845 { 846 return BIG2_BYTE_TYPE(enc, p); 847 } 848 849 static int PTRFASTCALL 850 big2_byteToAscii(const ENCODING *enc, const char *p) 851 { 852 return BIG2_BYTE_TO_ASCII(enc, p); 853 } 854 855 static int PTRCALL 856 big2_charMatches(const ENCODING *enc, const char *p, int c) 857 { 858 return BIG2_CHAR_MATCHES(enc, p, c); 859 } 860 861 static int PTRFASTCALL 862 big2_isNameMin(const ENCODING *enc, const char *p) 863 { 864 return BIG2_IS_NAME_CHAR_MINBPC(enc, p); 865 } 866 867 static int PTRFASTCALL 868 big2_isNmstrtMin(const ENCODING *enc, const char *p) 869 { 870 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p); 871 } 872 873 #undef VTABLE 874 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16 875 876 #else /* not XML_MIN_SIZE */ 877 878 #undef PREFIX 879 #define PREFIX(ident) big2_ ## ident 880 #define MINBPC(enc) 2 881 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. 
*/ 882 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p) 883 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p) 884 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c) 885 #define IS_NAME_CHAR(enc, p, n) 0 886 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p) 887 #define IS_NMSTRT_CHAR(enc, p, n) (0) 888 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) 889 890 #define XML_TOK_IMPL_C 891 #include "xmltok_impl.c" 892 #undef XML_TOK_IMPL_C 893 894 #undef MINBPC 895 #undef BYTE_TYPE 896 #undef BYTE_TO_ASCII 897 #undef CHAR_MATCHES 898 #undef IS_NAME_CHAR 899 #undef IS_NAME_CHAR_MINBPC 900 #undef IS_NMSTRT_CHAR 901 #undef IS_NMSTRT_CHAR_MINBPC 902 #undef IS_INVALID_CHAR 903 904 #endif /* not XML_MIN_SIZE */ 905 906 #ifdef XML_NS 907 908 static const struct normal_encoding big2_encoding_ns = { 909 { VTABLE, 2, 0, 910 #if BYTEORDER == 4321 911 1 912 #else 913 0 914 #endif 915 }, 916 { 917 #include "asciitab.h" 918 #include "latin1tab.h" 919 }, 920 STANDARD_VTABLE(big2_) 921 }; 922 923 #endif 924 925 static const struct normal_encoding big2_encoding = { 926 { VTABLE, 2, 0, 927 #if BYTEORDER == 4321 928 1 929 #else 930 0 931 #endif 932 }, 933 { 934 #define BT_COLON BT_NMSTRT 935 #include "asciitab.h" 936 #undef BT_COLON 937 #include "latin1tab.h" 938 }, 939 STANDARD_VTABLE(big2_) 940 }; 941 942 #if BYTEORDER != 1234 943 944 #ifdef XML_NS 945 946 static const struct normal_encoding internal_big2_encoding_ns = { 947 { VTABLE, 2, 0, 1 }, 948 { 949 #include "iasciitab.h" 950 #include "latin1tab.h" 951 }, 952 STANDARD_VTABLE(big2_) 953 }; 954 955 #endif 956 957 static const struct normal_encoding internal_big2_encoding = { 958 { VTABLE, 2, 0, 1 }, 959 { 960 #define BT_COLON BT_NMSTRT 961 #include "iasciitab.h" 962 #undef BT_COLON 963 #include "latin1tab.h" 964 }, 965 STANDARD_VTABLE(big2_) 966 }; 967 968 #endif 969 970 #undef PREFIX 971 972 static int FASTCALL 973 streqci(const char *s1, const char *s2) 974 { 975 for (;;) 
{ 976 char c1 = *s1++; 977 char c2 = *s2++; 978 if (ASCII_a <= c1 && c1 <= ASCII_z) 979 c1 += ASCII_A - ASCII_a; 980 if (ASCII_a <= c2 && c2 <= ASCII_z) 981 c2 += ASCII_A - ASCII_a; 982 if (c1 != c2) 983 return 0; 984 if (!c1) 985 break; 986 } 987 return 1; 988 } 989 990 static void PTRCALL 991 initUpdatePosition(const ENCODING *enc, const char *ptr, 992 const char *end, POSITION *pos) 993 { 994 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); 995 } 996 997 static int 998 toAscii(const ENCODING *enc, const char *ptr, const char *end) 999 { 1000 char buf[1]; 1001 char *p = buf; 1002 XmlUtf8Convert(enc, &ptr, end, &p, p + 1); 1003 if (p == buf) 1004 return -1; 1005 else 1006 return buf[0]; 1007 } 1008 1009 static int FASTCALL 1010 isSpace(int c) 1011 { 1012 switch (c) { 1013 case 0x20: 1014 case 0xD: 1015 case 0xA: 1016 case 0x9: 1017 return 1; 1018 } 1019 return 0; 1020 } 1021 1022 /* Return 1 if there's just optional white space or there's an S 1023 followed by name=val. 1024 */ 1025 static int 1026 parsePseudoAttribute(const ENCODING *enc, 1027 const char *ptr, 1028 const char *end, 1029 const char **namePtr, 1030 const char **nameEndPtr, 1031 const char **valPtr, 1032 const char **nextTokPtr) 1033 { 1034 int c; 1035 char open; 1036 if (ptr == end) { 1037 *namePtr = NULL; 1038 return 1; 1039 } 1040 if (!isSpace(toAscii(enc, ptr, end))) { 1041 *nextTokPtr = ptr; 1042 return 0; 1043 } 1044 do { 1045 ptr += enc->minBytesPerChar; 1046 } while (isSpace(toAscii(enc, ptr, end))); 1047 if (ptr == end) { 1048 *namePtr = NULL; 1049 return 1; 1050 } 1051 *namePtr = ptr; 1052 for (;;) { 1053 c = toAscii(enc, ptr, end); 1054 if (c == -1) { 1055 *nextTokPtr = ptr; 1056 return 0; 1057 } 1058 if (c == ASCII_EQUALS) { 1059 *nameEndPtr = ptr; 1060 break; 1061 } 1062 if (isSpace(c)) { 1063 *nameEndPtr = ptr; 1064 do { 1065 ptr += enc->minBytesPerChar; 1066 } while (isSpace(c = toAscii(enc, ptr, end))); 1067 if (c != ASCII_EQUALS) { 1068 *nextTokPtr = ptr; 1069 return 0; 
1070 } 1071 break; 1072 } 1073 ptr += enc->minBytesPerChar; 1074 } 1075 if (ptr == *namePtr) { 1076 *nextTokPtr = ptr; 1077 return 0; 1078 } 1079 ptr += enc->minBytesPerChar; 1080 c = toAscii(enc, ptr, end); 1081 while (isSpace(c)) { 1082 ptr += enc->minBytesPerChar; 1083 c = toAscii(enc, ptr, end); 1084 } 1085 if (c != ASCII_QUOT && c != ASCII_APOS) { 1086 *nextTokPtr = ptr; 1087 return 0; 1088 } 1089 open = (char)c; 1090 ptr += enc->minBytesPerChar; 1091 *valPtr = ptr; 1092 for (;; ptr += enc->minBytesPerChar) { 1093 c = toAscii(enc, ptr, end); 1094 if (c == open) 1095 break; 1096 if (!(ASCII_a <= c && c <= ASCII_z) 1097 && !(ASCII_A <= c && c <= ASCII_Z) 1098 && !(ASCII_0 <= c && c <= ASCII_9) 1099 && c != ASCII_PERIOD 1100 && c != ASCII_MINUS 1101 && c != ASCII_UNDERSCORE) { 1102 *nextTokPtr = ptr; 1103 return 0; 1104 } 1105 } 1106 *nextTokPtr = ptr + enc->minBytesPerChar; 1107 return 1; 1108 } 1109 1110 static const char KW_version[] = { 1111 ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0' 1112 }; 1113 1114 static const char KW_encoding[] = { 1115 ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0' 1116 }; 1117 1118 static const char KW_standalone[] = { 1119 ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o, 1120 ASCII_n, ASCII_e, '\0' 1121 }; 1122 1123 static const char KW_yes[] = { 1124 ASCII_y, ASCII_e, ASCII_s, '\0' 1125 }; 1126 1127 static const char KW_no[] = { 1128 ASCII_n, ASCII_o, '\0' 1129 }; 1130 1131 static int 1132 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, 1133 const char *, 1134 const char *), 1135 int isGeneralTextEntity, 1136 const ENCODING *enc, 1137 const char *ptr, 1138 const char *end, 1139 const char **badPtr, 1140 const char **versionPtr, 1141 const char **versionEndPtr, 1142 const char **encodingName, 1143 const ENCODING **encoding, 1144 int *standalone) 1145 { 1146 const char *val = NULL; 1147 const char *name = NULL; 1148 const char *nameEnd = 
NULL; 1149 ptr += 5 * enc->minBytesPerChar; 1150 end -= 2 * enc->minBytesPerChar; 1151 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) 1152 || !name) { 1153 *badPtr = ptr; 1154 return 0; 1155 } 1156 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) { 1157 if (!isGeneralTextEntity) { 1158 *badPtr = name; 1159 return 0; 1160 } 1161 } 1162 else { 1163 if (versionPtr) 1164 *versionPtr = val; 1165 if (versionEndPtr) 1166 *versionEndPtr = ptr; 1167 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1168 *badPtr = ptr; 1169 return 0; 1170 } 1171 if (!name) { 1172 if (isGeneralTextEntity) { 1173 /* a TextDecl must have an EncodingDecl */ 1174 *badPtr = ptr; 1175 return 0; 1176 } 1177 return 1; 1178 } 1179 } 1180 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) { 1181 int c = toAscii(enc, val, end); 1182 if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) { 1183 *badPtr = val; 1184 return 0; 1185 } 1186 if (encodingName) 1187 *encodingName = val; 1188 if (encoding) 1189 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar); 1190 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1191 *badPtr = ptr; 1192 return 0; 1193 } 1194 if (!name) 1195 return 1; 1196 } 1197 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) 1198 || isGeneralTextEntity) { 1199 *badPtr = name; 1200 return 0; 1201 } 1202 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) { 1203 if (standalone) 1204 *standalone = 1; 1205 } 1206 else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) { 1207 if (standalone) 1208 *standalone = 0; 1209 } 1210 else { 1211 *badPtr = val; 1212 return 0; 1213 } 1214 while (isSpace(toAscii(enc, ptr, end))) 1215 ptr += enc->minBytesPerChar; 1216 if (ptr != end) { 1217 *badPtr = ptr; 1218 return 0; 1219 } 1220 return 1; 1221 } 1222 1223 static int FASTCALL 1224 checkCharRefNumber(int result) 1225 { 1226 switch (result >> 8) 
{ 1227 case 0xD8: case 0xD9: case 0xDA: case 0xDB: 1228 case 0xDC: case 0xDD: case 0xDE: case 0xDF: 1229 return -1; 1230 case 0: 1231 if (latin1_encoding.type[result] == BT_NONXML) 1232 return -1; 1233 break; 1234 case 0xFF: 1235 if (result == 0xFFFE || result == 0xFFFF) 1236 return -1; 1237 break; 1238 } 1239 return result; 1240 } 1241 1242 int FASTCALL 1243 XmlUtf8Encode(int c, char *buf) 1244 { 1245 enum { 1246 /* minN is minimum legal resulting value for N byte sequence */ 1247 min2 = 0x80, 1248 min3 = 0x800, 1249 min4 = 0x10000 1250 }; 1251 1252 if (c < 0) 1253 return 0; 1254 if (c < min2) { 1255 buf[0] = (char)(c | UTF8_cval1); 1256 return 1; 1257 } 1258 if (c < min3) { 1259 buf[0] = (char)((c >> 6) | UTF8_cval2); 1260 buf[1] = (char)((c & 0x3f) | 0x80); 1261 return 2; 1262 } 1263 if (c < min4) { 1264 buf[0] = (char)((c >> 12) | UTF8_cval3); 1265 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80); 1266 buf[2] = (char)((c & 0x3f) | 0x80); 1267 return 3; 1268 } 1269 if (c < 0x110000) { 1270 buf[0] = (char)((c >> 18) | UTF8_cval4); 1271 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80); 1272 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80); 1273 buf[3] = (char)((c & 0x3f) | 0x80); 1274 return 4; 1275 } 1276 return 0; 1277 } 1278 1279 int FASTCALL 1280 XmlUtf16Encode(int charNum, unsigned short *buf) 1281 { 1282 if (charNum < 0) 1283 return 0; 1284 if (charNum < 0x10000) { 1285 buf[0] = (unsigned short)charNum; 1286 return 1; 1287 } 1288 if (charNum < 0x110000) { 1289 charNum -= 0x10000; 1290 buf[0] = (unsigned short)((charNum >> 10) + 0xD800); 1291 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00); 1292 return 2; 1293 } 1294 return 0; 1295 } 1296 1297 struct unknown_encoding { 1298 struct normal_encoding normal; 1299 CONVERTER convert; 1300 void *userData; 1301 unsigned short utf16[256]; 1302 char utf8[256][4]; 1303 }; 1304 1305 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc)) 1306 1307 int 1308 XmlSizeOfUnknownEncoding(void) 1309 { 1310 return 
sizeof(struct unknown_encoding); 1311 } 1312 1313 static int PTRFASTCALL 1314 unknown_isName(const ENCODING *enc, const char *p) 1315 { 1316 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1317 int c = uenc->convert(uenc->userData, p); 1318 if (c & ~0xFFFF) 1319 return 0; 1320 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF); 1321 } 1322 1323 static int PTRFASTCALL 1324 unknown_isNmstrt(const ENCODING *enc, const char *p) 1325 { 1326 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1327 int c = uenc->convert(uenc->userData, p); 1328 if (c & ~0xFFFF) 1329 return 0; 1330 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF); 1331 } 1332 1333 static int PTRFASTCALL 1334 unknown_isInvalid(const ENCODING *enc, const char *p) 1335 { 1336 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1337 int c = uenc->convert(uenc->userData, p); 1338 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; 1339 } 1340 1341 static enum XML_Convert_Result PTRCALL 1342 unknown_toUtf8(const ENCODING *enc, 1343 const char **fromP, const char *fromLim, 1344 char **toP, const char *toLim) 1345 { 1346 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1347 char buf[XML_UTF8_ENCODE_MAX]; 1348 for (;;) { 1349 const char *utf8; 1350 int n; 1351 if (*fromP == fromLim) 1352 return XML_CONVERT_COMPLETED; 1353 utf8 = uenc->utf8[(unsigned char)**fromP]; 1354 n = *utf8++; 1355 if (n == 0) { 1356 int c = uenc->convert(uenc->userData, *fromP); 1357 n = XmlUtf8Encode(c, buf); 1358 if (n > toLim - *toP) 1359 return XML_CONVERT_OUTPUT_EXHAUSTED; 1360 utf8 = buf; 1361 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1362 - (BT_LEAD2 - 2)); 1363 } 1364 else { 1365 if (n > toLim - *toP) 1366 return XML_CONVERT_OUTPUT_EXHAUSTED; 1367 (*fromP)++; 1368 } 1369 do { 1370 *(*toP)++ = *utf8++; 1371 } while (--n != 0); 1372 } 1373 } 1374 1375 static enum XML_Convert_Result PTRCALL 1376 unknown_toUtf16(const ENCODING *enc, 1377 const char **fromP, 
const char *fromLim, 1378 unsigned short **toP, const unsigned short *toLim) 1379 { 1380 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1381 while (*fromP < fromLim && *toP < toLim) { 1382 unsigned short c = uenc->utf16[(unsigned char)**fromP]; 1383 if (c == 0) { 1384 c = (unsigned short) 1385 uenc->convert(uenc->userData, *fromP); 1386 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1387 - (BT_LEAD2 - 2)); 1388 } 1389 else 1390 (*fromP)++; 1391 *(*toP)++ = c; 1392 } 1393 1394 if ((*toP == toLim) && (*fromP < fromLim)) 1395 return XML_CONVERT_OUTPUT_EXHAUSTED; 1396 else 1397 return XML_CONVERT_COMPLETED; 1398 } 1399 1400 ENCODING * 1401 XmlInitUnknownEncoding(void *mem, 1402 int *table, 1403 CONVERTER convert, 1404 void *userData) 1405 { 1406 int i; 1407 struct unknown_encoding *e = (struct unknown_encoding *)mem; 1408 for (i = 0; i < (int)sizeof(struct normal_encoding); i++) 1409 ((char *)mem)[i] = ((char *)&latin1_encoding)[i]; 1410 for (i = 0; i < 128; i++) 1411 if (latin1_encoding.type[i] != BT_OTHER 1412 && latin1_encoding.type[i] != BT_NONXML 1413 && table[i] != i) 1414 return 0; 1415 for (i = 0; i < 256; i++) { 1416 int c = table[i]; 1417 if (c == -1) { 1418 e->normal.type[i] = BT_MALFORM; 1419 /* This shouldn't really get used. */ 1420 e->utf16[i] = 0xFFFF; 1421 e->utf8[i][0] = 1; 1422 e->utf8[i][1] = 0; 1423 } 1424 else if (c < 0) { 1425 if (c < -4) 1426 return 0; 1427 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2)); 1428 e->utf8[i][0] = 0; 1429 e->utf16[i] = 0; 1430 } 1431 else if (c < 0x80) { 1432 if (latin1_encoding.type[c] != BT_OTHER 1433 && latin1_encoding.type[c] != BT_NONXML 1434 && c != i) 1435 return 0; 1436 e->normal.type[i] = latin1_encoding.type[c]; 1437 e->utf8[i][0] = 1; 1438 e->utf8[i][1] = (char)c; 1439 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c); 1440 } 1441 else if (checkCharRefNumber(c) < 0) { 1442 e->normal.type[i] = BT_NONXML; 1443 /* This shouldn't really get used. 
*/ 1444 e->utf16[i] = 0xFFFF; 1445 e->utf8[i][0] = 1; 1446 e->utf8[i][1] = 0; 1447 } 1448 else { 1449 if (c > 0xFFFF) 1450 return 0; 1451 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)) 1452 e->normal.type[i] = BT_NMSTRT; 1453 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)) 1454 e->normal.type[i] = BT_NAME; 1455 else 1456 e->normal.type[i] = BT_OTHER; 1457 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); 1458 e->utf16[i] = (unsigned short)c; 1459 } 1460 } 1461 e->userData = userData; 1462 e->convert = convert; 1463 if (convert) { 1464 e->normal.isName2 = unknown_isName; 1465 e->normal.isName3 = unknown_isName; 1466 e->normal.isName4 = unknown_isName; 1467 e->normal.isNmstrt2 = unknown_isNmstrt; 1468 e->normal.isNmstrt3 = unknown_isNmstrt; 1469 e->normal.isNmstrt4 = unknown_isNmstrt; 1470 e->normal.isInvalid2 = unknown_isInvalid; 1471 e->normal.isInvalid3 = unknown_isInvalid; 1472 e->normal.isInvalid4 = unknown_isInvalid; 1473 } 1474 e->normal.enc.utf8Convert = unknown_toUtf8; 1475 e->normal.enc.utf16Convert = unknown_toUtf16; 1476 return &(e->normal.enc); 1477 } 1478 1479 /* If this enumeration is changed, getEncodingIndex and encodings 1480 must also be changed. 
*/ 1481 enum { 1482 UNKNOWN_ENC = -1, 1483 ISO_8859_1_ENC = 0, 1484 US_ASCII_ENC, 1485 UTF_8_ENC, 1486 UTF_16_ENC, 1487 UTF_16BE_ENC, 1488 UTF_16LE_ENC, 1489 /* must match encodingNames up to here */ 1490 NO_ENC 1491 }; 1492 1493 static const char KW_ISO_8859_1[] = { 1494 ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, 1495 ASCII_MINUS, ASCII_1, '\0' 1496 }; 1497 static const char KW_US_ASCII[] = { 1498 ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, 1499 '\0' 1500 }; 1501 static const char KW_UTF_8[] = { 1502 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0' 1503 }; 1504 static const char KW_UTF_16[] = { 1505 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0' 1506 }; 1507 static const char KW_UTF_16BE[] = { 1508 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, 1509 '\0' 1510 }; 1511 static const char KW_UTF_16LE[] = { 1512 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, 1513 '\0' 1514 }; 1515 1516 static int FASTCALL 1517 getEncodingIndex(const char *name) 1518 { 1519 static const char * const encodingNames[] = { 1520 KW_ISO_8859_1, 1521 KW_US_ASCII, 1522 KW_UTF_8, 1523 KW_UTF_16, 1524 KW_UTF_16BE, 1525 KW_UTF_16LE, 1526 }; 1527 int i; 1528 if (name == NULL) 1529 return NO_ENC; 1530 for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++) 1531 if (streqci(name, encodingNames[i])) 1532 return i; 1533 return UNKNOWN_ENC; 1534 } 1535 1536 /* For binary compatibility, we store the index of the encoding 1537 specified at initialization in the isUtf16 member. 1538 */ 1539 1540 #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16) 1541 #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i) 1542 1543 /* This is what detects the encoding. 
   encodingTable maps from
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
   the external (protocol) specified encoding; state is
   XML_CONTENT_STATE if we're parsing an external text entity, and
   XML_PROLOG_STATE otherwise.

   The input is [ptr, end).  Results:
     - XML_TOK_NONE when the input is empty;
     - XML_TOK_PARTIAL when the available bytes are not enough to
       decide;
     - XML_TOK_BOM when a byte order mark was recognised; the detected
       encoding is stored through enc->encPtr and *nextTokPtr is set
       just past the mark;
     - otherwise an encoding is stored through enc->encPtr (a detected
       UTF-16 variant, or the externally specified one) and the result
       of XmlTok for that encoding is returned.
*/


static int
initScan(const ENCODING * const *encodingTable,
         const INIT_ENCODING *enc,
         int state,
         const char *ptr,
         const char *end,
         const char **nextTokPtr)
{
  const ENCODING **encPtr;

  if (ptr >= end)
    return XML_TOK_NONE;
  encPtr = enc->encPtr;
  if (ptr + 1 == end) {
    /* only a single byte available for auto-detection */
#ifndef XML_DTD /* FIXME */
    /* a well-formed document entity must have more than one byte */
    if (state != XML_CONTENT_STATE)
      return XML_TOK_PARTIAL;
#endif
    /* so we're parsing an external text entity... */
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
    switch (INIT_ENC_INDEX(enc)) {
    case UTF_16_ENC:
    case UTF_16LE_ENC:
    case UTF_16BE_ENC:
      return XML_TOK_PARTIAL;
    }
    switch ((unsigned char)*ptr) {
    case 0xFE:
    case 0xFF:
    case 0xEF: /* possibly first byte of UTF-8 BOM */
      /* With ISO-8859-1 specified externally for an external entity,
         these bytes are treated as data, not as a possible BOM. */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      /* fall through */
    case 0x00:
    case 0x3C:
      return XML_TOK_PARTIAL;
    }
  }
  else {
    /* Dispatch on the first two bytes taken as a big-endian 16-bit
       value. */
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
    case 0xFEFF:
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16BE_ENC];
      return XML_TOK_BOM;
    /* 00 3C is handled in the default case */
    case 0x3C00:
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
          && state == XML_CONTENT_STATE)
        break;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    case 0xFFFE:
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XML_TOK_BOM;
    case 0xEFBB:
      /* Maybe a UTF-8 BOM (EF BB BF) */
      /* If there's an explicitly specified (external) encoding
         of ISO-8859-1 or some flavour of UTF-16
         and this is an external text entity,
         don't look for the BOM,
         because it might be legal data.
      */
      if (state == XML_CONTENT_STATE) {
        int e = INIT_ENC_INDEX(enc);
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
            || e == UTF_16LE_ENC || e == UTF_16_ENC)
          break;
      }
      /* The third byte is needed to confirm the BOM. */
      if (ptr + 2 == end)
        return XML_TOK_PARTIAL;
      if ((unsigned char)ptr[2] == 0xBF) {
        *nextTokPtr = ptr + 3;
        *encPtr = encodingTable[UTF_8_ENC];
        return XML_TOK_BOM;
      }
      break;
    default:
      if (ptr[0] == '\0') {
        /* 0 isn't a legal data character. Furthermore a document
           entity can only start with ASCII characters.  So the only
           way this can fail to be big-endian UTF-16 is if it's an
           external parsed general entity that's labelled as
           UTF-16LE.
        */
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
          break;
        *encPtr = encodingTable[UTF_16BE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      else if (ptr[1] == '\0') {
        /* We could recover here in the case:
            - parsing an external entity
            - second byte is 0
            - no externally specified encoding
            - no encoding declaration
           by assuming UTF-16LE.  But we don't, because this would mean when
           presented just with a single byte, we couldn't reliably determine
           whether we needed further bytes.
        */
        if (state == XML_CONTENT_STATE)
          break;
        *encPtr = encodingTable[UTF_16LE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      break;
    }
  }
  /* Nothing was auto-detected: use the externally specified encoding. */
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}


/* Include xmltok_ns.c with NS(x) and ns(x) expanding to plain x. */
#define NS(x) x
#define ns(x) x
#define XML_TOK_NS_C
#include "xmltok_ns.c"
#undef XML_TOK_NS_C
#undef NS
#undef ns

#ifdef XML_NS

/* Include xmltok_ns.c a second time with NS(x) appending "NS" and
   ns(x) appending "_ns" to the names it is applied to. */
#define NS(x) x ## NS
#define ns(x) x ## _ns

#define XML_TOK_NS_C
#include "xmltok_ns.c"
#undef XML_TOK_NS_C

#undef NS
#undef ns

/* Same as XmlInitUnknownEncoding, but on success the byte type of the
   ASCII colon is set to BT_COLON.  Returns whatever
   XmlInitUnknownEncoding returned.
*/
ENCODING *
XmlInitUnknownEncodingNS(void *mem,
                         int *table,
                         CONVERTER convert,
                         void *userData)
{
  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
  if (enc)
    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
  return enc;
}

#endif /* XML_NS */