/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
   See the file COPYING for copying permission.
*/

#ifdef COMPILED_FROM_DSP
#include "winconfig.h"
#elif defined(MACOS_CLASSIC)
#include "macconfig.h"
#elif defined(__amigaos4__)
#include "amigaconfig.h"
#else
#ifdef HAVE_EXPAT_CONFIG_H
#include <expat_config.h>
#endif
#endif /* ndef COMPILED_FROM_DSP */

#include <stddef.h>

#include "expat_external.h"
#include "internal.h"
#include "xmltok.h"
#include "nametab.h"

/* Extra scanner entry for IGNORE sections; it is only emitted when
   XML_DTD is defined, otherwise the macro expands to nothing.
*/
#ifdef XML_DTD
#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Initializer fragment naming the PREFIX-generated scanners and helper
   functions, in the order listed.  PREFIX must be defined at the point
   where this macro is expanded.
*/
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok), \
    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(sameName), \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX-named toUtf8 and toUtf16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)

/* Non-zero when the bit for the character with high byte hi and low
   byte lo is set in namingBitmap.  pages[hi] selects a group of eight
   bitmap words, (lo >> 5) picks the word inside the group and
   (lo & 0x1F) picks the bit inside that word.
*/
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the characters 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
*/
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1 << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the characters 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.
*/
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
                       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1 << (((byte)[2]) & 0x1F)))

/* Length-dispatching lookup: 2 and 3 byte sequences are tested against
   pages, any other length n yields 0.
*/
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  : ((n) == 3 \
     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
     : 0))

/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF
*/

/* Each UTF8_INVALIDn(p) is non-zero when the n bytes at p do not form
   an acceptable sequence.  p is expected to point to unsigned char
   (the wrappers below cast before expanding); the argument is
   evaluated several times.
*/
#define UTF8_INVALID2(p) \
  ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
  || \
  ((*p) == 0xEF && (p)[1] == 0xBF \
    ? \
    (p)[2] > 0xBD \
    : \
    ((p)[2] & 0xC0) == 0xC0) \
  || \
  ((*p) == 0xE0 \
    ? \
    (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))

#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
  || \
  ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
  || \
  ((*p) == 0xF0 \
    ? \
    (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))

/* Predicate that rejects every input.  It fills the slots for which no
   character can qualify (utf8_isName4, utf8_isNmstrt4, sb_isNameMin,
   sb_isNmstrtMin below).
*/
static int PTRFASTCALL
isNever(const ENCODING *enc, const char *p)
{
  return 0;
}

/* Non-zero when the 2 byte UTF-8 sequence at p is a name character. */
static int PTRFASTCALL
utf8_isName2(const ENCODING *enc, const char *p)
{
  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
}

/* Non-zero when the 3 byte UTF-8 sequence at p is a name character. */
static int PTRFASTCALL
utf8_isName3(const ENCODING *enc, const char *p)
{
  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
}

/* No 4 byte sequence is accepted as a name character. */
#define utf8_isName4 isNever

/* Non-zero when the 2 byte UTF-8 sequence at p may start a name. */
static int PTRFASTCALL
utf8_isNmstrt2(const ENCODING *enc, const char *p)
{
  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
}

/* Non-zero when the 3 byte UTF-8 sequence at p may start a name. */
static int PTRFASTCALL
utf8_isNmstrt3(const ENCODING *enc, const char *p)
{
  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
}

/* No 4 byte sequence is accepted as a name start character. */
#define utf8_isNmstrt4 isNever

/* Non-zero when the 2 bytes at p are not a valid UTF-8 sequence. */
static int PTRFASTCALL
utf8_isInvalid2(const ENCODING *enc, const char *p)
{
  return UTF8_INVALID2((const unsigned char *)p);
}

/* Non-zero when the 3 bytes at p are not a valid UTF-8 sequence. */
static int PTRFASTCALL
utf8_isInvalid3(const ENCODING *enc, const char *p)
{
  return UTF8_INVALID3((const unsigned char *)p);
}

/* Non-zero when the 4 bytes at p are not a valid UTF-8 sequence. */
static int PTRFASTCALL
utf8_isInvalid4(const ENCODING *enc, const char *p)
{
  return UTF8_INVALID4((const unsigned char *)p);
}

/* An ENCODING extended with a 256 entry byte-type table and the
   character-class predicates used by the generated scanners.  enc is
   the first member, so a pointer to this structure can be converted to
   and from ENCODING * (see AS_NORMAL_ENCODING).  The first group of
   function pointers only exists in XML_MIN_SIZE builds.
*/
struct normal_encoding {
  ENCODING enc;
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};

#define AS_NORMAL_ENCODING(enc)   ((const struct normal_encoding *) (enc))

/* STANDARD_VTABLE(E) lists the E-prefixed functions for the
   XML_MIN_SIZE-only members of struct normal_encoding, with a trailing
   comma so that NORMAL_VTABLE can follow directly.  Without
   XML_MIN_SIZE it expands to nothing.
*/
#ifdef XML_MIN_SIZE

#define STANDARD_VTABLE(E) \
 E ## byteType, \
 E ## isNameMin, \
 E ## isNmstrtMin, \
 E ## byteToAscii, \
 E ## charMatches,

#else

#define STANDARD_VTABLE(E) /* as nothing */

#endif

/* NORMAL_VTABLE(E) lists the E-prefixed functions for the isName,
   isNmstrt and isInvalid members, in declaration order.
*/
#define NORMAL_VTABLE(E) \
 E ## isName2, \
 E ## isName3, \
 E ## isName4, \
 E ## isNmstrt2, \
 E ## isNmstrt3, \
 E ## isNmstrt4, \
 E ## isInvalid2, \
 E ## isInvalid3, \
 E ## isInvalid4

/* Forward declaration; the definition appears further down this file. */
static int FASTCALL checkCharRefNumber(int);

#include "xmltok_impl.h"
#include "ascii.h"

#ifdef XML_MIN_SIZE
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif

/* Byte type of the single byte at p, read from the encoding's table. */
#define SB_BYTE_TYPE(enc, p) \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

/* With XML_MIN_SIZE, BYTE_TYPE calls through the encoding's byteType
   pointer; otherwise it expands directly to the table lookup.
*/
#ifdef XML_MIN_SIZE
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p)
{
  return SB_BYTE_TYPE(enc, p);
}
#define BYTE_TYPE(enc, p) \
 (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif

/* With XML_MIN_SIZE, BYTE_TO_ASCII calls through the encoding's
   byteToAscii pointer; the single-byte implementation returns the
   byte itself.
*/
#ifdef XML_MIN_SIZE
#define BYTE_TO_ASCII(enc, p) \
 (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p)
{
  return *p;
}
#else
#define BYTE_TO_ASCII(enc, p) (*(p)) 265 #endif 266 267 #define IS_NAME_CHAR(enc, p, n) \ 268 (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p)) 269 #define IS_NMSTRT_CHAR(enc, p, n) \ 270 (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p)) 271 #define IS_INVALID_CHAR(enc, p, n) \ 272 (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p)) 273 274 #ifdef XML_MIN_SIZE 275 #define IS_NAME_CHAR_MINBPC(enc, p) \ 276 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p)) 277 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \ 278 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p)) 279 #else 280 #define IS_NAME_CHAR_MINBPC(enc, p) (0) 281 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0) 282 #endif 283 284 #ifdef XML_MIN_SIZE 285 #define CHAR_MATCHES(enc, p, c) \ 286 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c)) 287 static int PTRCALL 288 sb_charMatches(const ENCODING *enc, const char *p, int c) 289 { 290 return *p == c; 291 } 292 #else 293 /* c is an ASCII character */ 294 #define CHAR_MATCHES(enc, p, c) (*(p) == c) 295 #endif 296 297 #define PREFIX(ident) normal_ ## ident 298 #include "xmltok_impl.c" 299 300 #undef MINBPC 301 #undef BYTE_TYPE 302 #undef BYTE_TO_ASCII 303 #undef CHAR_MATCHES 304 #undef IS_NAME_CHAR 305 #undef IS_NAME_CHAR_MINBPC 306 #undef IS_NMSTRT_CHAR 307 #undef IS_NMSTRT_CHAR_MINBPC 308 #undef IS_INVALID_CHAR 309 310 enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */ 311 UTF8_cval1 = 0x00, 312 UTF8_cval2 = 0xc0, 313 UTF8_cval3 = 0xe0, 314 UTF8_cval4 = 0xf0 315 }; 316 317 static void PTRCALL 318 utf8_toUtf8(const ENCODING *enc, 319 const char **fromP, const char *fromLim, 320 char **toP, const char *toLim) 321 { 322 char *to; 323 const char *from; 324 if (fromLim - *fromP > toLim - *toP) { 325 /* Avoid copying partial characters. 
*/ 326 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--) 327 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80) 328 break; 329 } 330 for (to = *toP, from = *fromP; from != fromLim; from++, to++) 331 *to = *from; 332 *fromP = from; 333 *toP = to; 334 } 335 336 static void PTRCALL 337 utf8_toUtf16(const ENCODING *enc, 338 const char **fromP, const char *fromLim, 339 unsigned short **toP, const unsigned short *toLim) 340 { 341 unsigned short *to = *toP; 342 const char *from = *fromP; 343 while (from != fromLim && to != toLim) { 344 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) { 345 case BT_LEAD2: 346 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f)); 347 from += 2; 348 break; 349 case BT_LEAD3: 350 *to++ = (unsigned short)(((from[0] & 0xf) << 12) 351 | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f)); 352 from += 3; 353 break; 354 case BT_LEAD4: 355 { 356 unsigned long n; 357 if (to + 1 == toLim) 358 goto after; 359 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) 360 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); 361 n -= 0x10000; 362 to[0] = (unsigned short)((n >> 10) | 0xD800); 363 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00); 364 to += 2; 365 from += 4; 366 } 367 break; 368 default: 369 *to++ = *from++; 370 break; 371 } 372 } 373 after: 374 *fromP = from; 375 *toP = to; 376 } 377 378 #ifdef XML_NS 379 static const struct normal_encoding utf8_encoding_ns = { 380 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 381 { 382 #include "asciitab.h" 383 #include "utf8tab.h" 384 }, 385 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 386 }; 387 #endif 388 389 static const struct normal_encoding utf8_encoding = { 390 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 391 { 392 #define BT_COLON BT_NMSTRT 393 #include "asciitab.h" 394 #undef BT_COLON 395 #include "utf8tab.h" 396 }, 397 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 398 }; 399 400 #ifdef XML_NS 401 402 static const struct normal_encoding internal_utf8_encoding_ns 
= { 403 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 404 { 405 #include "iasciitab.h" 406 #include "utf8tab.h" 407 }, 408 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 409 }; 410 411 #endif 412 413 static const struct normal_encoding internal_utf8_encoding = { 414 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 415 { 416 #define BT_COLON BT_NMSTRT 417 #include "iasciitab.h" 418 #undef BT_COLON 419 #include "utf8tab.h" 420 }, 421 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 422 }; 423 424 static void PTRCALL 425 latin1_toUtf8(const ENCODING *enc, 426 const char **fromP, const char *fromLim, 427 char **toP, const char *toLim) 428 { 429 for (;;) { 430 unsigned char c; 431 if (*fromP == fromLim) 432 break; 433 c = (unsigned char)**fromP; 434 if (c & 0x80) { 435 if (toLim - *toP < 2) 436 break; 437 *(*toP)++ = (char)((c >> 6) | UTF8_cval2); 438 *(*toP)++ = (char)((c & 0x3f) | 0x80); 439 (*fromP)++; 440 } 441 else { 442 if (*toP == toLim) 443 break; 444 *(*toP)++ = *(*fromP)++; 445 } 446 } 447 } 448 449 static void PTRCALL 450 latin1_toUtf16(const ENCODING *enc, 451 const char **fromP, const char *fromLim, 452 unsigned short **toP, const unsigned short *toLim) 453 { 454 while (*fromP != fromLim && *toP != toLim) 455 *(*toP)++ = (unsigned char)*(*fromP)++; 456 } 457 458 #ifdef XML_NS 459 460 static const struct normal_encoding latin1_encoding_ns = { 461 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, 462 { 463 #include "asciitab.h" 464 #include "latin1tab.h" 465 }, 466 STANDARD_VTABLE(sb_) 467 }; 468 469 #endif 470 471 static const struct normal_encoding latin1_encoding = { 472 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, 473 { 474 #define BT_COLON BT_NMSTRT 475 #include "asciitab.h" 476 #undef BT_COLON 477 #include "latin1tab.h" 478 }, 479 STANDARD_VTABLE(sb_) 480 }; 481 482 static void PTRCALL 483 ascii_toUtf8(const ENCODING *enc, 484 const char **fromP, const char *fromLim, 485 char **toP, const char *toLim) 486 { 487 while (*fromP != fromLim && *toP != 
toLim) 488 *(*toP)++ = *(*fromP)++; 489 } 490 491 #ifdef XML_NS 492 493 static const struct normal_encoding ascii_encoding_ns = { 494 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, 495 { 496 #include "asciitab.h" 497 /* BT_NONXML == 0 */ 498 }, 499 STANDARD_VTABLE(sb_) 500 }; 501 502 #endif 503 504 static const struct normal_encoding ascii_encoding = { 505 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, 506 { 507 #define BT_COLON BT_NMSTRT 508 #include "asciitab.h" 509 #undef BT_COLON 510 /* BT_NONXML == 0 */ 511 }, 512 STANDARD_VTABLE(sb_) 513 }; 514 515 static int PTRFASTCALL 516 unicode_byte_type(char hi, char lo) 517 { 518 switch ((unsigned char)hi) { 519 case 0xD8: case 0xD9: case 0xDA: case 0xDB: 520 return BT_LEAD4; 521 case 0xDC: case 0xDD: case 0xDE: case 0xDF: 522 return BT_TRAIL; 523 case 0xFF: 524 switch ((unsigned char)lo) { 525 case 0xFF: 526 case 0xFE: 527 return BT_NONXML; 528 } 529 break; 530 } 531 return BT_NONASCII; 532 } 533 534 #define DEFINE_UTF16_TO_UTF8(E) \ 535 static void PTRCALL \ 536 E ## toUtf8(const ENCODING *enc, \ 537 const char **fromP, const char *fromLim, \ 538 char **toP, const char *toLim) \ 539 { \ 540 const char *from; \ 541 for (from = *fromP; from != fromLim; from += 2) { \ 542 int plane; \ 543 unsigned char lo2; \ 544 unsigned char lo = GET_LO(from); \ 545 unsigned char hi = GET_HI(from); \ 546 switch (hi) { \ 547 case 0: \ 548 if (lo < 0x80) { \ 549 if (*toP == toLim) { \ 550 *fromP = from; \ 551 return; \ 552 } \ 553 *(*toP)++ = lo; \ 554 break; \ 555 } \ 556 /* fall through */ \ 557 case 0x1: case 0x2: case 0x3: \ 558 case 0x4: case 0x5: case 0x6: case 0x7: \ 559 if (toLim - *toP < 2) { \ 560 *fromP = from; \ 561 return; \ 562 } \ 563 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \ 564 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 565 break; \ 566 default: \ 567 if (toLim - *toP < 3) { \ 568 *fromP = from; \ 569 return; \ 570 } \ 571 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ 572 *(*toP)++ = ((hi >> 4) | 
UTF8_cval3); \ 573 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \ 574 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 575 break; \ 576 case 0xD8: case 0xD9: case 0xDA: case 0xDB: \ 577 if (toLim - *toP < 4) { \ 578 *fromP = from; \ 579 return; \ 580 } \ 581 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ 582 *(*toP)++ = ((plane >> 2) | UTF8_cval4); \ 583 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \ 584 from += 2; \ 585 lo2 = GET_LO(from); \ 586 *(*toP)++ = (((lo & 0x3) << 4) \ 587 | ((GET_HI(from) & 0x3) << 2) \ 588 | (lo2 >> 6) \ 589 | 0x80); \ 590 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \ 591 break; \ 592 } \ 593 } \ 594 *fromP = from; \ 595 } 596 597 #define DEFINE_UTF16_TO_UTF16(E) \ 598 static void PTRCALL \ 599 E ## toUtf16(const ENCODING *enc, \ 600 const char **fromP, const char *fromLim, \ 601 unsigned short **toP, const unsigned short *toLim) \ 602 { \ 603 /* Avoid copying first half only of surrogate */ \ 604 if (fromLim - *fromP > ((toLim - *toP) << 1) \ 605 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \ 606 fromLim -= 2; \ 607 for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \ 608 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \ 609 } 610 611 #define SET2(ptr, ch) \ 612 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8))) 613 #define GET_LO(ptr) ((unsigned char)(ptr)[0]) 614 #define GET_HI(ptr) ((unsigned char)(ptr)[1]) 615 616 DEFINE_UTF16_TO_UTF8(little2_) 617 DEFINE_UTF16_TO_UTF16(little2_) 618 619 #undef SET2 620 #undef GET_LO 621 #undef GET_HI 622 623 #define SET2(ptr, ch) \ 624 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF))) 625 #define GET_LO(ptr) ((unsigned char)(ptr)[1]) 626 #define GET_HI(ptr) ((unsigned char)(ptr)[0]) 627 628 DEFINE_UTF16_TO_UTF8(big2_) 629 DEFINE_UTF16_TO_UTF16(big2_) 630 631 #undef SET2 632 #undef GET_LO 633 #undef GET_HI 634 635 #define LITTLE2_BYTE_TYPE(enc, p) \ 636 ((p)[1] == 0 \ 637 ? 
((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \ 638 : unicode_byte_type((p)[1], (p)[0])) 639 #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1) 640 #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c) 641 #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \ 642 UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0]) 643 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ 644 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0]) 645 646 #ifdef XML_MIN_SIZE 647 648 static int PTRFASTCALL 649 little2_byteType(const ENCODING *enc, const char *p) 650 { 651 return LITTLE2_BYTE_TYPE(enc, p); 652 } 653 654 static int PTRFASTCALL 655 little2_byteToAscii(const ENCODING *enc, const char *p) 656 { 657 return LITTLE2_BYTE_TO_ASCII(enc, p); 658 } 659 660 static int PTRCALL 661 little2_charMatches(const ENCODING *enc, const char *p, int c) 662 { 663 return LITTLE2_CHAR_MATCHES(enc, p, c); 664 } 665 666 static int PTRFASTCALL 667 little2_isNameMin(const ENCODING *enc, const char *p) 668 { 669 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p); 670 } 671 672 static int PTRFASTCALL 673 little2_isNmstrtMin(const ENCODING *enc, const char *p) 674 { 675 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p); 676 } 677 678 #undef VTABLE 679 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16 680 681 #else /* not XML_MIN_SIZE */ 682 683 #undef PREFIX 684 #define PREFIX(ident) little2_ ## ident 685 #define MINBPC(enc) 2 686 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. 
*/
#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)

/* Instantiate the scanners again, this time as little2_<ident>, using
   the little-endian macros defined just above.
*/
#include "xmltok_impl.c"

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */

/* Little-endian UTF-16 encoding tables.  The last value of the first
   initializer group is 1 only when BYTEORDER == 1234.  The _ns variant
   includes the ASCII table unchanged; the plain variant includes it
   with BT_COLON redefined as BT_NMSTRT.
*/
#ifdef XML_NS

static const struct normal_encoding little2_encoding_ns = {
  { VTABLE, 2, 0,
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

#endif

static const struct normal_encoding little2_encoding = {
  { VTABLE, 2, 0,
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

/* The internal little-endian tables are compiled unless BYTEORDER is
   4321; they use iasciitab.h instead of asciitab.h.
*/
#if BYTEORDER != 4321

#ifdef XML_NS

static const struct normal_encoding internal_little2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

#endif

static const struct normal_encoding internal_little2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

#endif


/* Big-endian UTF-16 character tests.  A code unit whose high byte
   (p[0]) is zero is classified through the encoding's byte table;
   anything else goes through unicode_byte_type.
*/
#define BIG2_BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])

#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE builds reuse the normal_ scanners and reach the
   big-endian tests through these function wrappers.
*/
static int PTRFASTCALL
big2_byteType(const ENCODING *enc, const char *p)
{
  return BIG2_BYTE_TYPE(enc, p);
}

static int PTRFASTCALL
big2_byteToAscii(const ENCODING *enc, const char *p)
{
  return BIG2_BYTE_TO_ASCII(enc, p);
}

static int PTRCALL
big2_charMatches(const ENCODING *enc, const char *p, int c)
{
  return BIG2_CHAR_MATCHES(enc, p, c);
}

static int PTRFASTCALL
big2_isNameMin(const ENCODING *enc, const char *p)
{
  return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
}

static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING *enc, const char *p)
{
  return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

#undef VTABLE
#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

#undef PREFIX
#define PREFIX(ident) big2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available.
*/ 826 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p) 827 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p) 828 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c) 829 #define IS_NAME_CHAR(enc, p, n) 0 830 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p) 831 #define IS_NMSTRT_CHAR(enc, p, n) (0) 832 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) 833 834 #include "xmltok_impl.c" 835 836 #undef MINBPC 837 #undef BYTE_TYPE 838 #undef BYTE_TO_ASCII 839 #undef CHAR_MATCHES 840 #undef IS_NAME_CHAR 841 #undef IS_NAME_CHAR_MINBPC 842 #undef IS_NMSTRT_CHAR 843 #undef IS_NMSTRT_CHAR_MINBPC 844 #undef IS_INVALID_CHAR 845 846 #endif /* not XML_MIN_SIZE */ 847 848 #ifdef XML_NS 849 850 static const struct normal_encoding big2_encoding_ns = { 851 { VTABLE, 2, 0, 852 #if BYTEORDER == 4321 853 1 854 #else 855 0 856 #endif 857 }, 858 { 859 #include "asciitab.h" 860 #include "latin1tab.h" 861 }, 862 STANDARD_VTABLE(big2_) 863 }; 864 865 #endif 866 867 static const struct normal_encoding big2_encoding = { 868 { VTABLE, 2, 0, 869 #if BYTEORDER == 4321 870 1 871 #else 872 0 873 #endif 874 }, 875 { 876 #define BT_COLON BT_NMSTRT 877 #include "asciitab.h" 878 #undef BT_COLON 879 #include "latin1tab.h" 880 }, 881 STANDARD_VTABLE(big2_) 882 }; 883 884 #if BYTEORDER != 1234 885 886 #ifdef XML_NS 887 888 static const struct normal_encoding internal_big2_encoding_ns = { 889 { VTABLE, 2, 0, 1 }, 890 { 891 #include "iasciitab.h" 892 #include "latin1tab.h" 893 }, 894 STANDARD_VTABLE(big2_) 895 }; 896 897 #endif 898 899 static const struct normal_encoding internal_big2_encoding = { 900 { VTABLE, 2, 0, 1 }, 901 { 902 #define BT_COLON BT_NMSTRT 903 #include "iasciitab.h" 904 #undef BT_COLON 905 #include "latin1tab.h" 906 }, 907 STANDARD_VTABLE(big2_) 908 }; 909 910 #endif 911 912 #undef PREFIX 913 914 static int FASTCALL 915 streqci(const char *s1, const char *s2) 916 { 917 for (;;) { 918 char c1 = *s1++; 919 char c2 = *s2++; 920 if 
(ASCII_a <= c1 && c1 <= ASCII_z) 921 c1 += ASCII_A - ASCII_a; 922 if (ASCII_a <= c2 && c2 <= ASCII_z) 923 c2 += ASCII_A - ASCII_a; 924 if (c1 != c2) 925 return 0; 926 if (!c1) 927 break; 928 } 929 return 1; 930 } 931 932 static void PTRCALL 933 initUpdatePosition(const ENCODING *enc, const char *ptr, 934 const char *end, POSITION *pos) 935 { 936 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); 937 } 938 939 static int 940 toAscii(const ENCODING *enc, const char *ptr, const char *end) 941 { 942 char buf[1]; 943 char *p = buf; 944 XmlUtf8Convert(enc, &ptr, end, &p, p + 1); 945 if (p == buf) 946 return -1; 947 else 948 return buf[0]; 949 } 950 951 static int FASTCALL 952 isSpace(int c) 953 { 954 switch (c) { 955 case 0x20: 956 case 0xD: 957 case 0xA: 958 case 0x9: 959 return 1; 960 } 961 return 0; 962 } 963 964 /* Return 1 if there's just optional white space or there's an S 965 followed by name=val. 966 */ 967 static int 968 parsePseudoAttribute(const ENCODING *enc, 969 const char *ptr, 970 const char *end, 971 const char **namePtr, 972 const char **nameEndPtr, 973 const char **valPtr, 974 const char **nextTokPtr) 975 { 976 int c; 977 char open; 978 if (ptr == end) { 979 *namePtr = NULL; 980 return 1; 981 } 982 if (!isSpace(toAscii(enc, ptr, end))) { 983 *nextTokPtr = ptr; 984 return 0; 985 } 986 do { 987 ptr += enc->minBytesPerChar; 988 } while (isSpace(toAscii(enc, ptr, end))); 989 if (ptr == end) { 990 *namePtr = NULL; 991 return 1; 992 } 993 *namePtr = ptr; 994 for (;;) { 995 c = toAscii(enc, ptr, end); 996 if (c == -1) { 997 *nextTokPtr = ptr; 998 return 0; 999 } 1000 if (c == ASCII_EQUALS) { 1001 *nameEndPtr = ptr; 1002 break; 1003 } 1004 if (isSpace(c)) { 1005 *nameEndPtr = ptr; 1006 do { 1007 ptr += enc->minBytesPerChar; 1008 } while (isSpace(c = toAscii(enc, ptr, end))); 1009 if (c != ASCII_EQUALS) { 1010 *nextTokPtr = ptr; 1011 return 0; 1012 } 1013 break; 1014 } 1015 ptr += enc->minBytesPerChar; 1016 } 1017 if (ptr == *namePtr) { 1018 *nextTokPtr 
= ptr; 1019 return 0; 1020 } 1021 ptr += enc->minBytesPerChar; 1022 c = toAscii(enc, ptr, end); 1023 while (isSpace(c)) { 1024 ptr += enc->minBytesPerChar; 1025 c = toAscii(enc, ptr, end); 1026 } 1027 if (c != ASCII_QUOT && c != ASCII_APOS) { 1028 *nextTokPtr = ptr; 1029 return 0; 1030 } 1031 open = (char)c; 1032 ptr += enc->minBytesPerChar; 1033 *valPtr = ptr; 1034 for (;; ptr += enc->minBytesPerChar) { 1035 c = toAscii(enc, ptr, end); 1036 if (c == open) 1037 break; 1038 if (!(ASCII_a <= c && c <= ASCII_z) 1039 && !(ASCII_A <= c && c <= ASCII_Z) 1040 && !(ASCII_0 <= c && c <= ASCII_9) 1041 && c != ASCII_PERIOD 1042 && c != ASCII_MINUS 1043 && c != ASCII_UNDERSCORE) { 1044 *nextTokPtr = ptr; 1045 return 0; 1046 } 1047 } 1048 *nextTokPtr = ptr + enc->minBytesPerChar; 1049 return 1; 1050 } 1051 1052 static const char KW_version[] = { 1053 ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0' 1054 }; 1055 1056 static const char KW_encoding[] = { 1057 ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0' 1058 }; 1059 1060 static const char KW_standalone[] = { 1061 ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o, 1062 ASCII_n, ASCII_e, '\0' 1063 }; 1064 1065 static const char KW_yes[] = { 1066 ASCII_y, ASCII_e, ASCII_s, '\0' 1067 }; 1068 1069 static const char KW_no[] = { 1070 ASCII_n, ASCII_o, '\0' 1071 }; 1072 1073 static int 1074 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, 1075 const char *, 1076 const char *), 1077 int isGeneralTextEntity, 1078 const ENCODING *enc, 1079 const char *ptr, 1080 const char *end, 1081 const char **badPtr, 1082 const char **versionPtr, 1083 const char **versionEndPtr, 1084 const char **encodingName, 1085 const ENCODING **encoding, 1086 int *standalone) 1087 { 1088 const char *val = NULL; 1089 const char *name = NULL; 1090 const char *nameEnd = NULL; 1091 ptr += 5 * enc->minBytesPerChar; 1092 end -= 2 * enc->minBytesPerChar; 1093 if 
(!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) 1094 || !name) { 1095 *badPtr = ptr; 1096 return 0; 1097 } 1098 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) { 1099 if (!isGeneralTextEntity) { 1100 *badPtr = name; 1101 return 0; 1102 } 1103 } 1104 else { 1105 if (versionPtr) 1106 *versionPtr = val; 1107 if (versionEndPtr) 1108 *versionEndPtr = ptr; 1109 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1110 *badPtr = ptr; 1111 return 0; 1112 } 1113 if (!name) { 1114 if (isGeneralTextEntity) { 1115 /* a TextDecl must have an EncodingDecl */ 1116 *badPtr = ptr; 1117 return 0; 1118 } 1119 return 1; 1120 } 1121 } 1122 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) { 1123 int c = toAscii(enc, val, end); 1124 if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) { 1125 *badPtr = val; 1126 return 0; 1127 } 1128 if (encodingName) 1129 *encodingName = val; 1130 if (encoding) 1131 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar); 1132 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1133 *badPtr = ptr; 1134 return 0; 1135 } 1136 if (!name) 1137 return 1; 1138 } 1139 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) 1140 || isGeneralTextEntity) { 1141 *badPtr = name; 1142 return 0; 1143 } 1144 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) { 1145 if (standalone) 1146 *standalone = 1; 1147 } 1148 else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) { 1149 if (standalone) 1150 *standalone = 0; 1151 } 1152 else { 1153 *badPtr = val; 1154 return 0; 1155 } 1156 while (isSpace(toAscii(enc, ptr, end))) 1157 ptr += enc->minBytesPerChar; 1158 if (ptr != end) { 1159 *badPtr = ptr; 1160 return 0; 1161 } 1162 return 1; 1163 } 1164 1165 static int FASTCALL 1166 checkCharRefNumber(int result) 1167 { 1168 switch (result >> 8) { 1169 case 0xD8: case 0xD9: case 0xDA: case 0xDB: 1170 case 0xDC: case 0xDD: case 0xDE: 
case 0xDF: 1171 return -1; 1172 case 0: 1173 if (latin1_encoding.type[result] == BT_NONXML) 1174 return -1; 1175 break; 1176 case 0xFF: 1177 if (result == 0xFFFE || result == 0xFFFF) 1178 return -1; 1179 break; 1180 } 1181 return result; 1182 } 1183 1184 int FASTCALL 1185 XmlUtf8Encode(int c, char *buf) 1186 { 1187 enum { 1188 /* minN is minimum legal resulting value for N byte sequence */ 1189 min2 = 0x80, 1190 min3 = 0x800, 1191 min4 = 0x10000 1192 }; 1193 1194 if (c < 0) 1195 return 0; 1196 if (c < min2) { 1197 buf[0] = (char)(c | UTF8_cval1); 1198 return 1; 1199 } 1200 if (c < min3) { 1201 buf[0] = (char)((c >> 6) | UTF8_cval2); 1202 buf[1] = (char)((c & 0x3f) | 0x80); 1203 return 2; 1204 } 1205 if (c < min4) { 1206 buf[0] = (char)((c >> 12) | UTF8_cval3); 1207 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80); 1208 buf[2] = (char)((c & 0x3f) | 0x80); 1209 return 3; 1210 } 1211 if (c < 0x110000) { 1212 buf[0] = (char)((c >> 18) | UTF8_cval4); 1213 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80); 1214 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80); 1215 buf[3] = (char)((c & 0x3f) | 0x80); 1216 return 4; 1217 } 1218 return 0; 1219 } 1220 1221 int FASTCALL 1222 XmlUtf16Encode(int charNum, unsigned short *buf) 1223 { 1224 if (charNum < 0) 1225 return 0; 1226 if (charNum < 0x10000) { 1227 buf[0] = (unsigned short)charNum; 1228 return 1; 1229 } 1230 if (charNum < 0x110000) { 1231 charNum -= 0x10000; 1232 buf[0] = (unsigned short)((charNum >> 10) + 0xD800); 1233 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00); 1234 return 2; 1235 } 1236 return 0; 1237 } 1238 1239 struct unknown_encoding { 1240 struct normal_encoding normal; 1241 CONVERTER convert; 1242 void *userData; 1243 unsigned short utf16[256]; 1244 char utf8[256][4]; 1245 }; 1246 1247 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc)) 1248 1249 int 1250 XmlSizeOfUnknownEncoding(void) 1251 { 1252 return sizeof(struct unknown_encoding); 1253 } 1254 1255 static int PTRFASTCALL 1256 unknown_isName(const 
ENCODING *enc, const char *p) 1257 { 1258 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1259 int c = uenc->convert(uenc->userData, p); 1260 if (c & ~0xFFFF) 1261 return 0; 1262 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF); 1263 } 1264 1265 static int PTRFASTCALL 1266 unknown_isNmstrt(const ENCODING *enc, const char *p) 1267 { 1268 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1269 int c = uenc->convert(uenc->userData, p); 1270 if (c & ~0xFFFF) 1271 return 0; 1272 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF); 1273 } 1274 1275 static int PTRFASTCALL 1276 unknown_isInvalid(const ENCODING *enc, const char *p) 1277 { 1278 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1279 int c = uenc->convert(uenc->userData, p); 1280 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; 1281 } 1282 1283 static void PTRCALL 1284 unknown_toUtf8(const ENCODING *enc, 1285 const char **fromP, const char *fromLim, 1286 char **toP, const char *toLim) 1287 { 1288 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1289 char buf[XML_UTF8_ENCODE_MAX]; 1290 for (;;) { 1291 const char *utf8; 1292 int n; 1293 if (*fromP == fromLim) 1294 break; 1295 utf8 = uenc->utf8[(unsigned char)**fromP]; 1296 n = *utf8++; 1297 if (n == 0) { 1298 int c = uenc->convert(uenc->userData, *fromP); 1299 n = XmlUtf8Encode(c, buf); 1300 if (n > toLim - *toP) 1301 break; 1302 utf8 = buf; 1303 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1304 - (BT_LEAD2 - 2)); 1305 } 1306 else { 1307 if (n > toLim - *toP) 1308 break; 1309 (*fromP)++; 1310 } 1311 do { 1312 *(*toP)++ = *utf8++; 1313 } while (--n != 0); 1314 } 1315 } 1316 1317 static void PTRCALL 1318 unknown_toUtf16(const ENCODING *enc, 1319 const char **fromP, const char *fromLim, 1320 unsigned short **toP, const unsigned short *toLim) 1321 { 1322 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1323 while (*fromP != fromLim && *toP != toLim) { 1324 unsigned 
short c = uenc->utf16[(unsigned char)**fromP]; 1325 if (c == 0) { 1326 c = (unsigned short) 1327 uenc->convert(uenc->userData, *fromP); 1328 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1329 - (BT_LEAD2 - 2)); 1330 } 1331 else 1332 (*fromP)++; 1333 *(*toP)++ = c; 1334 } 1335 } 1336 1337 ENCODING * 1338 XmlInitUnknownEncoding(void *mem, 1339 int *table, 1340 CONVERTER convert, 1341 void *userData) 1342 { 1343 int i; 1344 struct unknown_encoding *e = (struct unknown_encoding *)mem; 1345 for (i = 0; i < (int)sizeof(struct normal_encoding); i++) 1346 ((char *)mem)[i] = ((char *)&latin1_encoding)[i]; 1347 for (i = 0; i < 128; i++) 1348 if (latin1_encoding.type[i] != BT_OTHER 1349 && latin1_encoding.type[i] != BT_NONXML 1350 && table[i] != i) 1351 return 0; 1352 for (i = 0; i < 256; i++) { 1353 int c = table[i]; 1354 if (c == -1) { 1355 e->normal.type[i] = BT_MALFORM; 1356 /* This shouldn't really get used. */ 1357 e->utf16[i] = 0xFFFF; 1358 e->utf8[i][0] = 1; 1359 e->utf8[i][1] = 0; 1360 } 1361 else if (c < 0) { 1362 if (c < -4) 1363 return 0; 1364 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2)); 1365 e->utf8[i][0] = 0; 1366 e->utf16[i] = 0; 1367 } 1368 else if (c < 0x80) { 1369 if (latin1_encoding.type[c] != BT_OTHER 1370 && latin1_encoding.type[c] != BT_NONXML 1371 && c != i) 1372 return 0; 1373 e->normal.type[i] = latin1_encoding.type[c]; 1374 e->utf8[i][0] = 1; 1375 e->utf8[i][1] = (char)c; 1376 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c); 1377 } 1378 else if (checkCharRefNumber(c) < 0) { 1379 e->normal.type[i] = BT_NONXML; 1380 /* This shouldn't really get used. 
*/ 1381 e->utf16[i] = 0xFFFF; 1382 e->utf8[i][0] = 1; 1383 e->utf8[i][1] = 0; 1384 } 1385 else { 1386 if (c > 0xFFFF) 1387 return 0; 1388 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)) 1389 e->normal.type[i] = BT_NMSTRT; 1390 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)) 1391 e->normal.type[i] = BT_NAME; 1392 else 1393 e->normal.type[i] = BT_OTHER; 1394 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); 1395 e->utf16[i] = (unsigned short)c; 1396 } 1397 } 1398 e->userData = userData; 1399 e->convert = convert; 1400 if (convert) { 1401 e->normal.isName2 = unknown_isName; 1402 e->normal.isName3 = unknown_isName; 1403 e->normal.isName4 = unknown_isName; 1404 e->normal.isNmstrt2 = unknown_isNmstrt; 1405 e->normal.isNmstrt3 = unknown_isNmstrt; 1406 e->normal.isNmstrt4 = unknown_isNmstrt; 1407 e->normal.isInvalid2 = unknown_isInvalid; 1408 e->normal.isInvalid3 = unknown_isInvalid; 1409 e->normal.isInvalid4 = unknown_isInvalid; 1410 } 1411 e->normal.enc.utf8Convert = unknown_toUtf8; 1412 e->normal.enc.utf16Convert = unknown_toUtf16; 1413 return &(e->normal.enc); 1414 } 1415 1416 /* If this enumeration is changed, getEncodingIndex and encodings 1417 must also be changed. 
*/ 1418 enum { 1419 UNKNOWN_ENC = -1, 1420 ISO_8859_1_ENC = 0, 1421 US_ASCII_ENC, 1422 UTF_8_ENC, 1423 UTF_16_ENC, 1424 UTF_16BE_ENC, 1425 UTF_16LE_ENC, 1426 /* must match encodingNames up to here */ 1427 NO_ENC 1428 }; 1429 1430 static const char KW_ISO_8859_1[] = { 1431 ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, 1432 ASCII_MINUS, ASCII_1, '\0' 1433 }; 1434 static const char KW_US_ASCII[] = { 1435 ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, 1436 '\0' 1437 }; 1438 static const char KW_UTF_8[] = { 1439 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0' 1440 }; 1441 static const char KW_UTF_16[] = { 1442 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0' 1443 }; 1444 static const char KW_UTF_16BE[] = { 1445 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, 1446 '\0' 1447 }; 1448 static const char KW_UTF_16LE[] = { 1449 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, 1450 '\0' 1451 }; 1452 1453 static int FASTCALL 1454 getEncodingIndex(const char *name) 1455 { 1456 static const char * const encodingNames[] = { 1457 KW_ISO_8859_1, 1458 KW_US_ASCII, 1459 KW_UTF_8, 1460 KW_UTF_16, 1461 KW_UTF_16BE, 1462 KW_UTF_16LE, 1463 }; 1464 int i; 1465 if (name == NULL) 1466 return NO_ENC; 1467 for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++) 1468 if (streqci(name, encodingNames[i])) 1469 return i; 1470 return UNKNOWN_ENC; 1471 } 1472 1473 /* For binary compatibility, we store the index of the encoding 1474 specified at initialization in the isUtf16 member. 1475 */ 1476 1477 #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16) 1478 #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i) 1479 1480 /* This is what detects the encoding. 
encodingTable maps from 1481 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of 1482 the external (protocol) specified encoding; state is 1483 XML_CONTENT_STATE if we're parsing an external text entity, and 1484 XML_PROLOG_STATE otherwise. 1485 */ 1486 1487 1488 static int 1489 initScan(const ENCODING * const *encodingTable, 1490 const INIT_ENCODING *enc, 1491 int state, 1492 const char *ptr, 1493 const char *end, 1494 const char **nextTokPtr) 1495 { 1496 const ENCODING **encPtr; 1497 1498 if (ptr == end) 1499 return XML_TOK_NONE; 1500 encPtr = enc->encPtr; 1501 if (ptr + 1 == end) { 1502 /* only a single byte available for auto-detection */ 1503 #ifndef XML_DTD /* FIXME */ 1504 /* a well-formed document entity must have more than one byte */ 1505 if (state != XML_CONTENT_STATE) 1506 return XML_TOK_PARTIAL; 1507 #endif 1508 /* so we're parsing an external text entity... */ 1509 /* if UTF-16 was externally specified, then we need at least 2 bytes */ 1510 switch (INIT_ENC_INDEX(enc)) { 1511 case UTF_16_ENC: 1512 case UTF_16LE_ENC: 1513 case UTF_16BE_ENC: 1514 return XML_TOK_PARTIAL; 1515 } 1516 switch ((unsigned char)*ptr) { 1517 case 0xFE: 1518 case 0xFF: 1519 case 0xEF: /* possibly first byte of UTF-8 BOM */ 1520 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC 1521 && state == XML_CONTENT_STATE) 1522 break; 1523 /* fall through */ 1524 case 0x00: 1525 case 0x3C: 1526 return XML_TOK_PARTIAL; 1527 } 1528 } 1529 else { 1530 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) { 1531 case 0xFEFF: 1532 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC 1533 && state == XML_CONTENT_STATE) 1534 break; 1535 *nextTokPtr = ptr + 2; 1536 *encPtr = encodingTable[UTF_16BE_ENC]; 1537 return XML_TOK_BOM; 1538 /* 00 3C is handled in the default case */ 1539 case 0x3C00: 1540 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC 1541 || INIT_ENC_INDEX(enc) == UTF_16_ENC) 1542 && state == XML_CONTENT_STATE) 1543 break; 1544 *encPtr = encodingTable[UTF_16LE_ENC]; 1545 return 
XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1546 case 0xFFFE: 1547 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC 1548 && state == XML_CONTENT_STATE) 1549 break; 1550 *nextTokPtr = ptr + 2; 1551 *encPtr = encodingTable[UTF_16LE_ENC]; 1552 return XML_TOK_BOM; 1553 case 0xEFBB: 1554 /* Maybe a UTF-8 BOM (EF BB BF) */ 1555 /* If there's an explicitly specified (external) encoding 1556 of ISO-8859-1 or some flavour of UTF-16 1557 and this is an external text entity, 1558 don't look for the BOM, 1559 because it might be a legal data. 1560 */ 1561 if (state == XML_CONTENT_STATE) { 1562 int e = INIT_ENC_INDEX(enc); 1563 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC 1564 || e == UTF_16LE_ENC || e == UTF_16_ENC) 1565 break; 1566 } 1567 if (ptr + 2 == end) 1568 return XML_TOK_PARTIAL; 1569 if ((unsigned char)ptr[2] == 0xBF) { 1570 *nextTokPtr = ptr + 3; 1571 *encPtr = encodingTable[UTF_8_ENC]; 1572 return XML_TOK_BOM; 1573 } 1574 break; 1575 default: 1576 if (ptr[0] == '\0') { 1577 /* 0 isn't a legal data character. Furthermore a document 1578 entity can only start with ASCII characters. So the only 1579 way this can fail to be big-endian UTF-16 if it it's an 1580 external parsed general entity that's labelled as 1581 UTF-16LE. 1582 */ 1583 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC) 1584 break; 1585 *encPtr = encodingTable[UTF_16BE_ENC]; 1586 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1587 } 1588 else if (ptr[1] == '\0') { 1589 /* We could recover here in the case: 1590 - parsing an external entity 1591 - second byte is 0 1592 - no externally specified encoding 1593 - no encoding declaration 1594 by assuming UTF-16LE. But we don't, because this would mean when 1595 presented just with a single byte, we couldn't reliably determine 1596 whether we needed further bytes. 
1597 */ 1598 if (state == XML_CONTENT_STATE) 1599 break; 1600 *encPtr = encodingTable[UTF_16LE_ENC]; 1601 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1602 } 1603 break; 1604 } 1605 } 1606 *encPtr = encodingTable[INIT_ENC_INDEX(enc)]; 1607 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1608 } 1609 1610 1611 #define NS(x) x 1612 #define ns(x) x 1613 #include "xmltok_ns.c" 1614 #undef NS 1615 #undef ns 1616 1617 #ifdef XML_NS 1618 1619 #define NS(x) x ## NS 1620 #define ns(x) x ## _ns 1621 1622 #include "xmltok_ns.c" 1623 1624 #undef NS 1625 #undef ns 1626 1627 ENCODING * 1628 XmlInitUnknownEncodingNS(void *mem, 1629 int *table, 1630 CONVERTER convert, 1631 void *userData) 1632 { 1633 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData); 1634 if (enc) 1635 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON; 1636 return enc; 1637 } 1638 1639 #endif /* XML_NS */ 1640