1 /* 2 * _codecs_iso2022.c: Codecs collection for ISO-2022 encodings. 3 * 4 * Written by Hye-Shik Chang <perky (at) FreeBSD.org> 5 */ 6 7 #define USING_IMPORTED_MAPS 8 #define USING_BINARY_PAIR_SEARCH 9 #define EXTERN_JISX0213_PAIR 10 #define EMULATE_JISX0213_2000_ENCODE_INVALID MAP_UNMAPPABLE 11 #define EMULATE_JISX0213_2000_DECODE_INVALID MAP_UNMAPPABLE 12 13 #include "cjkcodecs.h" 14 #include "alg_jisx0201.h" 15 #include "emu_jisx0213_2000.h" 16 #include "mappings_jisx0213_pair.h" 17 18 /* STATE 19 20 state->c[0-3] 21 22 00000000 23 ||^^^^^| 24 |+-----+---- G0-3 Character Set 25 +----------- Is G0-3 double byte? 26 27 state->c[4] 28 29 00000000 30 || 31 |+---- Locked-Shift? 32 +----- ESC Throughout 33 */ 34 35 #define ESC 0x1B 36 #define SO 0x0E 37 #define SI 0x0F 38 #define LF 0x0A 39 40 #define MAX_ESCSEQLEN 16 41 42 #define CHARSET_ISO8859_1 'A' 43 #define CHARSET_ASCII 'B' 44 #define CHARSET_ISO8859_7 'F' 45 #define CHARSET_JISX0201_K 'I' 46 #define CHARSET_JISX0201_R 'J' 47 48 #define CHARSET_GB2312 ('A'|CHARSET_DBCS) 49 #define CHARSET_JISX0208 ('B'|CHARSET_DBCS) 50 #define CHARSET_KSX1001 ('C'|CHARSET_DBCS) 51 #define CHARSET_JISX0212 ('D'|CHARSET_DBCS) 52 #define CHARSET_GB2312_8565 ('E'|CHARSET_DBCS) 53 #define CHARSET_CNS11643_1 ('G'|CHARSET_DBCS) 54 #define CHARSET_CNS11643_2 ('H'|CHARSET_DBCS) 55 #define CHARSET_JISX0213_2000_1 ('O'|CHARSET_DBCS) 56 #define CHARSET_JISX0213_2 ('P'|CHARSET_DBCS) 57 #define CHARSET_JISX0213_2004_1 ('Q'|CHARSET_DBCS) 58 #define CHARSET_JISX0208_O ('@'|CHARSET_DBCS) 59 60 #define CHARSET_DBCS 0x80 61 #define ESCMARK(mark) ((mark) & 0x7f) 62 63 #define IS_ESCEND(c) (((c) >= 'A' && (c) <= 'Z') || (c) == '@') 64 #define IS_ISO2022ESC(c2) \ 65 ((c2) == '(' || (c2) == ')' || (c2) == '$' || \ 66 (c2) == '.' || (c2) == '&') 67 /* this is not a complete list of ISO-2022 escape sequence headers. 68 * but, it's enough to implement CJK instances of iso-2022. */ 69 70 #define MAP_UNMAPPABLE 0xFFFF 71 #define MAP_MULTIPLE_AVAIL 0xFFFE /* for JIS X 0213 */ 72 73 #define F_SHIFTED 0x01 74 #define F_ESCTHROUGHOUT 0x02 75 76 #define STATE_SETG(dn, v) do { ((state)->c[dn]) = (v); } while (0) 77 #define STATE_GETG(dn) ((state)->c[dn]) 78 79 #define STATE_G0 STATE_GETG(0) 80 #define STATE_G1 STATE_GETG(1) 81 #define STATE_G2 STATE_GETG(2) 82 #define STATE_G3 STATE_GETG(3) 83 #define STATE_SETG0(v) STATE_SETG(0, v) 84 #define STATE_SETG1(v) STATE_SETG(1, v) 85 #define STATE_SETG2(v) STATE_SETG(2, v) 86 #define STATE_SETG3(v) STATE_SETG(3, v) 87 88 #define STATE_SETFLAG(f) do { ((state)->c[4]) |= (f); } while (0) 89 #define STATE_GETFLAG(f) ((state)->c[4] & (f)) 90 #define STATE_CLEARFLAG(f) do { ((state)->c[4]) &= ~(f); } while (0) 91 #define STATE_CLEARFLAGS() do { ((state)->c[4]) = 0; } while (0) 92 93 #define ISO2022_CONFIG ((const struct iso2022_config *)config) 94 #define CONFIG_ISSET(flag) (ISO2022_CONFIG->flags & (flag)) 95 #define CONFIG_DESIGNATIONS (ISO2022_CONFIG->designations) 96 97 /* iso2022_config.flags */ 98 #define NO_SHIFT 0x01 99 #define USE_G2 0x02 100 #define USE_JISX0208_EXT 0x04 101 102 /*-*- internal data structures -*-*/ 103 104 typedef int (*iso2022_init_func)(void); 105 typedef Py_UCS4 (*iso2022_decode_func)(const unsigned char *data); 106 typedef DBCHAR (*iso2022_encode_func)(const Py_UCS4 *data, Py_ssize_t *length); 107 108 struct iso2022_designation { 109 unsigned char mark; 110 unsigned char plane; 111 unsigned char width; 112 iso2022_init_func initializer; 113 iso2022_decode_func decoder; 114 iso2022_encode_func encoder; 115 }; 116 117 struct iso2022_config { 118 int flags; 119 const struct iso2022_designation *designations; /* non-ascii desigs */ 120 }; 121 122 /*-*- iso-2022 codec implementation -*-*/ 123 124 CODEC_INIT(iso2022) 125 { 126 const struct iso2022_designation *desig; 127 for (desig = CONFIG_DESIGNATIONS; desig->mark; desig++) 128 if (desig->initializer != NULL && desig->initializer() != 0) 129 return -1; 130 return 0; 131 } 132 133 ENCODER_INIT(iso2022) 134 { 135 STATE_CLEARFLAGS(); 136 STATE_SETG0(CHARSET_ASCII); 137 STATE_SETG1(CHARSET_ASCII); 138 return 0; 139 } 140 141 ENCODER_RESET(iso2022) 142 { 143 if (STATE_GETFLAG(F_SHIFTED)) { 144 WRITEBYTE1(SI); 145 NEXT_OUT(1); 146 STATE_CLEARFLAG(F_SHIFTED); 147 } 148 if (STATE_G0 != CHARSET_ASCII) { 149 WRITEBYTE3(ESC, '(', 'B'); 150 NEXT_OUT(3); 151 STATE_SETG0(CHARSET_ASCII); 152 } 153 return 0; 154 } 155 156 ENCODER(iso2022) 157 { 158 while (*inpos < inlen) { 159 const struct iso2022_designation *dsg; 160 DBCHAR encoded; 161 Py_UCS4 c = INCHAR1; 162 Py_ssize_t insize; 163 164 if (c < 0x80) { 165 if (STATE_G0 != CHARSET_ASCII) { 166 WRITEBYTE3(ESC, '(', 'B'); 167 STATE_SETG0(CHARSET_ASCII); 168 NEXT_OUT(3); 169 } 170 if (STATE_GETFLAG(F_SHIFTED)) { 171 WRITEBYTE1(SI); 172 STATE_CLEARFLAG(F_SHIFTED); 173 NEXT_OUT(1); 174 } 175 WRITEBYTE1((unsigned char)c); 176 NEXT(1, 1); 177 continue; 178 } 179 180 insize = 1; 181 182 encoded = MAP_UNMAPPABLE; 183 for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) { 184 Py_ssize_t length = 1; 185 encoded = dsg->encoder(&c, &length); 186 if (encoded == MAP_MULTIPLE_AVAIL) { 187 /* this implementation won't work for pair 188 * of non-bmp characters. */ 189 if (inlen - *inpos < 2) { 190 if (!(flags & MBENC_FLUSH)) 191 return MBERR_TOOFEW; 192 length = -1; 193 } 194 else 195 length = 2; 196 encoded = dsg->encoder(&c, &length); 197 if (encoded != MAP_UNMAPPABLE) { 198 insize = length; 199 break; 200 } 201 } 202 else if (encoded != MAP_UNMAPPABLE) 203 break; 204 } 205 206 if (!dsg->mark) 207 return 1; 208 assert(dsg->width == 1 || dsg->width == 2); 209 210 switch (dsg->plane) { 211 case 0: /* G0 */ 212 if (STATE_GETFLAG(F_SHIFTED)) { 213 WRITEBYTE1(SI); 214 STATE_CLEARFLAG(F_SHIFTED); 215 NEXT_OUT(1); 216 } 217 if (STATE_G0 != dsg->mark) { 218 if (dsg->width == 1) { 219 WRITEBYTE3(ESC, '(', ESCMARK(dsg->mark)); 220 STATE_SETG0(dsg->mark); 221 NEXT_OUT(3); 222 } 223 else if (dsg->mark == CHARSET_JISX0208) { 224 WRITEBYTE3(ESC, '$', ESCMARK(dsg->mark)); 225 STATE_SETG0(dsg->mark); 226 NEXT_OUT(3); 227 } 228 else { 229 WRITEBYTE4(ESC, '$', '(', 230 ESCMARK(dsg->mark)); 231 STATE_SETG0(dsg->mark); 232 NEXT_OUT(4); 233 } 234 } 235 break; 236 case 1: /* G1 */ 237 if (STATE_G1 != dsg->mark) { 238 if (dsg->width == 1) { 239 WRITEBYTE3(ESC, ')', ESCMARK(dsg->mark)); 240 STATE_SETG1(dsg->mark); 241 NEXT_OUT(3); 242 } 243 else { 244 WRITEBYTE4(ESC, '$', ')', ESCMARK(dsg->mark)); 245 STATE_SETG1(dsg->mark); 246 NEXT_OUT(4); 247 } 248 } 249 if (!STATE_GETFLAG(F_SHIFTED)) { 250 WRITEBYTE1(SO); 251 STATE_SETFLAG(F_SHIFTED); 252 NEXT_OUT(1); 253 } 254 break; 255 default: /* G2 and G3 is not supported: no encoding in 256 * CJKCodecs are using them yet */ 257 return MBERR_INTERNAL; 258 } 259 260 if (dsg->width == 1) { 261 WRITEBYTE1((unsigned char)encoded); 262 NEXT_OUT(1); 263 } 264 else { 265 WRITEBYTE2(encoded >> 8, encoded & 0xff); 266 NEXT_OUT(2); 267 } 268 NEXT_INCHAR(insize); 269 } 270 271 return 0; 272 } 273 274 DECODER_INIT(iso2022) 275 { 276 STATE_CLEARFLAGS(); 277 STATE_SETG0(CHARSET_ASCII); 278 STATE_SETG1(CHARSET_ASCII); 279 STATE_SETG2(CHARSET_ASCII); 280 return 0; 281 } 282 283 DECODER_RESET(iso2022) 284 { 285 STATE_SETG0(CHARSET_ASCII); 286 STATE_CLEARFLAG(F_SHIFTED); 287 return 0; 288 } 289 290 static Py_ssize_t 291 iso2022processesc(const void *config, MultibyteCodec_State *state, 292 const unsigned char **inbuf, Py_ssize_t *inleft) 293 { 294 unsigned char charset, designation; 295 Py_ssize_t i, esclen = 0; 296 297 for (i = 1;i < MAX_ESCSEQLEN;i++) { 298 if (i >= *inleft) 299 return MBERR_TOOFEW; 300 if (IS_ESCEND((*inbuf)[i])) { 301 esclen = i + 1; 302 break; 303 } 304 else if (CONFIG_ISSET(USE_JISX0208_EXT) && i+1 < *inleft && 305 (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@') { 306 i += 2; 307 } 308 } 309 310 switch (esclen) { 311 case 0: 312 return 1; /* unterminated escape sequence */ 313 case 3: 314 if (INBYTE2 == '$') { 315 charset = INBYTE3 | CHARSET_DBCS; 316 designation = 0; 317 } 318 else { 319 charset = INBYTE3; 320 if (INBYTE2 == '(') 321 designation = 0; 322 else if (INBYTE2 == ')') 323 designation = 1; 324 else if (CONFIG_ISSET(USE_G2) && INBYTE2 == '.') 325 designation = 2; 326 else 327 return 3; 328 } 329 break; 330 case 4: 331 if (INBYTE2 != '$') 332 return 4; 333 334 charset = INBYTE4 | CHARSET_DBCS; 335 if (INBYTE3 == '(') 336 designation = 0; 337 else if (INBYTE3 == ')') 338 designation = 1; 339 else 340 return 4; 341 break; 342 case 6: /* designation with prefix */ 343 if (CONFIG_ISSET(USE_JISX0208_EXT) && 344 (*inbuf)[3] == ESC && (*inbuf)[4] == '$' && 345 (*inbuf)[5] == 'B') { 346 charset = 'B' | CHARSET_DBCS; 347 designation = 0; 348 } 349 else 350 return 6; 351 break; 352 default: 353 return esclen; 354 } 355 356 /* raise error when the charset is not designated for this encoding */ 357 if (charset != CHARSET_ASCII) { 358 const struct iso2022_designation *dsg; 359 360 for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) { 361 if (dsg->mark == charset) 362 break; 363 } 364 if (!dsg->mark) 365 return esclen; 366 } 367 368 STATE_SETG(designation, charset); 369 *inleft -= esclen; 370 (*inbuf) += esclen; 371 return 0; 372 } 373 374 #define ISO8859_7_DECODE(c, writer) \ 375 if ((c) < 0xa0) { \ 376 OUTCHAR(c); \ 377 } else if ((c) < 0xc0 && (0x288f3bc9L & (1L << ((c)-0xa0)))) { \ 378 OUTCHAR(c); \ 379 } else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 || \ 380 (0xbffffd77L & (1L << ((c)-0xb4))))) { \ 381 OUTCHAR(0x02d0 + (c)); \ 382 } else if ((c) == 0xa1) { \ 383 OUTCHAR(0x2018); \ 384 } else if ((c) == 0xa2) { \ 385 OUTCHAR(0x2019); \ 386 } else if ((c) == 0xaf) { \ 387 OUTCHAR(0x2015); \ 388 } 389 390 static Py_ssize_t 391 iso2022processg2(const void *config, MultibyteCodec_State *state, 392 const unsigned char **inbuf, Py_ssize_t *inleft, 393 _PyUnicodeWriter *writer) 394 { 395 /* not written to use encoder, decoder functions because only few 396 * encodings use G2 designations in CJKCodecs */ 397 if (STATE_G2 == CHARSET_ISO8859_1) { 398 if (INBYTE3 < 0x80) 399 OUTCHAR(INBYTE3 + 0x80); 400 else 401 return 3; 402 } 403 else if (STATE_G2 == CHARSET_ISO8859_7) { 404 ISO8859_7_DECODE(INBYTE3 ^ 0x80, writer) 405 else 406 return 3; 407 } 408 else if (STATE_G2 == CHARSET_ASCII) { 409 if (INBYTE3 & 0x80) 410 return 3; 411 else 412 OUTCHAR(INBYTE3); 413 } 414 else 415 return MBERR_INTERNAL; 416 417 (*inbuf) += 3; 418 *inleft -= 3; 419 return 0; 420 } 421 422 DECODER(iso2022) 423 { 424 const struct iso2022_designation *dsgcache = NULL; 425 426 while (inleft > 0) { 427 unsigned char c = INBYTE1; 428 Py_ssize_t err; 429 430 if (STATE_GETFLAG(F_ESCTHROUGHOUT)) { 431 /* ESC throughout mode: 432 * for non-iso2022 escape sequences */ 433 OUTCHAR(c); /* assume as ISO-8859-1 */ 434 NEXT_IN(1); 435 if (IS_ESCEND(c)) { 436 STATE_CLEARFLAG(F_ESCTHROUGHOUT); 437 } 438 continue; 439 } 440 441 switch (c) { 442 case ESC: 443 REQUIRE_INBUF(2); 444 if (IS_ISO2022ESC(INBYTE2)) { 445 err = iso2022processesc(config, state, 446 inbuf, &inleft); 447 if (err != 0) 448 return err; 449 } 450 else if (CONFIG_ISSET(USE_G2) && INBYTE2 == 'N') {/* SS2 */ 451 REQUIRE_INBUF(3); 452 err = iso2022processg2(config, state, 453 inbuf, &inleft, writer); 454 if (err != 0) 455 return err; 456 } 457 else { 458 OUTCHAR(ESC); 459 STATE_SETFLAG(F_ESCTHROUGHOUT); 460 NEXT_IN(1); 461 } 462 break; 463 case SI: 464 if (CONFIG_ISSET(NO_SHIFT)) 465 goto bypass; 466 STATE_CLEARFLAG(F_SHIFTED); 467 NEXT_IN(1); 468 break; 469 case SO: 470 if (CONFIG_ISSET(NO_SHIFT)) 471 goto bypass; 472 STATE_SETFLAG(F_SHIFTED); 473 NEXT_IN(1); 474 break; 475 case LF: 476 STATE_CLEARFLAG(F_SHIFTED); 477 OUTCHAR(LF); 478 NEXT_IN(1); 479 break; 480 default: 481 if (c < 0x20) /* C0 */ 482 goto bypass; 483 else if (c >= 0x80) 484 return 1; 485 else { 486 const struct iso2022_designation *dsg; 487 unsigned char charset; 488 Py_UCS4 decoded; 489 490 if (STATE_GETFLAG(F_SHIFTED)) 491 charset = STATE_G1; 492 else 493 charset = STATE_G0; 494 495 if (charset == CHARSET_ASCII) { 496 bypass: 497 OUTCHAR(c); 498 NEXT_IN(1); 499 break; 500 } 501 502 if (dsgcache != NULL && 503 dsgcache->mark == charset) 504 dsg = dsgcache; 505 else { 506 for (dsg = CONFIG_DESIGNATIONS; 507 dsg->mark != charset 508 #ifdef Py_DEBUG 509 && dsg->mark != '\0' 510 #endif 511 ; dsg++) 512 { 513 /* noop */ 514 } 515 assert(dsg->mark != '\0'); 516 dsgcache = dsg; 517 } 518 519 REQUIRE_INBUF(dsg->width); 520 decoded = dsg->decoder(*inbuf); 521 if (decoded == MAP_UNMAPPABLE) 522 return dsg->width; 523 524 if (decoded < 0x10000) { 525 OUTCHAR(decoded); 526 } 527 else if (decoded < 0x30000) { 528 OUTCHAR(decoded); 529 } 530 else { /* JIS X 0213 pairs */ 531 OUTCHAR2(decoded >> 16, decoded & 0xffff); 532 } 533 NEXT_IN(dsg->width); 534 } 535 break; 536 } 537 } 538 return 0; 539 } 540 541 /*-*- mapping table holders -*-*/ 542 543 #define ENCMAP(enc) static const encode_map *enc##_encmap = NULL; 544 #define DECMAP(enc) static const decode_map *enc##_decmap = NULL; 545 546 /* kr */ 547 ENCMAP(cp949) 548 DECMAP(ksx1001) 549 550 /* jp */ 551 ENCMAP(jisxcommon) 552 DECMAP(jisx0208) 553 DECMAP(jisx0212) 554 ENCMAP(jisx0213_bmp) 555 DECMAP(jisx0213_1_bmp) 556 DECMAP(jisx0213_2_bmp) 557 ENCMAP(jisx0213_emp) 558 DECMAP(jisx0213_1_emp) 559 DECMAP(jisx0213_2_emp) 560 561 /* cn */ 562 ENCMAP(gbcommon) 563 DECMAP(gb2312) 564 565 /* tw */ 566 567 /*-*- mapping access functions -*-*/ 568 569 static int 570 ksx1001_init(void) 571 { 572 static int initialized = 0; 573 574 if (!initialized && ( 575 IMPORT_MAP(kr, cp949, &cp949_encmap, NULL) || 576 IMPORT_MAP(kr, ksx1001, NULL, &ksx1001_decmap))) 577 return -1; 578 initialized = 1; 579 return 0; 580 } 581 582 static Py_UCS4 583 ksx1001_decoder(const unsigned char *data) 584 { 585 Py_UCS4 u; 586 if (TRYMAP_DEC(ksx1001, u, data[0], data[1])) 587 return u; 588 else 589 return MAP_UNMAPPABLE; 590 } 591 592 static DBCHAR 593 ksx1001_encoder(const Py_UCS4 *data, Py_ssize_t *length) 594 { 595 DBCHAR coded; 596 assert(*length == 1); 597 if (*data < 0x10000) { 598 if (TRYMAP_ENC(cp949, coded, *data)) { 599 if (!(coded & 0x8000)) 600 return coded; 601 } 602 } 603 return MAP_UNMAPPABLE; 604 } 605 606 static int 607 jisx0208_init(void) 608 { 609 static int initialized = 0; 610 611 if (!initialized && ( 612 IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) || 613 IMPORT_MAP(jp, jisx0208, NULL, &jisx0208_decmap))) 614 return -1; 615 initialized = 1; 616 return 0; 617 } 618 619 static Py_UCS4 620 jisx0208_decoder(const unsigned char *data) 621 { 622 Py_UCS4 u; 623 if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */ 624 return 0xff3c; 625 else if (TRYMAP_DEC(jisx0208, u, data[0], data[1])) 626 return u; 627 else 628 return MAP_UNMAPPABLE; 629 } 630 631 static DBCHAR 632 jisx0208_encoder(const Py_UCS4 *data, Py_ssize_t *length) 633 { 634 DBCHAR coded; 635 assert(*length == 1); 636 if (*data < 0x10000) { 637 if (*data == 0xff3c) /* F/W REVERSE SOLIDUS */ 638 return 0x2140; 639 else if (TRYMAP_ENC(jisxcommon, coded, *data)) { 640 if (!(coded & 0x8000)) 641 return coded; 642 } 643 } 644 return MAP_UNMAPPABLE; 645 } 646 647 static int 648 jisx0212_init(void) 649 { 650 static int initialized = 0; 651 652 if (!initialized && ( 653 IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) || 654 IMPORT_MAP(jp, jisx0212, NULL, &jisx0212_decmap))) 655 return -1; 656 initialized = 1; 657 return 0; 658 } 659 660 static Py_UCS4 661 jisx0212_decoder(const unsigned char *data) 662 { 663 Py_UCS4 u; 664 if (TRYMAP_DEC(jisx0212, u, data[0], data[1])) 665 return u; 666 else 667 return MAP_UNMAPPABLE; 668 } 669 670 static DBCHAR 671 jisx0212_encoder(const Py_UCS4 *data, Py_ssize_t *length) 672 { 673 DBCHAR coded; 674 assert(*length == 1); 675 if (*data < 0x10000) { 676 if (TRYMAP_ENC(jisxcommon, coded, *data)) { 677 if (coded & 0x8000) 678 return coded & 0x7fff; 679 } 680 } 681 return MAP_UNMAPPABLE; 682 } 683 684 static int 685 jisx0213_init(void) 686 { 687 static int initialized = 0; 688 689 if (!initialized && ( 690 jisx0208_init() || 691 IMPORT_MAP(jp, jisx0213_bmp, 692 &jisx0213_bmp_encmap, NULL) || 693 IMPORT_MAP(jp, jisx0213_1_bmp, 694 NULL, &jisx0213_1_bmp_decmap) || 695 IMPORT_MAP(jp, jisx0213_2_bmp, 696 NULL, &jisx0213_2_bmp_decmap) || 697 IMPORT_MAP(jp, jisx0213_emp, 698 &jisx0213_emp_encmap, NULL) || 699 IMPORT_MAP(jp, jisx0213_1_emp, 700 NULL, &jisx0213_1_emp_decmap) || 701 IMPORT_MAP(jp, jisx0213_2_emp, 702 NULL, &jisx0213_2_emp_decmap) || 703 IMPORT_MAP(jp, jisx0213_pair, &jisx0213_pair_encmap, 704 &jisx0213_pair_decmap))) 705 return -1; 706 initialized = 1; 707 return 0; 708 } 709 710 #define config ((void *)2000) 711 static Py_UCS4 712 jisx0213_2000_1_decoder(const unsigned char *data) 713 { 714 Py_UCS4 u; 715 EMULATE_JISX0213_2000_DECODE_PLANE1(u, data[0], data[1]) 716 else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */ 717 return 0xff3c; 718 else if (TRYMAP_DEC(jisx0208, u, data[0], data[1])) 719 ; 720 else if (TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1])) 721 ; 722 else if (TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1])) 723 u |= 0x20000; 724 else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1])) 725 ; 726 else 727 return MAP_UNMAPPABLE; 728 return u; 729 } 730 731 static Py_UCS4 732 jisx0213_2000_2_decoder(const unsigned char *data) 733 { 734 Py_UCS4 u; 735 EMULATE_JISX0213_2000_DECODE_PLANE2_CHAR(u, data[0], data[1]) 736 if (TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1])) 737 ; 738 else if (TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1])) 739 u |= 0x20000; 740 else 741 return MAP_UNMAPPABLE; 742 return u; 743 } 744 #undef config 745 746 static Py_UCS4 747 jisx0213_2004_1_decoder(const unsigned char *data) 748 { 749 Py_UCS4 u; 750 if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */ 751 return 0xff3c; 752 else if (TRYMAP_DEC(jisx0208, u, data[0], data[1])) 753 ; 754 else if (TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1])) 755 ; 756 else if (TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1])) 757 u |= 0x20000; 758 else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1])) 759 ; 760 else 761 return MAP_UNMAPPABLE; 762 return u; 763 } 764 765 static Py_UCS4 766 jisx0213_2004_2_decoder(const unsigned char *data) 767 { 768 Py_UCS4 u; 769 if (TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1])) 770 ; 771 else if (TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1])) 772 u |= 0x20000; 773 else 774 return MAP_UNMAPPABLE; 775 return u; 776 } 777 778 static DBCHAR 779 jisx0213_encoder(const Py_UCS4 *data, Py_ssize_t *length, void *config) 780 { 781 DBCHAR coded; 782 783 switch (*length) { 784 case 1: /* first character */ 785 if (*data >= 0x10000) { 786 if ((*data) >> 16 == 0x20000 >> 16) { 787 EMULATE_JISX0213_2000_ENCODE_EMP(coded, *data) 788 else if (TRYMAP_ENC(jisx0213_emp, coded, (*data) & 0xffff)) 789 return coded; 790 } 791 return MAP_UNMAPPABLE; 792 } 793 794 EMULATE_JISX0213_2000_ENCODE_BMP(coded, *data) 795 else if (TRYMAP_ENC(jisx0213_bmp, coded, *data)) { 796 if (coded == MULTIC) 797 return MAP_MULTIPLE_AVAIL; 798 } 799 else if (TRYMAP_ENC(jisxcommon, coded, *data)) { 800 if (coded & 0x8000) 801 return MAP_UNMAPPABLE; 802 } 803 else 804 return MAP_UNMAPPABLE; 805 return coded; 806 807 case 2: /* second character of unicode pair */ 808 coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1], 809 jisx0213_pair_encmap, JISX0213_ENCPAIRS); 810 if (coded == DBCINV) { 811 *length = 1; 812 coded = find_pairencmap((ucs2_t)data[0], 0, 813 jisx0213_pair_encmap, JISX0213_ENCPAIRS); 814 if (coded == DBCINV) 815 return MAP_UNMAPPABLE; 816 } 817 else 818 return coded; 819 820 case -1: /* flush unterminated */ 821 *length = 1; 822 coded = find_pairencmap((ucs2_t)data[0], 0, 823 jisx0213_pair_encmap, JISX0213_ENCPAIRS); 824 if (coded == DBCINV) 825 return MAP_UNMAPPABLE; 826 else 827 return coded; 828 break; 829 830 default: 831 return MAP_UNMAPPABLE; 832 } 833 } 834 835 static DBCHAR 836 jisx0213_2000_1_encoder(const Py_UCS4 *data, Py_ssize_t *length) 837 { 838 DBCHAR coded = jisx0213_encoder(data, length, (void *)2000); 839 if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) 840 return coded; 841 else if (coded & 0x8000) 842 return MAP_UNMAPPABLE; 843 else 844 return coded; 845 } 846 847 static DBCHAR 848 jisx0213_2000_1_encoder_paironly(const Py_UCS4 *data, Py_ssize_t *length) 849 { 850 DBCHAR coded; 851 Py_ssize_t ilength = *length; 852 853 coded = jisx0213_encoder(data, length, (void *)2000); 854 switch (ilength) { 855 case 1: 856 if (coded == MAP_MULTIPLE_AVAIL) 857 return MAP_MULTIPLE_AVAIL; 858 else 859 return MAP_UNMAPPABLE; 860 case 2: 861 if (*length != 2) 862 return MAP_UNMAPPABLE; 863 else 864 return coded; 865 default: 866 return MAP_UNMAPPABLE; 867 } 868 } 869 870 static DBCHAR 871 jisx0213_2000_2_encoder(const Py_UCS4 *data, Py_ssize_t *length) 872 { 873 DBCHAR coded = jisx0213_encoder(data, length, (void *)2000); 874 if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) 875 return coded; 876 else if (coded & 0x8000) 877 return coded & 0x7fff; 878 else 879 return MAP_UNMAPPABLE; 880 } 881 882 static DBCHAR 883 jisx0213_2004_1_encoder(const Py_UCS4 *data, Py_ssize_t *length) 884 { 885 DBCHAR coded = jisx0213_encoder(data, length, NULL); 886 if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) 887 return coded; 888 else if (coded & 0x8000) 889 return MAP_UNMAPPABLE; 890 else 891 return coded; 892 } 893 894 static DBCHAR 895 jisx0213_2004_1_encoder_paironly(const Py_UCS4 *data, Py_ssize_t *length) 896 { 897 DBCHAR coded; 898 Py_ssize_t ilength = *length; 899 900 coded = jisx0213_encoder(data, length, NULL); 901 switch (ilength) { 902 case 1: 903 if (coded == MAP_MULTIPLE_AVAIL) 904 return MAP_MULTIPLE_AVAIL; 905 else 906 return MAP_UNMAPPABLE; 907 case 2: 908 if (*length != 2) 909 return MAP_UNMAPPABLE; 910 else 911 return coded; 912 default: 913 return MAP_UNMAPPABLE; 914 } 915 } 916 917 static DBCHAR 918 jisx0213_2004_2_encoder(const Py_UCS4 *data, Py_ssize_t *length) 919 { 920 DBCHAR coded = jisx0213_encoder(data, length, NULL); 921 if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) 922 return coded; 923 else if (coded & 0x8000) 924 return coded & 0x7fff; 925 else 926 return MAP_UNMAPPABLE; 927 } 928 929 static Py_UCS4 930 jisx0201_r_decoder(const unsigned char *data) 931 { 932 Py_UCS4 u; 933 JISX0201_R_DECODE_CHAR(*data, u) 934 else 935 return MAP_UNMAPPABLE; 936 return u; 937 } 938 939 static DBCHAR 940 jisx0201_r_encoder(const Py_UCS4 *data, Py_ssize_t *length) 941 { 942 DBCHAR coded; 943 JISX0201_R_ENCODE(*data, coded) 944 else 945 return MAP_UNMAPPABLE; 946 return coded; 947 } 948 949 static Py_UCS4 950 jisx0201_k_decoder(const unsigned char *data) 951 { 952 Py_UCS4 u; 953 JISX0201_K_DECODE_CHAR(*data ^ 0x80, u) 954 else 955 return MAP_UNMAPPABLE; 956 return u; 957 } 958 959 static DBCHAR 960 jisx0201_k_encoder(const Py_UCS4 *data, Py_ssize_t *length) 961 { 962 DBCHAR coded; 963 JISX0201_K_ENCODE(*data, coded) 964 else 965 return MAP_UNMAPPABLE; 966 return coded - 0x80; 967 } 968 969 static int 970 gb2312_init(void) 971 { 972 static int initialized = 0; 973 974 if (!initialized && ( 975 IMPORT_MAP(cn, gbcommon, &gbcommon_encmap, NULL) || 976 IMPORT_MAP(cn, gb2312, NULL, &gb2312_decmap))) 977 return -1; 978 initialized = 1; 979 return 0; 980 } 981 982 static Py_UCS4 983 gb2312_decoder(const unsigned char *data) 984 { 985 Py_UCS4 u; 986 if (TRYMAP_DEC(gb2312, u, data[0], data[1])) 987 return u; 988 else 989 return MAP_UNMAPPABLE; 990 } 991 992 static DBCHAR 993 gb2312_encoder(const Py_UCS4 *data, Py_ssize_t *length) 994 { 995 DBCHAR coded; 996 assert(*length == 1); 997 if (*data < 0x10000) { 998 if (TRYMAP_ENC(gbcommon, coded, *data)) { 999 if (!(coded & 0x8000)) 1000 return coded; 1001 } 1002 } 1003 return MAP_UNMAPPABLE; 1004 } 1005 1006 1007 static Py_UCS4 1008 dummy_decoder(const unsigned char *data) 1009 { 1010 return MAP_UNMAPPABLE; 1011 } 1012 1013 static DBCHAR 1014 dummy_encoder(const Py_UCS4 *data, Py_ssize_t *length) 1015 { 1016 return MAP_UNMAPPABLE; 1017 } 1018 1019 /*-*- registry tables -*-*/ 1020 1021 #define REGISTRY_KSX1001_G0 { CHARSET_KSX1001, 0, 2, \ 1022 ksx1001_init, \ 1023 ksx1001_decoder, ksx1001_encoder } 1024 #define REGISTRY_KSX1001_G1 { CHARSET_KSX1001, 1, 2, \ 1025 ksx1001_init, \ 1026 ksx1001_decoder, ksx1001_encoder } 1027 #define REGISTRY_JISX0201_R { CHARSET_JISX0201_R, 0, 1, \ 1028 NULL, \ 1029 jisx0201_r_decoder, jisx0201_r_encoder } 1030 #define REGISTRY_JISX0201_K { CHARSET_JISX0201_K, 0, 1, \ 1031 NULL, \ 1032 jisx0201_k_decoder, jisx0201_k_encoder } 1033 #define REGISTRY_JISX0208 { CHARSET_JISX0208, 0, 2, \ 1034 jisx0208_init, \ 1035 jisx0208_decoder, jisx0208_encoder } 1036 #define REGISTRY_JISX0208_O { CHARSET_JISX0208_O, 0, 2, \ 1037 jisx0208_init, \ 1038 jisx0208_decoder, jisx0208_encoder } 1039 #define REGISTRY_JISX0212 { CHARSET_JISX0212, 0, 2, \ 1040 jisx0212_init, \ 1041 jisx0212_decoder, jisx0212_encoder } 1042 #define REGISTRY_JISX0213_2000_1 { CHARSET_JISX0213_2000_1, 0, 2, \ 1043 jisx0213_init, \ 1044 jisx0213_2000_1_decoder, \ 1045 jisx0213_2000_1_encoder } 1046 #define REGISTRY_JISX0213_2000_1_PAIRONLY { CHARSET_JISX0213_2000_1, 0, 2, \ 1047 jisx0213_init, \ 1048 jisx0213_2000_1_decoder, \ 1049 jisx0213_2000_1_encoder_paironly } 1050 #define REGISTRY_JISX0213_2000_2 { CHARSET_JISX0213_2, 0, 2, \ 1051 jisx0213_init, \ 1052 jisx0213_2000_2_decoder, \ 1053 jisx0213_2000_2_encoder } 1054 #define REGISTRY_JISX0213_2004_1 { CHARSET_JISX0213_2004_1, 0, 2, \ 1055 jisx0213_init, \ 1056 jisx0213_2004_1_decoder, \ 1057 jisx0213_2004_1_encoder } 1058 #define REGISTRY_JISX0213_2004_1_PAIRONLY { CHARSET_JISX0213_2004_1, 0, 2, \ 1059 jisx0213_init, \ 1060 jisx0213_2004_1_decoder, \ 1061 jisx0213_2004_1_encoder_paironly } 1062 #define REGISTRY_JISX0213_2004_2 { CHARSET_JISX0213_2, 0, 2, \ 1063 jisx0213_init, \ 1064 jisx0213_2004_2_decoder, \ 1065 jisx0213_2004_2_encoder } 1066 #define REGISTRY_GB2312 { CHARSET_GB2312, 0, 2, \ 1067 gb2312_init, \ 1068 gb2312_decoder, gb2312_encoder } 1069 #define REGISTRY_CNS11643_1 { CHARSET_CNS11643_1, 1, 2, \ 1070 cns11643_init, \ 1071 cns11643_1_decoder, cns11643_1_encoder } 1072 #define REGISTRY_CNS11643_2 { CHARSET_CNS11643_2, 2, 2, \ 1073 cns11643_init, \ 1074 cns11643_2_decoder, cns11643_2_encoder } 1075 #define REGISTRY_ISO8859_1 { CHARSET_ISO8859_1, 2, 1, \ 1076 NULL, dummy_decoder, dummy_encoder } 1077 #define REGISTRY_ISO8859_7 { CHARSET_ISO8859_7, 2, 1, \ 1078 NULL, dummy_decoder, dummy_encoder } 1079 #define REGISTRY_SENTINEL { 0, } 1080 #define CONFIGDEF(var, attrs) \ 1081 static const struct iso2022_config iso2022_##var##_config = { \ 1082 attrs, iso2022_##var##_designations \ 1083 }; 1084 1085 static const struct iso2022_designation iso2022_kr_designations[] = { 1086 REGISTRY_KSX1001_G1, REGISTRY_SENTINEL 1087 }; 1088 CONFIGDEF(kr, 0) 1089 1090 static const struct iso2022_designation iso2022_jp_designations[] = { 1091 REGISTRY_JISX0208, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O, 1092 REGISTRY_SENTINEL 1093 }; 1094 CONFIGDEF(jp, NO_SHIFT | USE_JISX0208_EXT) 1095 1096 static const struct iso2022_designation iso2022_jp_1_designations[] = { 1097 REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R, 1098 REGISTRY_JISX0208_O, REGISTRY_SENTINEL 1099 }; 1100 CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT) 1101 1102 static const struct iso2022_designation iso2022_jp_2_designations[] = { 1103 REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0, 1104 REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O, 1105 REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL 1106 }; 1107 CONFIGDEF(jp_2, NO_SHIFT | USE_G2 | USE_JISX0208_EXT) 1108 1109 static const struct iso2022_designation iso2022_jp_2004_designations[] = { 1110 REGISTRY_JISX0213_2004_1_PAIRONLY, REGISTRY_JISX0208, 1111 REGISTRY_JISX0213_2004_1, REGISTRY_JISX0213_2004_2, REGISTRY_SENTINEL 1112 }; 1113 CONFIGDEF(jp_2004, NO_SHIFT | USE_JISX0208_EXT) 1114 1115 static const struct iso2022_designation iso2022_jp_3_designations[] = { 1116 REGISTRY_JISX0213_2000_1_PAIRONLY, REGISTRY_JISX0208, 1117 REGISTRY_JISX0213_2000_1, REGISTRY_JISX0213_2000_2, REGISTRY_SENTINEL 1118 }; 1119 CONFIGDEF(jp_3, NO_SHIFT | USE_JISX0208_EXT) 1120 1121 static const struct iso2022_designation iso2022_jp_ext_designations[] = { 1122 REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R, 1123 REGISTRY_JISX0201_K, REGISTRY_JISX0208_O, REGISTRY_SENTINEL 1124 }; 1125 CONFIGDEF(jp_ext, NO_SHIFT | USE_JISX0208_EXT) 1126 1127 1128 BEGIN_MAPPINGS_LIST 1129 /* no mapping table here */ 1130 END_MAPPINGS_LIST 1131 1132 #define ISO2022_CODEC(variation) { \ 1133 "iso2022_" #variation, \ 1134 &iso2022_##variation##_config, \ 1135 iso2022_codec_init, \ 1136 _STATEFUL_METHODS(iso2022) \ 1137 }, 1138 1139 BEGIN_CODECS_LIST 1140 ISO2022_CODEC(kr) 1141 ISO2022_CODEC(jp) 1142 ISO2022_CODEC(jp_1) 1143 ISO2022_CODEC(jp_2) 1144 ISO2022_CODEC(jp_2004) 1145 ISO2022_CODEC(jp_3) 1146 ISO2022_CODEC(jp_ext) 1147 END_CODECS_LIST 1148 1149 I_AM_A_MODULE_FOR(iso2022) 1150