1 /* 2 * _codecs_jp.c: Codecs collection for Japanese encodings 3 * 4 * Written by Hye-Shik Chang <perky (at) FreeBSD.org> 5 */ 6 7 #define USING_BINARY_PAIR_SEARCH 8 #define EMPBASE 0x20000 9 10 #include "cjkcodecs.h" 11 #include "mappings_jp.h" 12 #include "mappings_jisx0213_pair.h" 13 #include "alg_jisx0201.h" 14 #include "emu_jisx0213_2000.h" 15 16 /* 17 * CP932 codec 18 */ 19 20 ENCODER(cp932) 21 { 22 while (inleft > 0) { 23 Py_UNICODE c = IN1; 24 DBCHAR code; 25 unsigned char c1, c2; 26 27 if (c <= 0x80) { 28 WRITE1((unsigned char)c) 29 NEXT(1, 1) 30 continue; 31 } 32 else if (c >= 0xff61 && c <= 0xff9f) { 33 WRITE1(c - 0xfec0) 34 NEXT(1, 1) 35 continue; 36 } 37 else if (c >= 0xf8f0 && c <= 0xf8f3) { 38 /* Windows compatibility */ 39 REQUIRE_OUTBUF(1) 40 if (c == 0xf8f0) 41 OUT1(0xa0) 42 else 43 OUT1(c - 0xfef1 + 0xfd) 44 NEXT(1, 1) 45 continue; 46 } 47 48 UCS4INVALID(c) 49 REQUIRE_OUTBUF(2) 50 51 TRYMAP_ENC(cp932ext, code, c) { 52 OUT1(code >> 8) 53 OUT2(code & 0xff) 54 } 55 else TRYMAP_ENC(jisxcommon, code, c) { 56 if (code & 0x8000) /* MSB set: JIS X 0212 */ 57 return 1; 58 59 /* JIS X 0208 */ 60 c1 = code >> 8; 61 c2 = code & 0xff; 62 c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21); 63 c1 = (c1 - 0x21) >> 1; 64 OUT1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1) 65 OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41) 66 } 67 else if (c >= 0xe000 && c < 0xe758) { 68 /* User-defined area */ 69 c1 = (Py_UNICODE)(c - 0xe000) / 188; 70 c2 = (Py_UNICODE)(c - 0xe000) % 188; 71 OUT1(c1 + 0xf0) 72 OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41) 73 } 74 else 75 return 1; 76 77 NEXT(1, 2) 78 } 79 80 return 0; 81 } 82 83 DECODER(cp932) 84 { 85 while (inleft > 0) { 86 unsigned char c = IN1, c2; 87 88 REQUIRE_OUTBUF(1) 89 if (c <= 0x80) { 90 OUT1(c) 91 NEXT(1, 1) 92 continue; 93 } 94 else if (c >= 0xa0 && c <= 0xdf) { 95 if (c == 0xa0) 96 OUT1(0xf8f0) /* half-width katakana */ 97 else 98 OUT1(0xfec0 + c) 99 NEXT(1, 1) 100 continue; 101 } 102 else if (c >= 0xfd/* && c <= 0xff*/) { 103 /* Windows compatibility */ 104 OUT1(0xf8f1 - 0xfd + c) 105 NEXT(1, 1) 106 continue; 107 } 108 109 REQUIRE_INBUF(2) 110 c2 = IN2; 111 112 TRYMAP_DEC(cp932ext, **outbuf, c, c2); 113 else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){ 114 if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc) 115 return 2; 116 117 c = (c < 0xe0 ? c - 0x81 : c - 0xc1); 118 c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41); 119 c = (2 * c + (c2 < 0x5e ? 0 : 1) + 0x21); 120 c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21; 121 122 TRYMAP_DEC(jisx0208, **outbuf, c, c2); 123 else return 2; 124 } 125 else if (c >= 0xf0 && c <= 0xf9) { 126 if ((c2 >= 0x40 && c2 <= 0x7e) || 127 (c2 >= 0x80 && c2 <= 0xfc)) 128 OUT1(0xe000 + 188 * (c - 0xf0) + 129 (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41)) 130 else 131 return 2; 132 } 133 else 134 return 2; 135 136 NEXT(2, 1) 137 } 138 139 return 0; 140 } 141 142 143 /* 144 * EUC-JIS-2004 codec 145 */ 146 147 ENCODER(euc_jis_2004) 148 { 149 while (inleft > 0) { 150 ucs4_t c = IN1; 151 DBCHAR code; 152 Py_ssize_t insize; 153 154 if (c < 0x80) { 155 WRITE1(c) 156 NEXT(1, 1) 157 continue; 158 } 159 160 DECODE_SURROGATE(c) 161 insize = GET_INSIZE(c); 162 163 if (c <= 0xFFFF) { 164 EMULATE_JISX0213_2000_ENCODE_BMP(code, c) 165 else TRYMAP_ENC(jisx0213_bmp, code, c) { 166 if (code == MULTIC) { 167 if (inleft < 2) { 168 if (flags & MBENC_FLUSH) { 169 code = find_pairencmap( 170 (ucs2_t)c, 0, 171 jisx0213_pair_encmap, 172 JISX0213_ENCPAIRS); 173 if (code == DBCINV) 174 return 1; 175 } 176 else 177 return MBERR_TOOFEW; 178 } 179 else { 180 code = find_pairencmap( 181 (ucs2_t)c, (*inbuf)[1], 182 jisx0213_pair_encmap, 183 JISX0213_ENCPAIRS); 184 if (code == DBCINV) { 185 code = find_pairencmap( 186 (ucs2_t)c, 0, 187 jisx0213_pair_encmap, 188 JISX0213_ENCPAIRS); 189 if (code == DBCINV) 190 return 1; 191 } else 192 insize = 2; 193 } 194 } 195 } 196 else TRYMAP_ENC(jisxcommon, code, c); 197 else if (c >= 0xff61 && c <= 0xff9f) { 198 /* JIS X 0201 half-width katakana */ 199 WRITE2(0x8e, c - 0xfec0) 200 NEXT(1, 2) 201 continue; 202 } 203 else if (c == 0xff3c) 204 /* F/W REVERSE SOLIDUS (see NOTES) */ 205 code = 0x2140; 206 else if (c == 0xff5e) 207 /* F/W TILDE (see NOTES) */ 208 code = 0x2232; 209 else 210 return 1; 211 } 212 else if (c >> 16 == EMPBASE >> 16) { 213 EMULATE_JISX0213_2000_ENCODE_EMP(code, c) 214 else TRYMAP_ENC(jisx0213_emp, code, c & 0xffff); 215 else return insize; 216 } 217 else 218 return insize; 219 220 if (code & 0x8000) { 221 /* Codeset 2 */ 222 WRITE3(0x8f, code >> 8, (code & 0xFF) | 0x80) 223 NEXT(insize, 3) 224 } else { 225 /* Codeset 1 */ 226 WRITE2((code >> 8) | 0x80, (code & 0xFF) | 0x80) 227 NEXT(insize, 2) 228 } 229 } 230 231 return 0; 232 } 233 234 DECODER(euc_jis_2004) 235 { 236 while (inleft > 0) { 237 unsigned char c = IN1; 238 ucs4_t code; 239 240 REQUIRE_OUTBUF(1) 241 242 if (c < 0x80) { 243 OUT1(c) 244 NEXT(1, 1) 245 continue; 246 } 247 248 if (c == 0x8e) { 249 /* JIS X 0201 half-width katakana */ 250 unsigned char c2; 251 252 REQUIRE_INBUF(2) 253 c2 = IN2; 254 if (c2 >= 0xa1 && c2 <= 0xdf) { 255 OUT1(0xfec0 + c2) 256 NEXT(2, 1) 257 } 258 else 259 return 2; 260 } 261 else if (c == 0x8f) { 262 unsigned char c2, c3; 263 264 REQUIRE_INBUF(3) 265 c2 = IN2 ^ 0x80; 266 c3 = IN3 ^ 0x80; 267 268 /* JIS X 0213 Plane 2 or JIS X 0212 (see NOTES) */ 269 EMULATE_JISX0213_2000_DECODE_PLANE2(**outbuf, c2, c3) 270 else TRYMAP_DEC(jisx0213_2_bmp, **outbuf, c2, c3) ; 271 else TRYMAP_DEC(jisx0213_2_emp, code, c2, c3) { 272 WRITEUCS4(EMPBASE | code) 273 NEXT_IN(3) 274 continue; 275 } 276 else TRYMAP_DEC(jisx0212, **outbuf, c2, c3) ; 277 else return 3; 278 NEXT(3, 1) 279 } 280 else { 281 unsigned char c2; 282 283 REQUIRE_INBUF(2) 284 c ^= 0x80; 285 c2 = IN2 ^ 0x80; 286 287 /* JIS X 0213 Plane 1 */ 288 EMULATE_JISX0213_2000_DECODE_PLANE1(**outbuf, c, c2) 289 else if (c == 0x21 && c2 == 0x40) **outbuf = 0xff3c; 290 else if (c == 0x22 && c2 == 0x32) **outbuf = 0xff5e; 291 else TRYMAP_DEC(jisx0208, **outbuf, c, c2); 292 else TRYMAP_DEC(jisx0213_1_bmp, **outbuf, c, c2); 293 else TRYMAP_DEC(jisx0213_1_emp, code, c, c2) { 294 WRITEUCS4(EMPBASE | code) 295 NEXT_IN(2) 296 continue; 297 } 298 else TRYMAP_DEC(jisx0213_pair, code, c, c2) { 299 WRITE2(code >> 16, code & 0xffff) 300 NEXT(2, 2) 301 continue; 302 } 303 else return 2; 304 NEXT(2, 1) 305 } 306 } 307 308 return 0; 309 } 310 311 312 /* 313 * EUC-JP codec 314 */ 315 316 ENCODER(euc_jp) 317 { 318 while (inleft > 0) { 319 Py_UNICODE c = IN1; 320 DBCHAR code; 321 322 if (c < 0x80) { 323 WRITE1((unsigned char)c) 324 NEXT(1, 1) 325 continue; 326 } 327 328 UCS4INVALID(c) 329 330 TRYMAP_ENC(jisxcommon, code, c); 331 else if (c >= 0xff61 && c <= 0xff9f) { 332 /* JIS X 0201 half-width katakana */ 333 WRITE2(0x8e, c - 0xfec0) 334 NEXT(1, 2) 335 continue; 336 } 337 #ifndef STRICT_BUILD 338 else if (c == 0xff3c) /* FULL-WIDTH REVERSE SOLIDUS */ 339 code = 0x2140; 340 else if (c == 0xa5) { /* YEN SIGN */ 341 WRITE1(0x5c); 342 NEXT(1, 1) 343 continue; 344 } else if (c == 0x203e) { /* OVERLINE */ 345 WRITE1(0x7e); 346 NEXT(1, 1) 347 continue; 348 } 349 #endif 350 else 351 return 1; 352 353 if (code & 0x8000) { 354 /* JIS X 0212 */ 355 WRITE3(0x8f, code >> 8, (code & 0xFF) | 0x80) 356 NEXT(1, 3) 357 } else { 358 /* JIS X 0208 */ 359 WRITE2((code >> 8) | 0x80, (code & 0xFF) | 0x80) 360 NEXT(1, 2) 361 } 362 } 363 364 return 0; 365 } 366 367 DECODER(euc_jp) 368 { 369 while (inleft > 0) { 370 unsigned char c = IN1; 371 372 REQUIRE_OUTBUF(1) 373 374 if (c < 0x80) { 375 OUT1(c) 376 NEXT(1, 1) 377 continue; 378 } 379 380 if (c == 0x8e) { 381 /* JIS X 0201 half-width katakana */ 382 unsigned char c2; 383 384 REQUIRE_INBUF(2) 385 c2 = IN2; 386 if (c2 >= 0xa1 && c2 <= 0xdf) { 387 OUT1(0xfec0 + c2) 388 NEXT(2, 1) 389 } 390 else 391 return 2; 392 } 393 else if (c == 0x8f) { 394 unsigned char c2, c3; 395 396 REQUIRE_INBUF(3) 397 c2 = IN2; 398 c3 = IN3; 399 /* JIS X 0212 */ 400 TRYMAP_DEC(jisx0212, **outbuf, c2 ^ 0x80, c3 ^ 0x80) { 401 NEXT(3, 1) 402 } 403 else 404 return 3; 405 } 406 else { 407 unsigned char c2; 408 409 REQUIRE_INBUF(2) 410 c2 = IN2; 411 /* JIS X 0208 */ 412 #ifndef STRICT_BUILD 413 if (c == 0xa1 && c2 == 0xc0) 414 /* FULL-WIDTH REVERSE SOLIDUS */ 415 **outbuf = 0xff3c; 416 else 417 #endif 418 TRYMAP_DEC(jisx0208, **outbuf, 419 c ^ 0x80, c2 ^ 0x80) ; 420 else return 2; 421 NEXT(2, 1) 422 } 423 } 424 425 return 0; 426 } 427 428 429 /* 430 * SHIFT_JIS codec 431 */ 432 433 ENCODER(shift_jis) 434 { 435 while (inleft > 0) { 436 Py_UNICODE c = IN1; 437 DBCHAR code; 438 unsigned char c1, c2; 439 440 #ifdef STRICT_BUILD 441 JISX0201_R_ENCODE(c, code) 442 #else 443 if (c < 0x80) code = c; 444 else if (c == 0x00a5) code = 0x5c; /* YEN SIGN */ 445 else if (c == 0x203e) code = 0x7e; /* OVERLINE */ 446 #endif 447 else JISX0201_K_ENCODE(c, code) 448 else UCS4INVALID(c) 449 else code = NOCHAR; 450 451 if (code < 0x80 || (code >= 0xa1 && code <= 0xdf)) { 452 REQUIRE_OUTBUF(1) 453 454 OUT1((unsigned char)code) 455 NEXT(1, 1) 456 continue; 457 } 458 459 REQUIRE_OUTBUF(2) 460 461 if (code == NOCHAR) { 462 TRYMAP_ENC(jisxcommon, code, c); 463 #ifndef STRICT_BUILD 464 else if (c == 0xff3c) 465 code = 0x2140; /* FULL-WIDTH REVERSE SOLIDUS */ 466 #endif 467 else 468 return 1; 469 470 if (code & 0x8000) /* MSB set: JIS X 0212 */ 471 return 1; 472 } 473 474 c1 = code >> 8; 475 c2 = code & 0xff; 476 c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21); 477 c1 = (c1 - 0x21) >> 1; 478 OUT1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1) 479 OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41) 480 NEXT(1, 2) 481 } 482 483 return 0; 484 } 485 486 DECODER(shift_jis) 487 { 488 while (inleft > 0) { 489 unsigned char c = IN1; 490 491 REQUIRE_OUTBUF(1) 492 493 #ifdef STRICT_BUILD 494 JISX0201_R_DECODE(c, **outbuf) 495 #else 496 if (c < 0x80) **outbuf = c; 497 #endif 498 else JISX0201_K_DECODE(c, **outbuf) 499 else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){ 500 unsigned char c1, c2; 501 502 REQUIRE_INBUF(2) 503 c2 = IN2; 504 if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc) 505 return 2; 506 507 c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1); 508 c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41); 509 c1 = (2 * c1 + (c2 < 0x5e ? 0 : 1) + 0x21); 510 c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21; 511 512 #ifndef STRICT_BUILD 513 if (c1 == 0x21 && c2 == 0x40) { 514 /* FULL-WIDTH REVERSE SOLIDUS */ 515 OUT1(0xff3c) 516 NEXT(2, 1) 517 continue; 518 } 519 #endif 520 TRYMAP_DEC(jisx0208, **outbuf, c1, c2) { 521 NEXT(2, 1) 522 continue; 523 } 524 else 525 return 2; 526 } 527 else 528 return 2; 529 530 NEXT(1, 1) /* JIS X 0201 */ 531 } 532 533 return 0; 534 } 535 536 537 /* 538 * SHIFT_JIS-2004 codec 539 */ 540 541 ENCODER(shift_jis_2004) 542 { 543 while (inleft > 0) { 544 ucs4_t c = IN1; 545 DBCHAR code = NOCHAR; 546 int c1, c2; 547 Py_ssize_t insize; 548 549 JISX0201_ENCODE(c, code) 550 else DECODE_SURROGATE(c) 551 552 if (code < 0x80 || (code >= 0xa1 && code <= 0xdf)) { 553 WRITE1((unsigned char)code) 554 NEXT(1, 1) 555 continue; 556 } 557 558 REQUIRE_OUTBUF(2) 559 insize = GET_INSIZE(c); 560 561 if (code == NOCHAR) { 562 if (c <= 0xffff) { 563 EMULATE_JISX0213_2000_ENCODE_BMP(code, c) 564 else TRYMAP_ENC(jisx0213_bmp, code, c) { 565 if (code == MULTIC) { 566 if (inleft < 2) { 567 if (flags & MBENC_FLUSH) { 568 code = find_pairencmap 569 ((ucs2_t)c, 0, 570 jisx0213_pair_encmap, 571 JISX0213_ENCPAIRS); 572 if (code == DBCINV) 573 return 1; 574 } 575 else 576 return MBERR_TOOFEW; 577 } 578 else { 579 code = find_pairencmap( 580 (ucs2_t)c, IN2, 581 jisx0213_pair_encmap, 582 JISX0213_ENCPAIRS); 583 if (code == DBCINV) { 584 code = find_pairencmap( 585 (ucs2_t)c, 0, 586 jisx0213_pair_encmap, 587 JISX0213_ENCPAIRS); 588 if (code == DBCINV) 589 return 1; 590 } 591 else 592 insize = 2; 593 } 594 } 595 } 596 else TRYMAP_ENC(jisxcommon, code, c) { 597 /* abandon JIS X 0212 codes */ 598 if (code & 0x8000) 599 return 1; 600 } 601 else return 1; 602 } 603 else if (c >> 16 == EMPBASE >> 16) { 604 EMULATE_JISX0213_2000_ENCODE_EMP(code, c) 605 else TRYMAP_ENC(jisx0213_emp, code, c&0xffff); 606 else return insize; 607 } 608 else 609 return insize; 610 } 611 612 c1 = code >> 8; 613 c2 = (code & 0xff) - 0x21; 614 615 if (c1 & 0x80) { /* Plane 2 */ 616 if (c1 >= 0xee) c1 -= 0x87; 617 else if (c1 >= 0xac || c1 == 0xa8) c1 -= 0x49; 618 else c1 -= 0x43; 619 } 620 else /* Plane 1 */ 621 c1 -= 0x21; 622 623 if (c1 & 1) c2 += 0x5e; 624 c1 >>= 1; 625 OUT1(c1 + (c1 < 0x1f ? 0x81 : 0xc1)) 626 OUT2(c2 + (c2 < 0x3f ? 0x40 : 0x41)) 627 628 NEXT(insize, 2) 629 } 630 631 return 0; 632 } 633 634 DECODER(shift_jis_2004) 635 { 636 while (inleft > 0) { 637 unsigned char c = IN1; 638 639 REQUIRE_OUTBUF(1) 640 JISX0201_DECODE(c, **outbuf) 641 else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xfc)){ 642 unsigned char c1, c2; 643 ucs4_t code; 644 645 REQUIRE_INBUF(2) 646 c2 = IN2; 647 if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc) 648 return 2; 649 650 c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1); 651 c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41); 652 c1 = (2 * c1 + (c2 < 0x5e ? 0 : 1)); 653 c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21; 654 655 if (c1 < 0x5e) { /* Plane 1 */ 656 c1 += 0x21; 657 EMULATE_JISX0213_2000_DECODE_PLANE1(**outbuf, 658 c1, c2) 659 else TRYMAP_DEC(jisx0208, **outbuf, c1, c2) { 660 NEXT_OUT(1) 661 } 662 else TRYMAP_DEC(jisx0213_1_bmp, **outbuf, 663 c1, c2) { 664 NEXT_OUT(1) 665 } 666 else TRYMAP_DEC(jisx0213_1_emp, code, c1, c2) { 667 WRITEUCS4(EMPBASE | code) 668 } 669 else TRYMAP_DEC(jisx0213_pair, code, c1, c2) { 670 WRITE2(code >> 16, code & 0xffff) 671 NEXT_OUT(2) 672 } 673 else 674 return 2; 675 NEXT_IN(2) 676 } 677 else { /* Plane 2 */ 678 if (c1 >= 0x67) c1 += 0x07; 679 else if (c1 >= 0x63 || c1 == 0x5f) c1 -= 0x37; 680 else c1 -= 0x3d; 681 682 EMULATE_JISX0213_2000_DECODE_PLANE2(**outbuf, 683 c1, c2) 684 else TRYMAP_DEC(jisx0213_2_bmp, **outbuf, 685 c1, c2) ; 686 else TRYMAP_DEC(jisx0213_2_emp, code, c1, c2) { 687 WRITEUCS4(EMPBASE | code) 688 NEXT_IN(2) 689 continue; 690 } 691 else 692 return 2; 693 NEXT(2, 1) 694 } 695 continue; 696 } 697 else 698 return 2; 699 700 NEXT(1, 1) /* JIS X 0201 */ 701 } 702 703 return 0; 704 } 705 706 707 BEGIN_MAPPINGS_LIST 708 MAPPING_DECONLY(jisx0208) 709 MAPPING_DECONLY(jisx0212) 710 MAPPING_ENCONLY(jisxcommon) 711 MAPPING_DECONLY(jisx0213_1_bmp) 712 MAPPING_DECONLY(jisx0213_2_bmp) 713 MAPPING_ENCONLY(jisx0213_bmp) 714 MAPPING_DECONLY(jisx0213_1_emp) 715 MAPPING_DECONLY(jisx0213_2_emp) 716 MAPPING_ENCONLY(jisx0213_emp) 717 MAPPING_ENCDEC(jisx0213_pair) 718 MAPPING_ENCDEC(cp932ext) 719 END_MAPPINGS_LIST 720 721 BEGIN_CODECS_LIST 722 CODEC_STATELESS(shift_jis) 723 CODEC_STATELESS(cp932) 724 CODEC_STATELESS(euc_jp) 725 CODEC_STATELESS(shift_jis_2004) 726 CODEC_STATELESS(euc_jis_2004) 727 { "euc_jisx0213", (void *)2000, NULL, _STATELESS_METHODS(euc_jis_2004) }, 728 { "shift_jisx0213", (void *)2000, NULL, _STATELESS_METHODS(shift_jis_2004) }, 729 END_CODECS_LIST 730 731 I_AM_A_MODULE_FOR(jp) 732