Home | History | Annotate | Download | only in cjkcodecs
      1 /*
      2  * _codecs_iso2022.c: Codecs collection for ISO-2022 encodings.
      3  *
      4  * Written by Hye-Shik Chang <perky (at) FreeBSD.org>
      5  */
      6 
      7 #define USING_IMPORTED_MAPS
      8 #define USING_BINARY_PAIR_SEARCH
      9 #define EXTERN_JISX0213_PAIR
     10 #define EMULATE_JISX0213_2000_ENCODE_INVALID MAP_UNMAPPABLE
     11 #define EMULATE_JISX0213_2000_DECODE_INVALID MAP_UNMAPPABLE
     12 
     13 #include "cjkcodecs.h"
     14 #include "alg_jisx0201.h"
     15 #include "emu_jisx0213_2000.h"
     16 #include "mappings_jisx0213_pair.h"
     17 
     18 /* STATE
     19 
     20    state->c[0-3]
     21 
     22     00000000
     23     ||^^^^^|
     24     |+-----+----  G0-3 Character Set
     25     +-----------  Is G0-3 double byte?
     26 
     27    state->c[4]
     28 
     29     00000000
     30           ||
     31           |+----  Locked-Shift?
     32           +-----  ESC Throughout
     33 */
     34 
/* Control bytes the codec treats specially.  ESC introduces an escape
 * sequence, SO/SI set/clear the F_SHIFTED state flag in the decoder, and
 * LF additionally clears F_SHIFTED when decoding. */
#define ESC                     0x1B
#define SO                      0x0E
#define SI                      0x0F
#define LF                      0x0A

/* Upper bound on the number of bytes iso2022processesc() scans while
 * looking for the final byte of an escape sequence. */
#define MAX_ESCSEQLEN           16
     41 
/* Character-set identifiers stored in state->c[0-3].  The low 7 bits are
 * the final byte of the designating escape sequence. */
#define CHARSET_ISO8859_1       'A'
#define CHARSET_ASCII           'B'
#define CHARSET_ISO8859_7       'F'
#define CHARSET_JISX0201_K      'I'
#define CHARSET_JISX0201_R      'J'

/* Double-byte sets: same final byte, tagged with CHARSET_DBCS so that e.g.
 * GB2312 ('A') does not collide with ISO-8859-1 ('A'). */
#define CHARSET_GB2312          ('A'|CHARSET_DBCS)
#define CHARSET_JISX0208        ('B'|CHARSET_DBCS)
#define CHARSET_KSX1001         ('C'|CHARSET_DBCS)
#define CHARSET_JISX0212        ('D'|CHARSET_DBCS)
#define CHARSET_GB2312_8565     ('E'|CHARSET_DBCS)
#define CHARSET_CNS11643_1      ('G'|CHARSET_DBCS)
#define CHARSET_CNS11643_2      ('H'|CHARSET_DBCS)
#define CHARSET_JISX0213_2000_1 ('O'|CHARSET_DBCS)
#define CHARSET_JISX0213_2      ('P'|CHARSET_DBCS)
#define CHARSET_JISX0213_2004_1 ('Q'|CHARSET_DBCS)
#define CHARSET_JISX0208_O      ('@'|CHARSET_DBCS)

/* High bit marks a double-byte set; ESCMARK() strips it again to recover
 * the byte that is written in the escape sequence. */
#define CHARSET_DBCS            0x80
#define ESCMARK(mark)           ((mark) & 0x7f)
     62 
/* True when `c` can terminate an escape sequence ('A'-'Z' or '@').
 * Note: evaluates `c` more than once. */
#define IS_ESCEND(c)    (((c) >= 'A' && (c) <= 'Z') || (c) == '@')
/* True when `c2`, the byte following ESC, starts an escape sequence that
 * the decoder hands to iso2022processesc(). */
#define IS_ISO2022ESC(c2) \
        ((c2) == '(' || (c2) == ')' || (c2) == '$' || \
         (c2) == '.' || (c2) == '&')
    /* This is not a complete list of ISO-2022 escape sequence headers,
     * but it is enough to implement the CJK instances of ISO-2022. */
     69 
/* Sentinel results of the per-charset encoder/decoder callbacks:
 * MAP_UNMAPPABLE     - the character has no mapping in that charset;
 * MAP_MULTIPLE_AVAIL - the encoder needs to see the following character
 *                      as well before it can decide (JIS X 0213 pairs). */
#define MAP_UNMAPPABLE          0xFFFF
#define MAP_MULTIPLE_AVAIL      0xFFFE /* for JIS X 0213 */

/* Bits of state->c[4] (see the STATE diagram above). */
#define F_SHIFTED               0x01
#define F_ESCTHROUGHOUT         0x02
     75 
/* Accessors for the charset designated to G0..G3, kept in state->c[0-3].
 * All of these expect a variable named `state` in the enclosing scope. */
#define STATE_SETG(dn, v)       do { ((state)->c[dn]) = (v); } while (0)
#define STATE_GETG(dn)          ((state)->c[dn])

#define STATE_G0                STATE_GETG(0)
#define STATE_G1                STATE_GETG(1)
#define STATE_G2                STATE_GETG(2)
#define STATE_G3                STATE_GETG(3)
#define STATE_SETG0(v)          STATE_SETG(0, v)
#define STATE_SETG1(v)          STATE_SETG(1, v)
#define STATE_SETG2(v)          STATE_SETG(2, v)
#define STATE_SETG3(v)          STATE_SETG(3, v)

/* Accessors for the F_* flag bits kept in state->c[4]. */
#define STATE_SETFLAG(f)        do { ((state)->c[4]) |= (f); } while (0)
#define STATE_GETFLAG(f)        ((state)->c[4] & (f))
#define STATE_CLEARFLAG(f)      do { ((state)->c[4]) &= ~(f); } while (0)
#define STATE_CLEARFLAGS()      do { ((state)->c[4]) = 0; } while (0)
     92 
/* View the opaque `config` pointer (expected in the enclosing scope) as the
 * per-encoding struct iso2022_config and read its fields. */
#define ISO2022_CONFIG          ((const struct iso2022_config *)config)
#define CONFIG_ISSET(flag)      (ISO2022_CONFIG->flags & (flag))
#define CONFIG_DESIGNATIONS     (ISO2022_CONFIG->designations)

/* iso2022_config.flags */
#define NO_SHIFT                0x01    /* decoder passes SI/SO through */
#define USE_G2                  0x02    /* accept G2 designation and SS2 */
#define USE_JISX0208_EXT        0x04    /* accept the ESC & @ ESC $ B form */
    101 
    102 /*-*- internal data structures -*-*/
    103 
/* Callback types stored in struct iso2022_designation.
 *
 * init:   called from CODEC_INIT; a non-zero result is treated as failure.
 * decode: maps the byte(s) at `data` to a code point, or MAP_UNMAPPABLE.
 * encode: maps data[0] (and, when *length == 2, data[1]) to a code, or to
 *         MAP_UNMAPPABLE / MAP_MULTIPLE_AVAIL; may update *length. */
typedef int (*iso2022_init_func)(void);
typedef Py_UCS4 (*iso2022_decode_func)(const unsigned char *data);
typedef DBCHAR (*iso2022_encode_func)(const Py_UCS4 *data, Py_ssize_t *length);
    107 
/* One character set an encoding may designate. */
struct iso2022_designation {
    unsigned char mark;             /* CHARSET_* id; 0 terminates a table */
    unsigned char plane;            /* 0 = G0, 1 = G1 (encoder supports only these) */
    unsigned char width;            /* bytes per character: 1 or 2 */
    iso2022_init_func initializer;  /* may be NULL */
    iso2022_decode_func decoder;
    iso2022_encode_func encoder;
};
    116 
/* Per-encoding configuration reached through the codec's `config` pointer. */
struct iso2022_config {
    int flags;                      /* NO_SHIFT | USE_G2 | USE_JISX0208_EXT */
    const struct iso2022_designation *designations; /* non-ascii desigs,
                                                     * ended by mark == 0 */
};
    121 
    122 /*-*- iso-2022 codec implementation -*-*/
    123 
    124 CODEC_INIT(iso2022)
    125 {
    126     const struct iso2022_designation *desig;
    127     for (desig = CONFIG_DESIGNATIONS; desig->mark; desig++)
    128         if (desig->initializer != NULL && desig->initializer() != 0)
    129             return -1;
    130     return 0;
    131 }
    132 
    133 ENCODER_INIT(iso2022)
    134 {
    135     STATE_CLEARFLAGS();
    136     STATE_SETG0(CHARSET_ASCII);
    137     STATE_SETG1(CHARSET_ASCII);
    138     return 0;
    139 }
    140 
    141 ENCODER_RESET(iso2022)
    142 {
    143     if (STATE_GETFLAG(F_SHIFTED)) {
    144         WRITEBYTE1(SI);
    145         NEXT_OUT(1);
    146         STATE_CLEARFLAG(F_SHIFTED);
    147     }
    148     if (STATE_G0 != CHARSET_ASCII) {
    149         WRITEBYTE3(ESC, '(', 'B');
    150         NEXT_OUT(3);
    151         STATE_SETG0(CHARSET_ASCII);
    152     }
    153     return 0;
    154 }
    155 
    156 ENCODER(iso2022)
    157 {
    158     while (*inpos < inlen) {
    159         const struct iso2022_designation *dsg;
    160         DBCHAR encoded;
    161         Py_UCS4 c = INCHAR1;
    162         Py_ssize_t insize;
    163 
    164         if (c < 0x80) {
    165             if (STATE_G0 != CHARSET_ASCII) {
    166                 WRITEBYTE3(ESC, '(', 'B');
    167                 STATE_SETG0(CHARSET_ASCII);
    168                 NEXT_OUT(3);
    169             }
    170             if (STATE_GETFLAG(F_SHIFTED)) {
    171                 WRITEBYTE1(SI);
    172                 STATE_CLEARFLAG(F_SHIFTED);
    173                 NEXT_OUT(1);
    174             }
    175             WRITEBYTE1((unsigned char)c);
    176             NEXT(1, 1);
    177             continue;
    178         }
    179 
    180         insize = 1;
    181 
    182         encoded = MAP_UNMAPPABLE;
    183         for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
    184             Py_ssize_t length = 1;
    185             encoded = dsg->encoder(&c, &length);
    186             if (encoded == MAP_MULTIPLE_AVAIL) {
    187                 /* this implementation won't work for pair
    188                  * of non-bmp characters. */
    189                 if (inlen - *inpos < 2) {
    190                     if (!(flags & MBENC_FLUSH))
    191                         return MBERR_TOOFEW;
    192                     length = -1;
    193                 }
    194                 else
    195                     length = 2;
    196                 encoded = dsg->encoder(&c, &length);
    197                 if (encoded != MAP_UNMAPPABLE) {
    198                     insize = length;
    199                     break;
    200                 }
    201             }
    202             else if (encoded != MAP_UNMAPPABLE)
    203                 break;
    204         }
    205 
    206         if (!dsg->mark)
    207             return 1;
    208         assert(dsg->width == 1 || dsg->width == 2);
    209 
    210         switch (dsg->plane) {
    211         case 0: /* G0 */
    212             if (STATE_GETFLAG(F_SHIFTED)) {
    213                 WRITEBYTE1(SI);
    214                 STATE_CLEARFLAG(F_SHIFTED);
    215                 NEXT_OUT(1);
    216             }
    217             if (STATE_G0 != dsg->mark) {
    218                 if (dsg->width == 1) {
    219                     WRITEBYTE3(ESC, '(', ESCMARK(dsg->mark));
    220                     STATE_SETG0(dsg->mark);
    221                     NEXT_OUT(3);
    222                 }
    223                 else if (dsg->mark == CHARSET_JISX0208) {
    224                     WRITEBYTE3(ESC, '$', ESCMARK(dsg->mark));
    225                     STATE_SETG0(dsg->mark);
    226                     NEXT_OUT(3);
    227                 }
    228                 else {
    229                     WRITEBYTE4(ESC, '$', '(',
    230                         ESCMARK(dsg->mark));
    231                     STATE_SETG0(dsg->mark);
    232                     NEXT_OUT(4);
    233                 }
    234             }
    235             break;
    236         case 1: /* G1 */
    237             if (STATE_G1 != dsg->mark) {
    238                 if (dsg->width == 1) {
    239                     WRITEBYTE3(ESC, ')', ESCMARK(dsg->mark));
    240                     STATE_SETG1(dsg->mark);
    241                     NEXT_OUT(3);
    242                 }
    243                 else {
    244                     WRITEBYTE4(ESC, '$', ')', ESCMARK(dsg->mark));
    245                     STATE_SETG1(dsg->mark);
    246                     NEXT_OUT(4);
    247                 }
    248             }
    249             if (!STATE_GETFLAG(F_SHIFTED)) {
    250                 WRITEBYTE1(SO);
    251                 STATE_SETFLAG(F_SHIFTED);
    252                 NEXT_OUT(1);
    253             }
    254             break;
    255         default: /* G2 and G3 is not supported: no encoding in
    256                   * CJKCodecs are using them yet */
    257             return MBERR_INTERNAL;
    258         }
    259 
    260         if (dsg->width == 1) {
    261             WRITEBYTE1((unsigned char)encoded);
    262             NEXT_OUT(1);
    263         }
    264         else {
    265             WRITEBYTE2(encoded >> 8, encoded & 0xff);
    266             NEXT_OUT(2);
    267         }
    268         NEXT_INCHAR(insize);
    269     }
    270 
    271     return 0;
    272 }
    273 
    274 DECODER_INIT(iso2022)
    275 {
    276     STATE_CLEARFLAGS();
    277     STATE_SETG0(CHARSET_ASCII);
    278     STATE_SETG1(CHARSET_ASCII);
    279     STATE_SETG2(CHARSET_ASCII);
    280     return 0;
    281 }
    282 
    283 DECODER_RESET(iso2022)
    284 {
    285     STATE_SETG0(CHARSET_ASCII);
    286     STATE_CLEARFLAG(F_SHIFTED);
    287     return 0;
    288 }
    289 
    290 static Py_ssize_t
    291 iso2022processesc(const void *config, MultibyteCodec_State *state,
    292                   const unsigned char **inbuf, Py_ssize_t *inleft)
    293 {
    294     unsigned char charset, designation;
    295     Py_ssize_t i, esclen = 0;
    296 
    297     for (i = 1;i < MAX_ESCSEQLEN;i++) {
    298         if (i >= *inleft)
    299             return MBERR_TOOFEW;
    300         if (IS_ESCEND((*inbuf)[i])) {
    301             esclen = i + 1;
    302             break;
    303         }
    304         else if (CONFIG_ISSET(USE_JISX0208_EXT) && i+1 < *inleft &&
    305                  (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@') {
    306             i += 2;
    307         }
    308     }
    309 
    310     switch (esclen) {
    311     case 0:
    312         return 1; /* unterminated escape sequence */
    313     case 3:
    314         if (INBYTE2 == '$') {
    315             charset = INBYTE3 | CHARSET_DBCS;
    316             designation = 0;
    317         }
    318         else {
    319             charset = INBYTE3;
    320             if (INBYTE2 == '(')
    321                 designation = 0;
    322             else if (INBYTE2 == ')')
    323                 designation = 1;
    324             else if (CONFIG_ISSET(USE_G2) && INBYTE2 == '.')
    325                 designation = 2;
    326             else
    327                 return 3;
    328         }
    329         break;
    330     case 4:
    331         if (INBYTE2 != '$')
    332             return 4;
    333 
    334         charset = INBYTE4 | CHARSET_DBCS;
    335         if (INBYTE3 == '(')
    336             designation = 0;
    337         else if (INBYTE3 == ')')
    338             designation = 1;
    339         else
    340             return 4;
    341         break;
    342     case 6: /* designation with prefix */
    343         if (CONFIG_ISSET(USE_JISX0208_EXT) &&
    344             (*inbuf)[3] == ESC && (*inbuf)[4] == '$' &&
    345             (*inbuf)[5] == 'B') {
    346             charset = 'B' | CHARSET_DBCS;
    347             designation = 0;
    348         }
    349         else
    350             return 6;
    351         break;
    352     default:
    353         return esclen;
    354     }
    355 
    356     /* raise error when the charset is not designated for this encoding */
    357     if (charset != CHARSET_ASCII) {
    358         const struct iso2022_designation *dsg;
    359 
    360         for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
    361             if (dsg->mark == charset)
    362                 break;
    363         }
    364         if (!dsg->mark)
    365             return esclen;
    366     }
    367 
    368     STATE_SETG(designation, charset);
    369     *inleft -= esclen;
    370     (*inbuf) += esclen;
    371     return 0;
    372 }
    373 
    374 #define ISO8859_7_DECODE(c, writer)                                \
    375     if ((c) < 0xa0) {                                              \
    376         OUTCHAR(c);                                                \
    377     } else if ((c) < 0xc0 && (0x288f3bc9L & (1L << ((c)-0xa0)))) { \
    378         OUTCHAR(c);                                                \
    379     } else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 ||       \
    380              (0xbffffd77L & (1L << ((c)-0xb4))))) {                \
    381         OUTCHAR(0x02d0 + (c));                                     \
    382     } else if ((c) == 0xa1) {                                      \
    383         OUTCHAR(0x2018);                                           \
    384     } else if ((c) == 0xa2) {                                      \
    385         OUTCHAR(0x2019);                                           \
    386     } else if ((c) == 0xaf) {                                      \
    387         OUTCHAR(0x2015);                                           \
    388     }
    389 
    390 static Py_ssize_t
    391 iso2022processg2(const void *config, MultibyteCodec_State *state,
    392                  const unsigned char **inbuf, Py_ssize_t *inleft,
    393                  _PyUnicodeWriter *writer)
    394 {
    395     /* not written to use encoder, decoder functions because only few
    396      * encodings use G2 designations in CJKCodecs */
    397     if (STATE_G2 == CHARSET_ISO8859_1) {
    398         if (INBYTE3 < 0x80)
    399             OUTCHAR(INBYTE3 + 0x80);
    400         else
    401             return 3;
    402     }
    403     else if (STATE_G2 == CHARSET_ISO8859_7) {
    404         ISO8859_7_DECODE(INBYTE3 ^ 0x80, writer)
    405         else
    406             return 3;
    407     }
    408     else if (STATE_G2 == CHARSET_ASCII) {
    409         if (INBYTE3 & 0x80)
    410             return 3;
    411         else
    412             OUTCHAR(INBYTE3);
    413     }
    414     else
    415         return MBERR_INTERNAL;
    416 
    417     (*inbuf) += 3;
    418     *inleft -= 3;
    419     return 0;
    420 }
    421 
    422 DECODER(iso2022)
    423 {
    424     const struct iso2022_designation *dsgcache = NULL;
    425 
    426     while (inleft > 0) {
    427         unsigned char c = INBYTE1;
    428         Py_ssize_t err;
    429 
    430         if (STATE_GETFLAG(F_ESCTHROUGHOUT)) {
    431             /* ESC throughout mode:
    432              * for non-iso2022 escape sequences */
    433             OUTCHAR(c); /* assume as ISO-8859-1 */
    434             NEXT_IN(1);
    435             if (IS_ESCEND(c)) {
    436                 STATE_CLEARFLAG(F_ESCTHROUGHOUT);
    437             }
    438             continue;
    439         }
    440 
    441         switch (c) {
    442         case ESC:
    443             REQUIRE_INBUF(2);
    444             if (IS_ISO2022ESC(INBYTE2)) {
    445                 err = iso2022processesc(config, state,
    446                                         inbuf, &inleft);
    447                 if (err != 0)
    448                     return err;
    449             }
    450             else if (CONFIG_ISSET(USE_G2) && INBYTE2 == 'N') {/* SS2 */
    451                 REQUIRE_INBUF(3);
    452                 err = iso2022processg2(config, state,
    453                                        inbuf, &inleft, writer);
    454                 if (err != 0)
    455                     return err;
    456             }
    457             else {
    458                 OUTCHAR(ESC);
    459                 STATE_SETFLAG(F_ESCTHROUGHOUT);
    460                 NEXT_IN(1);
    461             }
    462             break;
    463         case SI:
    464             if (CONFIG_ISSET(NO_SHIFT))
    465                 goto bypass;
    466             STATE_CLEARFLAG(F_SHIFTED);
    467             NEXT_IN(1);
    468             break;
    469         case SO:
    470             if (CONFIG_ISSET(NO_SHIFT))
    471                 goto bypass;
    472             STATE_SETFLAG(F_SHIFTED);
    473             NEXT_IN(1);
    474             break;
    475         case LF:
    476             STATE_CLEARFLAG(F_SHIFTED);
    477             OUTCHAR(LF);
    478             NEXT_IN(1);
    479             break;
    480         default:
    481             if (c < 0x20) /* C0 */
    482                 goto bypass;
    483             else if (c >= 0x80)
    484                 return 1;
    485             else {
    486                 const struct iso2022_designation *dsg;
    487                 unsigned char charset;
    488                 Py_UCS4 decoded;
    489 
    490                 if (STATE_GETFLAG(F_SHIFTED))
    491                     charset = STATE_G1;
    492                 else
    493                     charset = STATE_G0;
    494 
    495                 if (charset == CHARSET_ASCII) {
    496 bypass:
    497                     OUTCHAR(c);
    498                     NEXT_IN(1);
    499                     break;
    500                 }
    501 
    502                 if (dsgcache != NULL &&
    503                     dsgcache->mark == charset)
    504                         dsg = dsgcache;
    505                 else {
    506                     for (dsg = CONFIG_DESIGNATIONS;
    507                          dsg->mark != charset
    508 #ifdef Py_DEBUG
    509                             && dsg->mark != '\0'
    510 #endif
    511                          ; dsg++)
    512                     {
    513                         /* noop */
    514                     }
    515                     assert(dsg->mark != '\0');
    516                     dsgcache = dsg;
    517                 }
    518 
    519                 REQUIRE_INBUF(dsg->width);
    520                 decoded = dsg->decoder(*inbuf);
    521                 if (decoded == MAP_UNMAPPABLE)
    522                     return dsg->width;
    523 
    524                 if (decoded < 0x10000) {
    525                     OUTCHAR(decoded);
    526                 }
    527                 else if (decoded < 0x30000) {
    528                     OUTCHAR(decoded);
    529                 }
    530                 else { /* JIS X 0213 pairs */
    531                     OUTCHAR2(decoded >> 16, decoded & 0xffff);
    532                 }
    533                 NEXT_IN(dsg->width);
    534             }
    535             break;
    536         }
    537     }
    538     return 0;
    539 }
    540 
    541 /*-*- mapping table holders -*-*/
    542 
/* Declare a file-local pointer to an imported mapping table, initially
 * NULL; the *_init() functions below fill these in through IMPORT_MAP. */
#define ENCMAP(enc) static const encode_map *enc##_encmap = NULL;
#define DECMAP(enc) static const decode_map *enc##_decmap = NULL;

/* kr */
ENCMAP(cp949)
DECMAP(ksx1001)

/* jp */
ENCMAP(jisxcommon)
DECMAP(jisx0208)
DECMAP(jisx0212)
ENCMAP(jisx0213_bmp)
DECMAP(jisx0213_1_bmp)
DECMAP(jisx0213_2_bmp)
ENCMAP(jisx0213_emp)
DECMAP(jisx0213_1_emp)
DECMAP(jisx0213_2_emp)

/* cn */
ENCMAP(gbcommon)
DECMAP(gb2312)
    564 
    565 /* tw */
    566 
    567 /*-*- mapping access functions -*-*/
    568 
    569 static int
    570 ksx1001_init(void)
    571 {
    572     static int initialized = 0;
    573 
    574     if (!initialized && (
    575                     IMPORT_MAP(kr, cp949, &cp949_encmap, NULL) ||
    576                     IMPORT_MAP(kr, ksx1001, NULL, &ksx1001_decmap)))
    577         return -1;
    578     initialized = 1;
    579     return 0;
    580 }
    581 
    582 static Py_UCS4
    583 ksx1001_decoder(const unsigned char *data)
    584 {
    585     Py_UCS4 u;
    586     if (TRYMAP_DEC(ksx1001, u, data[0], data[1]))
    587         return u;
    588     else
    589         return MAP_UNMAPPABLE;
    590 }
    591 
    592 static DBCHAR
    593 ksx1001_encoder(const Py_UCS4 *data, Py_ssize_t *length)
    594 {
    595     DBCHAR coded;
    596     assert(*length == 1);
    597     if (*data < 0x10000) {
    598         if (TRYMAP_ENC(cp949, coded, *data)) {
    599             if (!(coded & 0x8000))
    600                 return coded;
    601         }
    602     }
    603     return MAP_UNMAPPABLE;
    604 }
    605 
    606 static int
    607 jisx0208_init(void)
    608 {
    609     static int initialized = 0;
    610 
    611     if (!initialized && (
    612                     IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
    613                     IMPORT_MAP(jp, jisx0208, NULL, &jisx0208_decmap)))
    614         return -1;
    615     initialized = 1;
    616     return 0;
    617 }
    618 
    619 static Py_UCS4
    620 jisx0208_decoder(const unsigned char *data)
    621 {
    622     Py_UCS4 u;
    623     if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
    624         return 0xff3c;
    625     else if (TRYMAP_DEC(jisx0208, u, data[0], data[1]))
    626         return u;
    627     else
    628         return MAP_UNMAPPABLE;
    629 }
    630 
    631 static DBCHAR
    632 jisx0208_encoder(const Py_UCS4 *data, Py_ssize_t *length)
    633 {
    634     DBCHAR coded;
    635     assert(*length == 1);
    636     if (*data < 0x10000) {
    637         if (*data == 0xff3c) /* F/W REVERSE SOLIDUS */
    638             return 0x2140;
    639         else if (TRYMAP_ENC(jisxcommon, coded, *data)) {
    640             if (!(coded & 0x8000))
    641                 return coded;
    642         }
    643     }
    644     return MAP_UNMAPPABLE;
    645 }
    646 
    647 static int
    648 jisx0212_init(void)
    649 {
    650     static int initialized = 0;
    651 
    652     if (!initialized && (
    653                     IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
    654                     IMPORT_MAP(jp, jisx0212, NULL, &jisx0212_decmap)))
    655         return -1;
    656     initialized = 1;
    657     return 0;
    658 }
    659 
    660 static Py_UCS4
    661 jisx0212_decoder(const unsigned char *data)
    662 {
    663     Py_UCS4 u;
    664     if (TRYMAP_DEC(jisx0212, u, data[0], data[1]))
    665         return u;
    666     else
    667         return MAP_UNMAPPABLE;
    668 }
    669 
    670 static DBCHAR
    671 jisx0212_encoder(const Py_UCS4 *data, Py_ssize_t *length)
    672 {
    673     DBCHAR coded;
    674     assert(*length == 1);
    675     if (*data < 0x10000) {
    676         if (TRYMAP_ENC(jisxcommon, coded, *data)) {
    677             if (coded & 0x8000)
    678                 return coded & 0x7fff;
    679         }
    680     }
    681     return MAP_UNMAPPABLE;
    682 }
    683 
    684 static int
    685 jisx0213_init(void)
    686 {
    687     static int initialized = 0;
    688 
    689     if (!initialized && (
    690                     jisx0208_init() ||
    691                     IMPORT_MAP(jp, jisx0213_bmp,
    692                                &jisx0213_bmp_encmap, NULL) ||
    693                     IMPORT_MAP(jp, jisx0213_1_bmp,
    694                                NULL, &jisx0213_1_bmp_decmap) ||
    695                     IMPORT_MAP(jp, jisx0213_2_bmp,
    696                                NULL, &jisx0213_2_bmp_decmap) ||
    697                     IMPORT_MAP(jp, jisx0213_emp,
    698                                &jisx0213_emp_encmap, NULL) ||
    699                     IMPORT_MAP(jp, jisx0213_1_emp,
    700                                NULL, &jisx0213_1_emp_decmap) ||
    701                     IMPORT_MAP(jp, jisx0213_2_emp,
    702                                NULL, &jisx0213_2_emp_decmap) ||
    703                     IMPORT_MAP(jp, jisx0213_pair, &jisx0213_pair_encmap,
    704                                &jisx0213_pair_decmap)))
    705         return -1;
    706     initialized = 1;
    707     return 0;
    708 }
    709 
    710 #define config ((void *)2000)
    711 static Py_UCS4
    712 jisx0213_2000_1_decoder(const unsigned char *data)
    713 {
    714     Py_UCS4 u;
    715     EMULATE_JISX0213_2000_DECODE_PLANE1(u, data[0], data[1])
    716     else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
    717         return 0xff3c;
    718     else if (TRYMAP_DEC(jisx0208, u, data[0], data[1]))
    719         ;
    720     else if (TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]))
    721         ;
    722     else if (TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1]))
    723         u |= 0x20000;
    724     else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]))
    725         ;
    726     else
    727         return MAP_UNMAPPABLE;
    728     return u;
    729 }
    730 
    731 static Py_UCS4
    732 jisx0213_2000_2_decoder(const unsigned char *data)
    733 {
    734     Py_UCS4 u;
    735     EMULATE_JISX0213_2000_DECODE_PLANE2_CHAR(u, data[0], data[1])
    736     if (TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]))
    737         ;
    738     else if (TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1]))
    739         u |= 0x20000;
    740     else
    741         return MAP_UNMAPPABLE;
    742     return u;
    743 }
    744 #undef config
    745 
    746 static Py_UCS4
    747 jisx0213_2004_1_decoder(const unsigned char *data)
    748 {
    749     Py_UCS4 u;
    750     if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
    751         return 0xff3c;
    752     else if (TRYMAP_DEC(jisx0208, u, data[0], data[1]))
    753         ;
    754     else if (TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]))
    755         ;
    756     else if (TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1]))
    757         u |= 0x20000;
    758     else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]))
    759         ;
    760     else
    761         return MAP_UNMAPPABLE;
    762     return u;
    763 }
    764 
    765 static Py_UCS4
    766 jisx0213_2004_2_decoder(const unsigned char *data)
    767 {
    768     Py_UCS4 u;
    769     if (TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]))
    770         ;
    771     else if (TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1]))
    772         u |= 0x20000;
    773     else
    774         return MAP_UNMAPPABLE;
    775     return u;
    776 }
    777 
    778 static DBCHAR
    779 jisx0213_encoder(const Py_UCS4 *data, Py_ssize_t *length, void *config)
    780 {
    781     DBCHAR coded;
    782 
    783     switch (*length) {
    784     case 1: /* first character */
    785         if (*data >= 0x10000) {
    786             if ((*data) >> 16 == 0x20000 >> 16) {
    787                 EMULATE_JISX0213_2000_ENCODE_EMP(coded, *data)
    788                 else if (TRYMAP_ENC(jisx0213_emp, coded, (*data) & 0xffff))
    789                     return coded;
    790             }
    791             return MAP_UNMAPPABLE;
    792         }
    793 
    794         EMULATE_JISX0213_2000_ENCODE_BMP(coded, *data)
    795         else if (TRYMAP_ENC(jisx0213_bmp, coded, *data)) {
    796             if (coded == MULTIC)
    797                 return MAP_MULTIPLE_AVAIL;
    798         }
    799         else if (TRYMAP_ENC(jisxcommon, coded, *data)) {
    800             if (coded & 0x8000)
    801                 return MAP_UNMAPPABLE;
    802         }
    803         else
    804             return MAP_UNMAPPABLE;
    805         return coded;
    806 
    807     case 2: /* second character of unicode pair */
    808         coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
    809                                 jisx0213_pair_encmap, JISX0213_ENCPAIRS);
    810         if (coded == DBCINV) {
    811             *length = 1;
    812             coded = find_pairencmap((ucs2_t)data[0], 0,
    813                       jisx0213_pair_encmap, JISX0213_ENCPAIRS);
    814             if (coded == DBCINV)
    815                 return MAP_UNMAPPABLE;
    816         }
    817         else
    818             return coded;
    819 
    820     case -1: /* flush unterminated */
    821         *length = 1;
    822         coded = find_pairencmap((ucs2_t)data[0], 0,
    823                                 jisx0213_pair_encmap, JISX0213_ENCPAIRS);
    824         if (coded == DBCINV)
    825             return MAP_UNMAPPABLE;
    826         else
    827             return coded;
    828         break;
    829 
    830     default:
    831         return MAP_UNMAPPABLE;
    832     }
    833 }
    834 
    835 static DBCHAR
    836 jisx0213_2000_1_encoder(const Py_UCS4 *data, Py_ssize_t *length)
    837 {
    838     DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
    839     if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
    840         return coded;
    841     else if (coded & 0x8000)
    842         return MAP_UNMAPPABLE;
    843     else
    844         return coded;
    845 }
    846 
    847 static DBCHAR
    848 jisx0213_2000_1_encoder_paironly(const Py_UCS4 *data, Py_ssize_t *length)
    849 {
    850     DBCHAR coded;
    851     Py_ssize_t ilength = *length;
    852 
    853     coded = jisx0213_encoder(data, length, (void *)2000);
    854     switch (ilength) {
    855     case 1:
    856         if (coded == MAP_MULTIPLE_AVAIL)
    857             return MAP_MULTIPLE_AVAIL;
    858         else
    859             return MAP_UNMAPPABLE;
    860     case 2:
    861         if (*length != 2)
    862             return MAP_UNMAPPABLE;
    863         else
    864             return coded;
    865     default:
    866         return MAP_UNMAPPABLE;
    867     }
    868 }
    869 
    870 static DBCHAR
    871 jisx0213_2000_2_encoder(const Py_UCS4 *data, Py_ssize_t *length)
    872 {
    873     DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
    874     if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
    875         return coded;
    876     else if (coded & 0x8000)
    877         return coded & 0x7fff;
    878     else
    879         return MAP_UNMAPPABLE;
    880 }
    881 
    882 static DBCHAR
    883 jisx0213_2004_1_encoder(const Py_UCS4 *data, Py_ssize_t *length)
    884 {
    885     DBCHAR coded = jisx0213_encoder(data, length, NULL);
    886     if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
    887         return coded;
    888     else if (coded & 0x8000)
    889         return MAP_UNMAPPABLE;
    890     else
    891         return coded;
    892 }
    893 
    894 static DBCHAR
    895 jisx0213_2004_1_encoder_paironly(const Py_UCS4 *data, Py_ssize_t *length)
    896 {
    897     DBCHAR coded;
    898     Py_ssize_t ilength = *length;
    899 
    900     coded = jisx0213_encoder(data, length, NULL);
    901     switch (ilength) {
    902     case 1:
    903         if (coded == MAP_MULTIPLE_AVAIL)
    904             return MAP_MULTIPLE_AVAIL;
    905         else
    906             return MAP_UNMAPPABLE;
    907     case 2:
    908         if (*length != 2)
    909             return MAP_UNMAPPABLE;
    910         else
    911             return coded;
    912     default:
    913         return MAP_UNMAPPABLE;
    914     }
    915 }
    916 
    917 static DBCHAR
    918 jisx0213_2004_2_encoder(const Py_UCS4 *data, Py_ssize_t *length)
    919 {
    920     DBCHAR coded = jisx0213_encoder(data, length, NULL);
    921     if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
    922         return coded;
    923     else if (coded & 0x8000)
    924         return coded & 0x7fff;
    925     else
    926         return MAP_UNMAPPABLE;
    927 }
    928 
    929 static Py_UCS4
    930 jisx0201_r_decoder(const unsigned char *data)
    931 {
    932     Py_UCS4 u;
    933     JISX0201_R_DECODE_CHAR(*data, u)
    934     else
    935         return MAP_UNMAPPABLE;
    936     return u;
    937 }
    938 
    939 static DBCHAR
    940 jisx0201_r_encoder(const Py_UCS4 *data, Py_ssize_t *length)
    941 {
    942     DBCHAR coded;
    943     JISX0201_R_ENCODE(*data, coded)
    944     else
    945         return MAP_UNMAPPABLE;
    946     return coded;
    947 }
    948 
    949 static Py_UCS4
    950 jisx0201_k_decoder(const unsigned char *data)
    951 {
    952     Py_UCS4 u;
    953     JISX0201_K_DECODE_CHAR(*data ^ 0x80, u)
    954     else
    955         return MAP_UNMAPPABLE;
    956     return u;
    957 }
    958 
    959 static DBCHAR
    960 jisx0201_k_encoder(const Py_UCS4 *data, Py_ssize_t *length)
    961 {
    962     DBCHAR coded;
    963     JISX0201_K_ENCODE(*data, coded)
    964     else
    965         return MAP_UNMAPPABLE;
    966     return coded - 0x80;
    967 }
    968 
    969 static int
    970 gb2312_init(void)
    971 {
    972     static int initialized = 0;
    973 
    974     if (!initialized && (
    975                     IMPORT_MAP(cn, gbcommon, &gbcommon_encmap, NULL) ||
    976                     IMPORT_MAP(cn, gb2312, NULL, &gb2312_decmap)))
    977         return -1;
    978     initialized = 1;
    979     return 0;
    980 }
    981 
    982 static Py_UCS4
    983 gb2312_decoder(const unsigned char *data)
    984 {
    985     Py_UCS4 u;
    986     if (TRYMAP_DEC(gb2312, u, data[0], data[1]))
    987         return u;
    988     else
    989         return MAP_UNMAPPABLE;
    990 }
    991 
    992 static DBCHAR
    993 gb2312_encoder(const Py_UCS4 *data, Py_ssize_t *length)
    994 {
    995     DBCHAR coded;
    996     assert(*length == 1);
    997     if (*data < 0x10000) {
    998         if (TRYMAP_ENC(gbcommon, coded, *data)) {
    999             if (!(coded & 0x8000))
   1000                 return coded;
   1001         }
   1002     }
   1003     return MAP_UNMAPPABLE;
   1004 }
   1005 
   1006 
   1007 static Py_UCS4
   1008 dummy_decoder(const unsigned char *data)
   1009 {
   1010     return MAP_UNMAPPABLE;
   1011 }
   1012 
   1013 static DBCHAR
   1014 dummy_encoder(const Py_UCS4 *data, Py_ssize_t *length)
   1015 {
   1016     return MAP_UNMAPPABLE;
   1017 }
   1018 
   1019 /*-*- registry tables -*-*/
   1020 
   /*
    * Designation registry entries.
    *
    * Every macro expands to a brace initializer with six members, in order:
    * charset identifier, a small integer (0, 1 or 2), a second small integer
    * (1 or 2), an init callback (or NULL), a decoder and an encoder.
    * NOTE(review): going by the _G0/_G1 suffixes the first integer looks like
    * the G-set the charset is designated to, and the second like the number
    * of bytes per character -- confirm against struct iso2022_designation.
    */

   /* KS X 1001, once per first-integer value 0 and 1. */
   #define REGISTRY_KSX1001_G0                                             \
       { CHARSET_KSX1001, 0, 2,                                            \
         ksx1001_init, ksx1001_decoder, ksx1001_encoder }
   #define REGISTRY_KSX1001_G1                                             \
       { CHARSET_KSX1001, 1, 2,                                            \
         ksx1001_init, ksx1001_decoder, ksx1001_encoder }

   /* JIS X 0201 entries: no init callback. */
   #define REGISTRY_JISX0201_R                                             \
       { CHARSET_JISX0201_R, 0, 1,                                         \
         NULL, jisx0201_r_decoder, jisx0201_r_encoder }
   #define REGISTRY_JISX0201_K                                             \
       { CHARSET_JISX0201_K, 0, 1,                                         \
         NULL, jisx0201_k_decoder, jisx0201_k_encoder }

   /* JIS X 0208 under two charset identifiers, sharing all callbacks. */
   #define REGISTRY_JISX0208                                               \
       { CHARSET_JISX0208, 0, 2,                                           \
         jisx0208_init, jisx0208_decoder, jisx0208_encoder }
   #define REGISTRY_JISX0208_O                                             \
       { CHARSET_JISX0208_O, 0, 2,                                         \
         jisx0208_init, jisx0208_decoder, jisx0208_encoder }

   #define REGISTRY_JISX0212                                               \
       { CHARSET_JISX0212, 0, 2,                                           \
         jisx0212_init, jisx0212_decoder, jisx0212_encoder }

   /* JIS X 0213:2000; the PAIRONLY entry differs only in its encoder. */
   #define REGISTRY_JISX0213_2000_1                                        \
       { CHARSET_JISX0213_2000_1, 0, 2,                                    \
         jisx0213_init,                                                    \
         jisx0213_2000_1_decoder, jisx0213_2000_1_encoder }
   #define REGISTRY_JISX0213_2000_1_PAIRONLY                               \
       { CHARSET_JISX0213_2000_1, 0, 2,                                    \
         jisx0213_init,                                                    \
         jisx0213_2000_1_decoder, jisx0213_2000_1_encoder_paironly }
   #define REGISTRY_JISX0213_2000_2                                        \
       { CHARSET_JISX0213_2, 0, 2,                                         \
         jisx0213_init,                                                    \
         jisx0213_2000_2_decoder, jisx0213_2000_2_encoder }

   /* JIS X 0213:2004; the PAIRONLY entry differs only in its encoder. */
   #define REGISTRY_JISX0213_2004_1                                        \
       { CHARSET_JISX0213_2004_1, 0, 2,                                    \
         jisx0213_init,                                                    \
         jisx0213_2004_1_decoder, jisx0213_2004_1_encoder }
   #define REGISTRY_JISX0213_2004_1_PAIRONLY                               \
       { CHARSET_JISX0213_2004_1, 0, 2,                                    \
         jisx0213_init,                                                    \
         jisx0213_2004_1_decoder, jisx0213_2004_1_encoder_paironly }
   #define REGISTRY_JISX0213_2004_2                                        \
       { CHARSET_JISX0213_2, 0, 2,                                         \
         jisx0213_init,                                                    \
         jisx0213_2004_2_decoder, jisx0213_2004_2_encoder }

   #define REGISTRY_GB2312                                                 \
       { CHARSET_GB2312, 0, 2,                                             \
         gb2312_init, gb2312_decoder, gb2312_encoder }

   /* CNS 11643 planes, sharing one init callback. */
   #define REGISTRY_CNS11643_1                                             \
       { CHARSET_CNS11643_1, 1, 2,                                         \
         cns11643_init, cns11643_1_decoder, cns11643_1_encoder }
   #define REGISTRY_CNS11643_2                                             \
       { CHARSET_CNS11643_2, 2, 2,                                         \
         cns11643_init, cns11643_2_decoder, cns11643_2_encoder }

   /* ISO 8859 entries are wired to the always-unmappable dummy callbacks. */
   #define REGISTRY_ISO8859_1                                              \
       { CHARSET_ISO8859_1, 2, 1,                                          \
         NULL, dummy_decoder, dummy_encoder }
   #define REGISTRY_ISO8859_7                                              \
       { CHARSET_ISO8859_7, 2, 1,                                          \
         NULL, dummy_decoder, dummy_encoder }

   /* Terminator entry: first member 0, the rest implicitly zeroed. */
   #define REGISTRY_SENTINEL       { 0, }
   /*
    * CONFIGDEF(var, attrs) defines the static constant
    * iso2022_<var>_config, initialised with the attribute flags `attrs`
    * followed by the iso2022_<var>_designations array.  That array must
    * already be declared at the point of use.  The expansion carries its own
    * terminating semicolon, so invocations are written without one.
    */
   #define CONFIGDEF(var, attrs)                                           \
       static const struct iso2022_config iso2022_##var##_config = {       \
           attrs,                                                          \
           iso2022_##var##_designations                                    \
       };
   1084 
   1085 static const struct iso2022_designation iso2022_kr_designations[] = {
   1086     REGISTRY_KSX1001_G1, REGISTRY_SENTINEL
   1087 };
   1088 CONFIGDEF(kr, 0)
   1089 
   1090 static const struct iso2022_designation iso2022_jp_designations[] = {
   1091     REGISTRY_JISX0208, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
   1092     REGISTRY_SENTINEL
   1093 };
   1094 CONFIGDEF(jp, NO_SHIFT | USE_JISX0208_EXT)
   1095 
   1096 static const struct iso2022_designation iso2022_jp_1_designations[] = {
   1097     REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
   1098     REGISTRY_JISX0208_O, REGISTRY_SENTINEL
   1099 };
   1100 CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT)
   1101 
   1102 static const struct iso2022_designation iso2022_jp_2_designations[] = {
   1103     REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0,
   1104     REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
   1105     REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL
   1106 };
   1107 CONFIGDEF(jp_2, NO_SHIFT | USE_G2 | USE_JISX0208_EXT)
   1108 
   1109 static const struct iso2022_designation iso2022_jp_2004_designations[] = {
   1110     REGISTRY_JISX0213_2004_1_PAIRONLY, REGISTRY_JISX0208,
   1111     REGISTRY_JISX0213_2004_1, REGISTRY_JISX0213_2004_2, REGISTRY_SENTINEL
   1112 };
   1113 CONFIGDEF(jp_2004, NO_SHIFT | USE_JISX0208_EXT)
   1114 
   1115 static const struct iso2022_designation iso2022_jp_3_designations[] = {
   1116     REGISTRY_JISX0213_2000_1_PAIRONLY, REGISTRY_JISX0208,
   1117     REGISTRY_JISX0213_2000_1, REGISTRY_JISX0213_2000_2, REGISTRY_SENTINEL
   1118 };
   1119 CONFIGDEF(jp_3, NO_SHIFT | USE_JISX0208_EXT)
   1120 
   1121 static const struct iso2022_designation iso2022_jp_ext_designations[] = {
   1122     REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
   1123     REGISTRY_JISX0201_K, REGISTRY_JISX0208_O, REGISTRY_SENTINEL
   1124 };
   1125 CONFIGDEF(jp_ext, NO_SHIFT | USE_JISX0208_EXT)
   1126 
   1127 
   1128 BEGIN_MAPPINGS_LIST
   1129   /* no mapping table here */
   1130 END_MAPPINGS_LIST
   1131 
   1132 #define ISO2022_CODEC(variation) {              \
   1133     "iso2022_" #variation,                      \
   1134     &iso2022_##variation##_config,              \
   1135     iso2022_codec_init,                         \
   1136     _STATEFUL_METHODS(iso2022)                  \
   1137 },
   1138 
   1139 BEGIN_CODECS_LIST
   1140   ISO2022_CODEC(kr)
   1141   ISO2022_CODEC(jp)
   1142   ISO2022_CODEC(jp_1)
   1143   ISO2022_CODEC(jp_2)
   1144   ISO2022_CODEC(jp_2004)
   1145   ISO2022_CODEC(jp_3)
   1146   ISO2022_CODEC(jp_ext)
   1147 END_CODECS_LIST
   1148 
   1149 I_AM_A_MODULE_FOR(iso2022)
   1150