Home | History | Annotate | Download | only in cjkcodecs
      1 /*
      2  * _codecs_iso2022.c: Codecs collection for ISO-2022 encodings.
      3  *
      4  * Written by Hye-Shik Chang <perky (at) FreeBSD.org>
      5  */
      6 
      7 #define USING_IMPORTED_MAPS
      8 #define USING_BINARY_PAIR_SEARCH
      9 #define EXTERN_JISX0213_PAIR
     10 #define EMULATE_JISX0213_2000_ENCODE_INVALID MAP_UNMAPPABLE
     11 #define EMULATE_JISX0213_2000_DECODE_INVALID MAP_UNMAPPABLE
     12 
     13 #include "cjkcodecs.h"
     14 #include "alg_jisx0201.h"
     15 #include "emu_jisx0213_2000.h"
     16 #include "mappings_jisx0213_pair.h"
     17 
     18 /* STATE
     19 
     20    state->c[0-3]
     21 
     22     00000000
     23     ||^^^^^|
     24     |+-----+----  G0-3 Character Set
     25     +-----------  Is G0-3 double byte?
     26 
     27    state->c[4]
     28 
     29     00000000
     30           ||
     31           |+----  Locked-Shift?
     32           +-----  ESC Throughout
     33 */
     34 
     /* Control characters the codec treats specially. */
     #define LF                      0x0A    /* line feed: the decoder drops the shift state on it */
     #define SO                      0x0E    /* shift out: written when G1 becomes active */
     #define SI                      0x0F    /* shift in: written when G0 becomes active again */
     #define ESC                     0x1B    /* introduces an escape sequence */

     /* Upper bound on how many bytes are scanned for the end of an
      * escape sequence. */
     #define MAX_ESCSEQLEN           16
     41 
     /* A charset "mark" is the final byte of its designating escape
      * sequence; double-byte sets additionally carry CHARSET_DBCS so that
      * they never collide with a single-byte set using the same letter.
      * ESCMARK() removes that bit again before the byte is written out. */
     #define CHARSET_DBCS            0x80
     #define ESCMARK(mark)           ((mark) & 0x7f)

     /* single-byte character sets */
     #define CHARSET_ISO8859_1       'A'
     #define CHARSET_ASCII           'B'
     #define CHARSET_ISO8859_7       'F'
     #define CHARSET_JISX0201_K      'I'
     #define CHARSET_JISX0201_R      'J'

     /* double-byte character sets */
     #define CHARSET_JISX0208_O      ('@'|CHARSET_DBCS)
     #define CHARSET_GB2312          ('A'|CHARSET_DBCS)
     #define CHARSET_JISX0208        ('B'|CHARSET_DBCS)
     #define CHARSET_KSX1001         ('C'|CHARSET_DBCS)
     #define CHARSET_JISX0212        ('D'|CHARSET_DBCS)
     #define CHARSET_GB2312_8565     ('E'|CHARSET_DBCS)
     #define CHARSET_CNS11643_1      ('G'|CHARSET_DBCS)
     #define CHARSET_CNS11643_2      ('H'|CHARSET_DBCS)
     #define CHARSET_JISX0213_2000_1 ('O'|CHARSET_DBCS)
     #define CHARSET_JISX0213_2      ('P'|CHARSET_DBCS)
     #define CHARSET_JISX0213_2004_1 ('Q'|CHARSET_DBCS)
     62 
     /* True when `c` can terminate an escape sequence: '@' or an
      * upper-case ASCII letter. */
     #define IS_ESCEND(c)    ((c) == '@' || ((c) >= 'A' && (c) <= 'Z'))

     /* True when `c2`, the byte right after ESC, starts one of the escape
      * sequences this codec understands.  This is not a complete list of
      * ISO-2022 escape sequence headers, but it is enough to implement the
      * CJK instances of ISO-2022. */
     #define IS_ISO2022ESC(c2) \
             ((c2) == '$' || (c2) == '(' || (c2) == ')' || \
              (c2) == '.' || (c2) == '&')
     69 
     /* Sentinel results of the per-charset encoder/decoder callbacks. */
     #define MAP_MULTIPLE_AVAIL      0xFFFE /* for JIS X 0213 */
     #define MAP_UNMAPPABLE          0xFFFF

     /* Bits kept in state->c[4]. */
     #define F_SHIFTED               0x01   /* locked shift to G1 is active */
     #define F_ESCTHROUGHOUT         0x02   /* passing a foreign escape through */
     75 
     /* Access to the designation slots state->c[0..3].  All macros expect a
      * variable named `state` in scope.  The setters expand to complete
      * statements (the trailing ';' is part of the expansion), so call
      * sites are written without a semicolon. */
     #define STATE_GETG(dn)          ((state)->c[dn])
     #define STATE_SETG(dn, v)       ((state)->c[dn]) = (v);

     #define STATE_G0                STATE_GETG(0)
     #define STATE_G1                STATE_GETG(1)
     #define STATE_G2                STATE_GETG(2)
     #define STATE_G3                STATE_GETG(3)

     #define STATE_SETG0(v)          STATE_SETG(0, v)
     #define STATE_SETG1(v)          STATE_SETG(1, v)
     #define STATE_SETG2(v)          STATE_SETG(2, v)
     #define STATE_SETG3(v)          STATE_SETG(3, v)

     /* Access to the flag byte state->c[4] (F_* bits).  As above, the
      * modifying macros carry their own ';'. */
     #define STATE_GETFLAG(f)        ((state)->c[4] & (f))
     #define STATE_SETFLAG(f)        ((state)->c[4]) |= (f);
     #define STATE_CLEARFLAG(f)      ((state)->c[4]) &= ~(f);
     #define STATE_CLEARFLAGS()      ((state)->c[4]) = 0;
     92 
     /* Bits of iso2022_config.flags */
     #define NO_SHIFT                0x01
     #define USE_G2                  0x02
     #define USE_JISX0208_EXT        0x04

     /* View of the untyped `config` pointer (which must be in scope at the
      * point of use) as the per-encoding ISO-2022 configuration. */
     #define ISO2022_CONFIG          ((const struct iso2022_config *)config)
     #define CONFIG_DESIGNATIONS     (ISO2022_CONFIG->designations)
     #define CONFIG_ISSET(flag)      (ISO2022_CONFIG->flags & (flag))
    101 
    102 /*-*- internal data structures -*-*/
    103 
    104 typedef int (*iso2022_init_func)(void);
    105 typedef ucs4_t (*iso2022_decode_func)(const unsigned char *data);
    106 typedef DBCHAR (*iso2022_encode_func)(const ucs4_t *data, Py_ssize_t *length);
    107 
    108 struct iso2022_designation {
    109     unsigned char mark;
    110     unsigned char plane;
    111     unsigned char width;
    112     iso2022_init_func initializer;
    113     iso2022_decode_func decoder;
    114     iso2022_encode_func encoder;
    115 };
    116 
    /* Per-encoding configuration handed to the codec as `config`. */
    struct iso2022_config {
        /* combination of NO_SHIFT, USE_G2 and USE_JISX0208_EXT */
        int flags;
        /* non-ascii desigs, terminated by an entry with a zero mark */
        const struct iso2022_designation *designations;
    };
    121 
    122 /*-*- iso-2022 codec implementation -*-*/
    123 
    124 CODEC_INIT(iso2022)
    125 {
    126     const struct iso2022_designation *desig = CONFIG_DESIGNATIONS;
    127     for (desig = CONFIG_DESIGNATIONS; desig->mark; desig++)
    128         if (desig->initializer != NULL && desig->initializer() != 0)
    129             return -1;
    130     return 0;
    131 }
    132 
    133 ENCODER_INIT(iso2022)
    134 {
    135     STATE_CLEARFLAGS()
    136     STATE_SETG0(CHARSET_ASCII)
    137     STATE_SETG1(CHARSET_ASCII)
    138     return 0;
    139 }
    140 
    141 ENCODER_RESET(iso2022)
    142 {
    143     if (STATE_GETFLAG(F_SHIFTED)) {
    144         WRITE1(SI)
    145         NEXT_OUT(1)
    146         STATE_CLEARFLAG(F_SHIFTED)
    147     }
    148     if (STATE_G0 != CHARSET_ASCII) {
    149         WRITE3(ESC, '(', 'B')
    150         NEXT_OUT(3)
    151         STATE_SETG0(CHARSET_ASCII)
    152     }
    153     return 0;
    154 }
    155 
    156 ENCODER(iso2022)
    157 {
    158     while (inleft > 0) {
    159         const struct iso2022_designation *dsg;
    160         DBCHAR encoded;
    161         ucs4_t c = **inbuf;
    162         Py_ssize_t insize;
    163 
    164         if (c < 0x80) {
    165             if (STATE_G0 != CHARSET_ASCII) {
    166                 WRITE3(ESC, '(', 'B')
    167                 STATE_SETG0(CHARSET_ASCII)
    168                 NEXT_OUT(3)
    169             }
    170             if (STATE_GETFLAG(F_SHIFTED)) {
    171                 WRITE1(SI)
    172                 STATE_CLEARFLAG(F_SHIFTED)
    173                 NEXT_OUT(1)
    174             }
    175             WRITE1((unsigned char)c)
    176             NEXT(1, 1)
    177             continue;
    178         }
    179 
    180         DECODE_SURROGATE(c)
    181         insize = GET_INSIZE(c);
    182 
    183         encoded = MAP_UNMAPPABLE;
    184         for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
    185             Py_ssize_t length = 1;
    186             encoded = dsg->encoder(&c, &length);
    187             if (encoded == MAP_MULTIPLE_AVAIL) {
    188                 /* this implementation won't work for pair
    189                  * of non-bmp characters. */
    190                 if (inleft < 2) {
    191                     if (!(flags & MBENC_FLUSH))
    192                         return MBERR_TOOFEW;
    193                     length = -1;
    194                 }
    195                 else
    196                     length = 2;
    197 #if Py_UNICODE_SIZE == 2
    198                 if (length == 2) {
    199                     ucs4_t u4in[2];
    200                     u4in[0] = (ucs4_t)IN1;
    201                     u4in[1] = (ucs4_t)IN2;
    202                     encoded = dsg->encoder(u4in, &length);
    203                 } else
    204                     encoded = dsg->encoder(&c, &length);
    205 #else
    206                 encoded = dsg->encoder(&c, &length);
    207 #endif
    208                 if (encoded != MAP_UNMAPPABLE) {
    209                     insize = length;
    210                     break;
    211                 }
    212             }
    213             else if (encoded != MAP_UNMAPPABLE)
    214                 break;
    215         }
    216 
    217         if (!dsg->mark)
    218             return 1;
    219         assert(dsg->width == 1 || dsg->width == 2);
    220 
    221         switch (dsg->plane) {
    222         case 0: /* G0 */
    223             if (STATE_GETFLAG(F_SHIFTED)) {
    224                 WRITE1(SI)
    225                 STATE_CLEARFLAG(F_SHIFTED)
    226                 NEXT_OUT(1)
    227             }
    228             if (STATE_G0 != dsg->mark) {
    229                 if (dsg->width == 1) {
    230                     WRITE3(ESC, '(', ESCMARK(dsg->mark))
    231                     STATE_SETG0(dsg->mark)
    232                     NEXT_OUT(3)
    233                 }
    234                 else if (dsg->mark == CHARSET_JISX0208) {
    235                     WRITE3(ESC, '$', ESCMARK(dsg->mark))
    236                     STATE_SETG0(dsg->mark)
    237                     NEXT_OUT(3)
    238                 }
    239                 else {
    240                     WRITE4(ESC, '$', '(',
    241                         ESCMARK(dsg->mark))
    242                     STATE_SETG0(dsg->mark)
    243                     NEXT_OUT(4)
    244                 }
    245             }
    246             break;
    247         case 1: /* G1 */
    248             if (STATE_G1 != dsg->mark) {
    249                 if (dsg->width == 1) {
    250                     WRITE3(ESC, ')', ESCMARK(dsg->mark))
    251                     STATE_SETG1(dsg->mark)
    252                     NEXT_OUT(3)
    253                 }
    254                 else {
    255                     WRITE4(ESC, '$', ')',
    256                         ESCMARK(dsg->mark))
    257                     STATE_SETG1(dsg->mark)
    258                     NEXT_OUT(4)
    259                 }
    260             }
    261             if (!STATE_GETFLAG(F_SHIFTED)) {
    262                 WRITE1(SO)
    263                 STATE_SETFLAG(F_SHIFTED)
    264                 NEXT_OUT(1)
    265             }
    266             break;
    267         default: /* G2 and G3 is not supported: no encoding in
    268                   * CJKCodecs are using them yet */
    269             return MBERR_INTERNAL;
    270         }
    271 
    272         if (dsg->width == 1) {
    273             WRITE1((unsigned char)encoded)
    274             NEXT_OUT(1)
    275         }
    276         else {
    277             WRITE2(encoded >> 8, encoded & 0xff)
    278             NEXT_OUT(2)
    279         }
    280         NEXT_IN(insize)
    281     }
    282 
    283     return 0;
    284 }
    285 
    286 DECODER_INIT(iso2022)
    287 {
    288     STATE_CLEARFLAGS()
    289     STATE_SETG0(CHARSET_ASCII)
    290     STATE_SETG1(CHARSET_ASCII)
    291     STATE_SETG2(CHARSET_ASCII)
    292     return 0;
    293 }
    294 
    295 DECODER_RESET(iso2022)
    296 {
    297     STATE_SETG0(CHARSET_ASCII)
    298     STATE_CLEARFLAG(F_SHIFTED)
    299     return 0;
    300 }
    301 
    302 static Py_ssize_t
    303 iso2022processesc(const void *config, MultibyteCodec_State *state,
    304                   const unsigned char **inbuf, Py_ssize_t *inleft)
    305 {
    306     unsigned char charset, designation;
    307     Py_ssize_t i, esclen;
    308 
    309     for (i = 1;i < MAX_ESCSEQLEN;i++) {
    310         if (i >= *inleft)
    311             return MBERR_TOOFEW;
    312         if (IS_ESCEND((*inbuf)[i])) {
    313             esclen = i + 1;
    314             break;
    315         }
    316         else if (CONFIG_ISSET(USE_JISX0208_EXT) && i+1 < *inleft &&
    317                  (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@')
    318             i += 2;
    319     }
    320 
    321     if (i >= MAX_ESCSEQLEN)
    322         return 1; /* unterminated escape sequence */
    323 
    324     switch (esclen) {
    325     case 3:
    326         if (IN2 == '$') {
    327             charset = IN3 | CHARSET_DBCS;
    328             designation = 0;
    329         }
    330         else {
    331             charset = IN3;
    332             if (IN2 == '(') designation = 0;
    333             else if (IN2 == ')') designation = 1;
    334             else if (CONFIG_ISSET(USE_G2) && IN2 == '.')
    335                 designation = 2;
    336             else return 3;
    337         }
    338         break;
    339     case 4:
    340         if (IN2 != '$')
    341             return 4;
    342 
    343         charset = IN4 | CHARSET_DBCS;
    344         if (IN3 == '(') designation = 0;
    345         else if (IN3 == ')') designation = 1;
    346         else return 4;
    347         break;
    348     case 6: /* designation with prefix */
    349         if (CONFIG_ISSET(USE_JISX0208_EXT) &&
    350             (*inbuf)[3] == ESC && (*inbuf)[4] == '$' &&
    351             (*inbuf)[5] == 'B') {
    352             charset = 'B' | CHARSET_DBCS;
    353             designation = 0;
    354         }
    355         else
    356             return 6;
    357         break;
    358     default:
    359         return esclen;
    360     }
    361 
    362     /* raise error when the charset is not designated for this encoding */
    363     if (charset != CHARSET_ASCII) {
    364         const struct iso2022_designation *dsg;
    365 
    366         for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++)
    367             if (dsg->mark == charset)
    368                 break;
    369         if (!dsg->mark)
    370             return esclen;
    371     }
    372 
    373     STATE_SETG(designation, charset)
    374     *inleft -= esclen;
    375     (*inbuf) += esclen;
    376     return 0;
    377 }
    378 
    /* ISO8859_7_DECODE(c, assi): store in `assi` the Unicode code point
     * for the ISO-8859-7 byte value `c`.
     *
     * The expansion is an if / else-if chain without a final `else`, so the
     * caller is expected to append its own `else` branch for values that
     * have no mapping.
     *
     * The two bit masks select which values of a 32-wide window map the way
     * their branch says.  The probe bit is built with unsigned long
     * arithmetic: the shift count reaches 31, and shifting a signed 1L that
     * far is undefined behaviour on platforms where long is 32 bits wide. */
    #define ISO8859_7_DECODE(c, assi)                                       \
        if ((c) < 0xa0) (assi) = (c);                                       \
        else if ((c) < 0xc0 && (0x288f3bc9UL & (1UL << ((c)-0xa0))))        \
            (assi) = (c);                                                   \
        else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 ||              \
                 (0xbffffd77UL & (1UL << ((c)-0xb4)))))                     \
            (assi) = 0x02d0 + (c);                                          \
        else if ((c) == 0xa1) (assi) = 0x2018;                              \
        else if ((c) == 0xa2) (assi) = 0x2019;                              \
        else if ((c) == 0xaf) (assi) = 0x2015;
    389 
    390 static Py_ssize_t
    391 iso2022processg2(const void *config, MultibyteCodec_State *state,
    392                  const unsigned char **inbuf, Py_ssize_t *inleft,
    393                  Py_UNICODE **outbuf, Py_ssize_t *outleft)
    394 {
    395     /* not written to use encoder, decoder functions because only few
    396      * encodings use G2 designations in CJKCodecs */
    397     if (STATE_G2 == CHARSET_ISO8859_1) {
    398         if (IN3 < 0x80)
    399             OUT1(IN3 + 0x80)
    400         else
    401             return 3;
    402     }
    403     else if (STATE_G2 == CHARSET_ISO8859_7) {
    404         ISO8859_7_DECODE(IN3 ^ 0x80, **outbuf)
    405         else return 3;
    406     }
    407     else if (STATE_G2 == CHARSET_ASCII) {
    408         if (IN3 & 0x80) return 3;
    409         else **outbuf = IN3;
    410     }
    411     else
    412         return MBERR_INTERNAL;
    413 
    414     (*inbuf) += 3;
    415     *inleft -= 3;
    416     (*outbuf) += 1;
    417     *outleft -= 1;
    418     return 0;
    419 }
    420 
    421 DECODER(iso2022)
    422 {
    423     const struct iso2022_designation *dsgcache = NULL;
    424 
    425     while (inleft > 0) {
    426         unsigned char c = IN1;
    427         Py_ssize_t err;
    428 
    429         if (STATE_GETFLAG(F_ESCTHROUGHOUT)) {
    430             /* ESC throughout mode:
    431              * for non-iso2022 escape sequences */
    432             WRITE1(c) /* assume as ISO-8859-1 */
    433             NEXT(1, 1)
    434             if (IS_ESCEND(c)) {
    435                 STATE_CLEARFLAG(F_ESCTHROUGHOUT)
    436             }
    437             continue;
    438         }
    439 
    440         switch (c) {
    441         case ESC:
    442             REQUIRE_INBUF(2)
    443             if (IS_ISO2022ESC(IN2)) {
    444                 err = iso2022processesc(config, state,
    445                                         inbuf, &inleft);
    446                 if (err != 0)
    447                     return err;
    448             }
    449             else if (CONFIG_ISSET(USE_G2) && IN2 == 'N') {/* SS2 */
    450                 REQUIRE_INBUF(3)
    451                 err = iso2022processg2(config, state,
    452                     inbuf, &inleft, outbuf, &outleft);
    453                 if (err != 0)
    454                     return err;
    455             }
    456             else {
    457                 WRITE1(ESC)
    458                 STATE_SETFLAG(F_ESCTHROUGHOUT)
    459                 NEXT(1, 1)
    460             }
    461             break;
    462         case SI:
    463             if (CONFIG_ISSET(NO_SHIFT))
    464                 goto bypass;
    465             STATE_CLEARFLAG(F_SHIFTED)
    466             NEXT_IN(1)
    467             break;
    468         case SO:
    469             if (CONFIG_ISSET(NO_SHIFT))
    470                 goto bypass;
    471             STATE_SETFLAG(F_SHIFTED)
    472             NEXT_IN(1)
    473             break;
    474         case LF:
    475             STATE_CLEARFLAG(F_SHIFTED)
    476             WRITE1(LF)
    477             NEXT(1, 1)
    478             break;
    479         default:
    480             if (c < 0x20) /* C0 */
    481                 goto bypass;
    482             else if (c >= 0x80)
    483                 return 1;
    484             else {
    485                 const struct iso2022_designation *dsg;
    486                 unsigned char charset;
    487                 ucs4_t decoded;
    488 
    489                 if (STATE_GETFLAG(F_SHIFTED))
    490                     charset = STATE_G1;
    491                 else
    492                     charset = STATE_G0;
    493 
    494                 if (charset == CHARSET_ASCII) {
    495 bypass:                                 WRITE1(c)
    496                                         NEXT(1, 1)
    497                                         break;
    498                                 }
    499 
    500                                 if (dsgcache != NULL &&
    501                                     dsgcache->mark == charset)
    502                                         dsg = dsgcache;
    503                                 else {
    504                                         for (dsg = CONFIG_DESIGNATIONS;
    505                                              dsg->mark != charset
    506 #ifdef Py_DEBUG
    507                                                 && dsg->mark != '\0'
    508 #endif
    509                                              ;dsg++)
    510                                                 /* noop */;
    511                                         assert(dsg->mark != '\0');
    512                                         dsgcache = dsg;
    513                                 }
    514 
    515                                 REQUIRE_INBUF(dsg->width)
    516                                 decoded = dsg->decoder(*inbuf);
    517                                 if (decoded == MAP_UNMAPPABLE)
    518                                         return dsg->width;
    519 
    520                                 if (decoded < 0x10000) {
    521                                         WRITE1(decoded)
    522                                         NEXT_OUT(1)
    523                                 }
    524                                 else if (decoded < 0x30000) {
    525                                         WRITEUCS4(decoded)
    526                                 }
    527                                 else { /* JIS X 0213 pairs */
    528                     WRITE2(decoded >> 16, decoded & 0xffff)
    529                     NEXT_OUT(2)
    530                 }
    531                 NEXT_IN(dsg->width)
    532             }
    533             break;
    534         }
    535     }
    536     return 0;
    537 }
    538 
    539 /*-*- mapping table holders -*-*/
    540 
    541 #define ENCMAP(enc) static const encode_map *enc##_encmap = NULL;
    542 #define DECMAP(enc) static const decode_map *enc##_decmap = NULL;
    543 
    544 /* kr */
    545 ENCMAP(cp949)
    546 DECMAP(ksx1001)
    547 
    548 /* jp */
    549 ENCMAP(jisxcommon)
    550 DECMAP(jisx0208)
    551 DECMAP(jisx0212)
    552 ENCMAP(jisx0213_bmp)
    553 DECMAP(jisx0213_1_bmp)
    554 DECMAP(jisx0213_2_bmp)
    555 ENCMAP(jisx0213_emp)
    556 DECMAP(jisx0213_1_emp)
    557 DECMAP(jisx0213_2_emp)
    558 
    559 /* cn */
    560 ENCMAP(gbcommon)
    561 DECMAP(gb2312)
    562 
    563 /* tw */
    564 
    565 /*-*- mapping access functions -*-*/
    566 
    567 static int
    568 ksx1001_init(void)
    569 {
    570     static int initialized = 0;
    571 
    572     if (!initialized && (
    573                     IMPORT_MAP(kr, cp949, &cp949_encmap, NULL) ||
    574                     IMPORT_MAP(kr, ksx1001, NULL, &ksx1001_decmap)))
    575         return -1;
    576     initialized = 1;
    577     return 0;
    578 }
    579 
    580 static ucs4_t
    581 ksx1001_decoder(const unsigned char *data)
    582 {
    583     ucs4_t u;
    584     TRYMAP_DEC(ksx1001, u, data[0], data[1])
    585         return u;
    586     else
    587         return MAP_UNMAPPABLE;
    588 }
    589 
    590 static DBCHAR
    591 ksx1001_encoder(const ucs4_t *data, Py_ssize_t *length)
    592 {
    593     DBCHAR coded;
    594     assert(*length == 1);
    595     if (*data < 0x10000) {
    596         TRYMAP_ENC(cp949, coded, *data)
    597             if (!(coded & 0x8000))
    598                 return coded;
    599     }
    600     return MAP_UNMAPPABLE;
    601 }
    602 
    603 static int
    604 jisx0208_init(void)
    605 {
    606     static int initialized = 0;
    607 
    608     if (!initialized && (
    609                     IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
    610                     IMPORT_MAP(jp, jisx0208, NULL, &jisx0208_decmap)))
    611         return -1;
    612     initialized = 1;
    613     return 0;
    614 }
    615 
    616 static ucs4_t
    617 jisx0208_decoder(const unsigned char *data)
    618 {
    619     ucs4_t u;
    620     if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
    621         return 0xff3c;
    622     else TRYMAP_DEC(jisx0208, u, data[0], data[1])
    623         return u;
    624     else
    625         return MAP_UNMAPPABLE;
    626 }
    627 
    628 static DBCHAR
    629 jisx0208_encoder(const ucs4_t *data, Py_ssize_t *length)
    630 {
    631     DBCHAR coded;
    632     assert(*length == 1);
    633     if (*data < 0x10000) {
    634         if (*data == 0xff3c) /* F/W REVERSE SOLIDUS */
    635             return 0x2140;
    636         else TRYMAP_ENC(jisxcommon, coded, *data) {
    637             if (!(coded & 0x8000))
    638                 return coded;
    639         }
    640     }
    641     return MAP_UNMAPPABLE;
    642 }
    643 
    644 static int
    645 jisx0212_init(void)
    646 {
    647     static int initialized = 0;
    648 
    649     if (!initialized && (
    650                     IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
    651                     IMPORT_MAP(jp, jisx0212, NULL, &jisx0212_decmap)))
    652         return -1;
    653     initialized = 1;
    654     return 0;
    655 }
    656 
    657 static ucs4_t
    658 jisx0212_decoder(const unsigned char *data)
    659 {
    660     ucs4_t u;
    661     TRYMAP_DEC(jisx0212, u, data[0], data[1])
    662         return u;
    663     else
    664         return MAP_UNMAPPABLE;
    665 }
    666 
    667 static DBCHAR
    668 jisx0212_encoder(const ucs4_t *data, Py_ssize_t *length)
    669 {
    670     DBCHAR coded;
    671     assert(*length == 1);
    672     if (*data < 0x10000) {
    673         TRYMAP_ENC(jisxcommon, coded, *data) {
    674             if (coded & 0x8000)
    675                 return coded & 0x7fff;
    676         }
    677     }
    678     return MAP_UNMAPPABLE;
    679 }
    680 
    681 static int
    682 jisx0213_init(void)
    683 {
    684     static int initialized = 0;
    685 
    686     if (!initialized && (
    687                     jisx0208_init() ||
    688                     IMPORT_MAP(jp, jisx0213_bmp,
    689                                &jisx0213_bmp_encmap, NULL) ||
    690                     IMPORT_MAP(jp, jisx0213_1_bmp,
    691                                NULL, &jisx0213_1_bmp_decmap) ||
    692                     IMPORT_MAP(jp, jisx0213_2_bmp,
    693                                NULL, &jisx0213_2_bmp_decmap) ||
    694                     IMPORT_MAP(jp, jisx0213_emp,
    695                                &jisx0213_emp_encmap, NULL) ||
    696                     IMPORT_MAP(jp, jisx0213_1_emp,
    697                                NULL, &jisx0213_1_emp_decmap) ||
    698                     IMPORT_MAP(jp, jisx0213_2_emp,
    699                                NULL, &jisx0213_2_emp_decmap) ||
    700                     IMPORT_MAP(jp, jisx0213_pair, &jisx0213_pair_encmap,
    701                                &jisx0213_pair_decmap)))
    702         return -1;
    703     initialized = 1;
    704     return 0;
    705 }
    706 
    707 #define config ((void *)2000)
    708 static ucs4_t
    709 jisx0213_2000_1_decoder(const unsigned char *data)
    710 {
    711     ucs4_t u;
    712     EMULATE_JISX0213_2000_DECODE_PLANE1(u, data[0], data[1])
    713     else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
    714         return 0xff3c;
    715     else TRYMAP_DEC(jisx0208, u, data[0], data[1]);
    716     else TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]);
    717     else TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1])
    718         u |= 0x20000;
    719     else TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]);
    720     else
    721         return MAP_UNMAPPABLE;
    722     return u;
    723 }
    724 
    725 static ucs4_t
    726 jisx0213_2000_2_decoder(const unsigned char *data)
    727 {
    728     ucs4_t u;
    729     EMULATE_JISX0213_2000_DECODE_PLANE2(u, data[0], data[1])
    730     TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]);
    731     else TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1])
    732         u |= 0x20000;
    733     else
    734         return MAP_UNMAPPABLE;
    735     return u;
    736 }
    737 #undef config
    738 
    739 static ucs4_t
    740 jisx0213_2004_1_decoder(const unsigned char *data)
    741 {
    742     ucs4_t u;
    743     if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
    744         return 0xff3c;
    745     else TRYMAP_DEC(jisx0208, u, data[0], data[1]);
    746     else TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]);
    747     else TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1])
    748         u |= 0x20000;
    749     else TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]);
    750     else
    751         return MAP_UNMAPPABLE;
    752     return u;
    753 }
    754 
    755 static ucs4_t
    756 jisx0213_2004_2_decoder(const unsigned char *data)
    757 {
    758     ucs4_t u;
    759     TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]);
    760     else TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1])
    761         u |= 0x20000;
    762     else
    763         return MAP_UNMAPPABLE;
    764     return u;
    765 }
    766 
    767 static DBCHAR
    768 jisx0213_encoder(const ucs4_t *data, Py_ssize_t *length, void *config)
    769 {
    770     DBCHAR coded;
    771 
    772     switch (*length) {
    773     case 1: /* first character */
    774         if (*data >= 0x10000) {
    775             if ((*data) >> 16 == 0x20000 >> 16) {
    776                 EMULATE_JISX0213_2000_ENCODE_EMP(coded, *data)
    777                 else TRYMAP_ENC(jisx0213_emp, coded,
    778                                 (*data) & 0xffff)
    779                     return coded;
    780             }
    781             return MAP_UNMAPPABLE;
    782         }
    783 
    784         EMULATE_JISX0213_2000_ENCODE_BMP(coded, *data)
    785         else TRYMAP_ENC(jisx0213_bmp, coded, *data) {
    786             if (coded == MULTIC)
    787                 return MAP_MULTIPLE_AVAIL;
    788         }
    789         else TRYMAP_ENC(jisxcommon, coded, *data) {
    790             if (coded & 0x8000)
    791                 return MAP_UNMAPPABLE;
    792         }
    793         else
    794             return MAP_UNMAPPABLE;
    795         return coded;
    796     case 2: /* second character of unicode pair */
    797         coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
    798                         jisx0213_pair_encmap, JISX0213_ENCPAIRS);
    799         if (coded == DBCINV) {
    800             *length = 1;
    801             coded = find_pairencmap((ucs2_t)data[0], 0,
    802                       jisx0213_pair_encmap, JISX0213_ENCPAIRS);
    803             if (coded == DBCINV)
    804                 return MAP_UNMAPPABLE;
    805         }
    806         else
    807             return coded;
    808     case -1: /* flush unterminated */
    809         *length = 1;
    810         coded = find_pairencmap((ucs2_t)data[0], 0,
    811                         jisx0213_pair_encmap, JISX0213_ENCPAIRS);
    812         if (coded == DBCINV)
    813             return MAP_UNMAPPABLE;
    814         else
    815             return coded;
    816     default:
    817         return MAP_UNMAPPABLE;
    818     }
    819 }
    820 
    821 static DBCHAR
    822 jisx0213_2000_1_encoder(const ucs4_t *data, Py_ssize_t *length)
    823 {
    824     DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
    825     if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
    826         return coded;
    827     else if (coded & 0x8000)
    828         return MAP_UNMAPPABLE;
    829     else
    830         return coded;
    831 }
    832 
    833 static DBCHAR
    834 jisx0213_2000_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length)
    835 {
    836     DBCHAR coded;
    837     Py_ssize_t ilength = *length;
    838 
    839     coded = jisx0213_encoder(data, length, (void *)2000);
    840     switch (ilength) {
    841     case 1:
    842         if (coded == MAP_MULTIPLE_AVAIL)
    843             return MAP_MULTIPLE_AVAIL;
    844         else
    845             return MAP_UNMAPPABLE;
    846     case 2:
    847         if (*length != 2)
    848             return MAP_UNMAPPABLE;
    849         else
    850             return coded;
    851     default:
    852         return MAP_UNMAPPABLE;
    853     }
    854 }
    855 
    856 static DBCHAR
    857 jisx0213_2000_2_encoder(const ucs4_t *data, Py_ssize_t *length)
    858 {
    859     DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
    860     if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
    861         return coded;
    862     else if (coded & 0x8000)
    863         return coded & 0x7fff;
    864     else
    865         return MAP_UNMAPPABLE;
    866 }
    867 
    868 static DBCHAR
    869 jisx0213_2004_1_encoder(const ucs4_t *data, Py_ssize_t *length)
    870 {
    871     DBCHAR coded = jisx0213_encoder(data, length, NULL);
    872     if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
    873         return coded;
    874     else if (coded & 0x8000)
    875         return MAP_UNMAPPABLE;
    876     else
    877         return coded;
    878 }
    879 
    880 static DBCHAR
    881 jisx0213_2004_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length)
    882 {
    883     DBCHAR coded;
    884     Py_ssize_t ilength = *length;
    885 
    886     coded = jisx0213_encoder(data, length, NULL);
    887     switch (ilength) {
    888     case 1:
    889         if (coded == MAP_MULTIPLE_AVAIL)
    890             return MAP_MULTIPLE_AVAIL;
    891         else
    892             return MAP_UNMAPPABLE;
    893     case 2:
    894         if (*length != 2)
    895             return MAP_UNMAPPABLE;
    896         else
    897             return coded;
    898     default:
    899         return MAP_UNMAPPABLE;
    900     }
    901 }
    902 
    903 static DBCHAR
    904 jisx0213_2004_2_encoder(const ucs4_t *data, Py_ssize_t *length)
    905 {
    906     DBCHAR coded = jisx0213_encoder(data, length, NULL);
    907     if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
    908         return coded;
    909     else if (coded & 0x8000)
    910         return coded & 0x7fff;
    911     else
    912         return MAP_UNMAPPABLE;
    913 }
    914 
    915 static ucs4_t
    916 jisx0201_r_decoder(const unsigned char *data)
    917 {
    918     ucs4_t u;
    919     JISX0201_R_DECODE(*data, u)
    920     else return MAP_UNMAPPABLE;
    921     return u;
    922 }
    923 
    924 static DBCHAR
    925 jisx0201_r_encoder(const ucs4_t *data, Py_ssize_t *length)
    926 {
    927     DBCHAR coded;
    928     JISX0201_R_ENCODE(*data, coded)
    929     else return MAP_UNMAPPABLE;
    930     return coded;
    931 }
    932 
    933 static ucs4_t
    934 jisx0201_k_decoder(const unsigned char *data)
    935 {
    936     ucs4_t u;
    937     JISX0201_K_DECODE(*data ^ 0x80, u)
    938     else return MAP_UNMAPPABLE;
    939     return u;
    940 }
    941 
    942 static DBCHAR
    943 jisx0201_k_encoder(const ucs4_t *data, Py_ssize_t *length)
    944 {
    945     DBCHAR coded;
    946     JISX0201_K_ENCODE(*data, coded)
    947     else return MAP_UNMAPPABLE;
    948     return coded - 0x80;
    949 }
    950 
    951 static int
    952 gb2312_init(void)
    953 {
    954     static int initialized = 0;
    955 
    956     if (!initialized && (
    957                     IMPORT_MAP(cn, gbcommon, &gbcommon_encmap, NULL) ||
    958                     IMPORT_MAP(cn, gb2312, NULL, &gb2312_decmap)))
    959         return -1;
    960     initialized = 1;
    961     return 0;
    962 }
    963 
    964 static ucs4_t
    965 gb2312_decoder(const unsigned char *data)
    966 {
    967     ucs4_t u;
    968     TRYMAP_DEC(gb2312, u, data[0], data[1])
    969         return u;
    970     else
    971         return MAP_UNMAPPABLE;
    972 }
    973 
    974 static DBCHAR
    975 gb2312_encoder(const ucs4_t *data, Py_ssize_t *length)
    976 {
    977     DBCHAR coded;
    978     assert(*length == 1);
    979     if (*data < 0x10000) {
    980         TRYMAP_ENC(gbcommon, coded, *data) {
    981             if (!(coded & 0x8000))
    982                 return coded;
    983         }
    984     }
    985     return MAP_UNMAPPABLE;
    986 }
    987 
    988 
    989 static ucs4_t
    990 dummy_decoder(const unsigned char *data)
    991 {
    992     return MAP_UNMAPPABLE;
    993 }
    994 
    995 static DBCHAR
    996 dummy_encoder(const ucs4_t *data, Py_ssize_t *length)
    997 {
    998     return MAP_UNMAPPABLE;
    999 }
   1000 
   /*-*- registry tables -*-*/

   /*
    * Each REGISTRY_* macro expands to one brace initializer used as an
    * element of a `struct iso2022_designation` array.  The six positions are
    * always written in the same order:
    *
    *     { charset mark, slot, width, init hook, decoder, encoder }
    *
    * NOTE(review): the struct is declared outside this section.  "slot" looks
    * like the G0-G3 index (the KSX1001 _G0/_G1 variants differ only in that
    * value) and "width" like the bytes per character (1 for the JIS X 0201
    * and ISO 8859 sets, 2 elsewhere) -- confirm against the struct.
    */

   /* KS X 1001, once per slot it can be designated to. */
   #define REGISTRY_KSX1001_G0 \
       { CHARSET_KSX1001, 0, 2, ksx1001_init, \
         ksx1001_decoder, ksx1001_encoder }
   #define REGISTRY_KSX1001_G1 \
       { CHARSET_KSX1001, 1, 2, ksx1001_init, \
         ksx1001_decoder, ksx1001_encoder }

   /* JIS X 0201 Roman and Katakana: no init hook. */
   #define REGISTRY_JISX0201_R \
       { CHARSET_JISX0201_R, 0, 1, NULL, \
         jisx0201_r_decoder, jisx0201_r_encoder }
   #define REGISTRY_JISX0201_K \
       { CHARSET_JISX0201_K, 0, 1, NULL, \
         jisx0201_k_decoder, jisx0201_k_encoder }

   /* JIS X 0208 under its two charset marks; both share the same codec. */
   #define REGISTRY_JISX0208 \
       { CHARSET_JISX0208, 0, 2, jisx0208_init, \
         jisx0208_decoder, jisx0208_encoder }
   #define REGISTRY_JISX0208_O \
       { CHARSET_JISX0208_O, 0, 2, jisx0208_init, \
         jisx0208_decoder, jisx0208_encoder }

   /* JIS X 0212. */
   #define REGISTRY_JISX0212 \
       { CHARSET_JISX0212, 0, 2, jisx0212_init, \
         jisx0212_decoder, jisx0212_encoder }

   /* JIS X 0213:2000.  The _PAIRONLY entry differs from the plain plane-1
    * entry only in its encoder. */
   #define REGISTRY_JISX0213_2000_1 \
       { CHARSET_JISX0213_2000_1, 0, 2, jisx0213_init, \
         jisx0213_2000_1_decoder, jisx0213_2000_1_encoder }
   #define REGISTRY_JISX0213_2000_1_PAIRONLY \
       { CHARSET_JISX0213_2000_1, 0, 2, jisx0213_init, \
         jisx0213_2000_1_decoder, jisx0213_2000_1_encoder_paironly }
   #define REGISTRY_JISX0213_2000_2 \
       { CHARSET_JISX0213_2, 0, 2, jisx0213_init, \
         jisx0213_2000_2_decoder, jisx0213_2000_2_encoder }

   /* JIS X 0213:2004.  Same layout as the 2000 group; the plane-2 entry
    * reuses the CHARSET_JISX0213_2 mark. */
   #define REGISTRY_JISX0213_2004_1 \
       { CHARSET_JISX0213_2004_1, 0, 2, jisx0213_init, \
         jisx0213_2004_1_decoder, jisx0213_2004_1_encoder }
   #define REGISTRY_JISX0213_2004_1_PAIRONLY \
       { CHARSET_JISX0213_2004_1, 0, 2, jisx0213_init, \
         jisx0213_2004_1_decoder, jisx0213_2004_1_encoder_paironly }
   #define REGISTRY_JISX0213_2004_2 \
       { CHARSET_JISX0213_2, 0, 2, jisx0213_init, \
         jisx0213_2004_2_decoder, jisx0213_2004_2_encoder }

   /* GB 2312. */
   #define REGISTRY_GB2312 \
       { CHARSET_GB2312, 0, 2, gb2312_init, \
         gb2312_decoder, gb2312_encoder }

   /* CNS 11643 planes 1 and 2, registered for slots 1 and 2. */
   #define REGISTRY_CNS11643_1 \
       { CHARSET_CNS11643_1, 1, 2, cns11643_init, \
         cns11643_1_decoder, cns11643_1_encoder }
   #define REGISTRY_CNS11643_2 \
       { CHARSET_CNS11643_2, 2, 2, cns11643_init, \
         cns11643_2_decoder, cns11643_2_encoder }

   /* ISO 8859-1 / 8859-7 in slot 2, wired to the dummy codec functions,
    * which always report MAP_UNMAPPABLE. */
   #define REGISTRY_ISO8859_1 \
       { CHARSET_ISO8859_1, 2, 1, NULL, dummy_decoder, dummy_encoder }
   #define REGISTRY_ISO8859_7 \
       { CHARSET_ISO8859_7, 2, 1, NULL, dummy_decoder, dummy_encoder }

   /* Zero-filled entry used as the last element of every designation
    * array. */
   #define REGISTRY_SENTINEL { 0, }

   /* Defines `iso2022_<var>_config`, pairing the attribute flags `attrs`
    * with the `iso2022_<var>_designations` array, which must already be
    * declared at the point of use. */
   #define CONFIGDEF(var, attrs) \
       static const struct iso2022_config iso2022_##var##_config = { \
           attrs, iso2022_##var##_designations \
       };
   1066 
   1067 static const struct iso2022_designation iso2022_kr_designations[] = {
   1068     REGISTRY_KSX1001_G1, REGISTRY_SENTINEL
   1069 };
   1070 CONFIGDEF(kr, 0)
   1071 
   1072 static const struct iso2022_designation iso2022_jp_designations[] = {
   1073     REGISTRY_JISX0208, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
   1074     REGISTRY_SENTINEL
   1075 };
   1076 CONFIGDEF(jp, NO_SHIFT | USE_JISX0208_EXT)
   1077 
   1078 static const struct iso2022_designation iso2022_jp_1_designations[] = {
   1079     REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
   1080     REGISTRY_JISX0208_O, REGISTRY_SENTINEL
   1081 };
   1082 CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT)
   1083 
   1084 static const struct iso2022_designation iso2022_jp_2_designations[] = {
   1085     REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0,
   1086     REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
   1087     REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL
   1088 };
   1089 CONFIGDEF(jp_2, NO_SHIFT | USE_G2 | USE_JISX0208_EXT)
   1090 
   1091 static const struct iso2022_designation iso2022_jp_2004_designations[] = {
   1092     REGISTRY_JISX0213_2004_1_PAIRONLY, REGISTRY_JISX0208,
   1093     REGISTRY_JISX0213_2004_1, REGISTRY_JISX0213_2004_2, REGISTRY_SENTINEL
   1094 };
   1095 CONFIGDEF(jp_2004, NO_SHIFT | USE_JISX0208_EXT)
   1096 
   1097 static const struct iso2022_designation iso2022_jp_3_designations[] = {
   1098     REGISTRY_JISX0213_2000_1_PAIRONLY, REGISTRY_JISX0208,
   1099     REGISTRY_JISX0213_2000_1, REGISTRY_JISX0213_2000_2, REGISTRY_SENTINEL
   1100 };
   1101 CONFIGDEF(jp_3, NO_SHIFT | USE_JISX0208_EXT)
   1102 
   1103 static const struct iso2022_designation iso2022_jp_ext_designations[] = {
   1104     REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
   1105     REGISTRY_JISX0201_K, REGISTRY_JISX0208_O, REGISTRY_SENTINEL
   1106 };
   1107 CONFIGDEF(jp_ext, NO_SHIFT | USE_JISX0208_EXT)
   1108 
   1109 
   1110 BEGIN_MAPPINGS_LIST
   1111   /* no mapping table here */
   1112 END_MAPPINGS_LIST
   1113 
   1114 #define ISO2022_CODEC(variation) {              \
   1115     "iso2022_" #variation,                      \
   1116     &iso2022_##variation##_config,              \
   1117     iso2022_codec_init,                         \
   1118     _STATEFUL_METHODS(iso2022)                  \
   1119 },
   1120 
   1121 BEGIN_CODECS_LIST
   1122   ISO2022_CODEC(kr)
   1123   ISO2022_CODEC(jp)
   1124   ISO2022_CODEC(jp_1)
   1125   ISO2022_CODEC(jp_2)
   1126   ISO2022_CODEC(jp_2004)
   1127   ISO2022_CODEC(jp_3)
   1128   ISO2022_CODEC(jp_ext)
   1129 END_CODECS_LIST
   1130 
   1131 I_AM_A_MODULE_FOR(iso2022)
   1132