Home | History | Annotate | Download | only in cjkcodecs
      1 /*
      2  * _codecs_cn.c: Codecs collection for Mainland Chinese encodings
      3  *
      4  * Written by Hye-Shik Chang <perky (at) FreeBSD.org>
      5  */
      6 
      7 #include "cjkcodecs.h"
      8 #include "mappings_cn.h"
      9 
/**
 * hz is predefined as 100 on AIX, so we undefine it to avoid a
 * conflict with the name of the hz codec.
 */
     14 #ifdef _AIX
     15 #undef hz
     16 #endif
     17 
/* GBK and GB2312 map a few code points differently; they are listed below:
 *
 *              gb2312                          gbk
 * A1A4         U+30FB KATAKANA MIDDLE DOT      U+00B7 MIDDLE DOT
 * A1AA         U+2015 HORIZONTAL BAR           U+2014 EM DASH
 * A844         undefined                       U+2015 HORIZONTAL BAR
 */
     25 
/*
 * GBK_DECODE(dc1, dc2, assi): decode the two-byte sequence (dc1, dc2) as
 * GBK and store the resulting code point in assi.
 *
 * The three sequences A1AA, A844 and A1A4 are special-cased first.
 * Everything else is looked up in the gb2312 table (both bytes XORed
 * with 0x80) and, failing that, in the gbkext table using the raw bytes.
 *
 * The expansion is an open if / else-if chain: callers follow the macro
 * with their own "else ..." branch to handle the no-mapping case.
 * dc1 and dc2 are evaluated several times, so pass side-effect-free
 * expressions.
 */
#define GBK_DECODE(dc1, dc2, assi) \
    if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \
    else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \
    else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \
    else TRYMAP_DEC(gb2312, assi, dc1 ^ 0x80, dc2 ^ 0x80); \
    else TRYMAP_DEC(gbkext, assi, dc1, dc2);
     32 
/*
 * GBK_ENCODE(code, assi): map the code point `code` to its GBK double-byte
 * value and store it in assi.
 *
 * U+2014, U+2015 and U+00B7 are special-cased and assigned their final
 * byte values directly.  U+30FB is explicitly excluded from the gbcommon
 * lookup, so it falls through to the caller's "else" branch.  All other
 * code points are looked up in the gbcommon table.
 *
 * Like GBK_DECODE, the expansion is an open if / else-if chain that the
 * caller completes with its own "else ..." for the unmapped case.
 * `code` is evaluated several times.
 */
#define GBK_ENCODE(code, assi) \
    if ((code) == 0x2014) (assi) = 0xa1aa; \
    else if ((code) == 0x2015) (assi) = 0xa844; \
    else if ((code) == 0x00b7) (assi) = 0xa1a4; \
    else if ((code) != 0x30fb && TRYMAP_ENC_COND(gbcommon, assi, code));
     38 
     39 /*
     40  * GB2312 codec
     41  */
     42 
     43 ENCODER(gb2312)
     44 {
     45     while (inleft > 0) {
     46         Py_UNICODE c = IN1;
     47         DBCHAR code;
     48 
     49         if (c < 0x80) {
     50             WRITE1((unsigned char)c)
     51             NEXT(1, 1)
     52             continue;
     53         }
     54         UCS4INVALID(c)
     55 
     56         REQUIRE_OUTBUF(2)
     57         TRYMAP_ENC(gbcommon, code, c);
     58         else return 1;
     59 
     60         if (code & 0x8000) /* MSB set: GBK */
     61             return 1;
     62 
     63         OUT1((code >> 8) | 0x80)
     64         OUT2((code & 0xFF) | 0x80)
     65         NEXT(1, 2)
     66     }
     67 
     68     return 0;
     69 }
     70 
     71 DECODER(gb2312)
     72 {
     73     while (inleft > 0) {
     74         unsigned char c = **inbuf;
     75 
     76         REQUIRE_OUTBUF(1)
     77 
     78         if (c < 0x80) {
     79             OUT1(c)
     80             NEXT(1, 1)
     81             continue;
     82         }
     83 
     84         REQUIRE_INBUF(2)
     85         TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
     86             NEXT(2, 1)
     87         }
     88         else return 2;
     89     }
     90 
     91     return 0;
     92 }
     93 
     94 
     95 /*
     96  * GBK codec
     97  */
     98 
     99 ENCODER(gbk)
    100 {
    101     while (inleft > 0) {
    102         Py_UNICODE c = IN1;
    103         DBCHAR code;
    104 
    105         if (c < 0x80) {
    106             WRITE1((unsigned char)c)
    107             NEXT(1, 1)
    108             continue;
    109         }
    110         UCS4INVALID(c)
    111 
    112         REQUIRE_OUTBUF(2)
    113 
    114         GBK_ENCODE(c, code)
    115         else return 1;
    116 
    117         OUT1((code >> 8) | 0x80)
    118         if (code & 0x8000)
    119             OUT2((code & 0xFF)) /* MSB set: GBK */
    120         else
    121             OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
    122         NEXT(1, 2)
    123     }
    124 
    125     return 0;
    126 }
    127 
    128 DECODER(gbk)
    129 {
    130     while (inleft > 0) {
    131         unsigned char c = IN1;
    132 
    133         REQUIRE_OUTBUF(1)
    134 
    135         if (c < 0x80) {
    136             OUT1(c)
    137             NEXT(1, 1)
    138             continue;
    139         }
    140 
    141         REQUIRE_INBUF(2)
    142 
    143         GBK_DECODE(c, IN2, **outbuf)
    144         else return 2;
    145 
    146         NEXT(2, 1)
    147     }
    148 
    149     return 0;
    150 }
    151 
    152 
    153 /*
    154  * GB18030 codec
    155  */
    156 
/*
 * Encode Unicode input as GB18030.
 *
 * Per input character the encoder tries, in order:
 *   1. c < 0x80: a single byte;
 *   2. c >= 0x10000: a four-byte sequence computed arithmetically from
 *      c - 0x10000;
 *   3. a two-byte sequence via GBK_ENCODE, then the gb18030ext table;
 *   4. a four-byte sequence computed from the gb18030_to_unibmp_ranges
 *      table.
 *
 * Returns 0 when all input is consumed.  A non-zero return reports a
 * character that could not be encoded (1, or 2 for a surrogate pair on
 * builds where Py_UNICODE_SIZE == 2).
 */
ENCODER(gb18030)
{
    while (inleft > 0) {
        ucs4_t c = IN1;
        DBCHAR code;

        /* ASCII is passed through unchanged */
        if (c < 0x80) {
            WRITE1(c)
            NEXT(1, 1)
            continue;
        }

        /* NOTE(review): presumably folds a surrogate pair into c on
         * narrow builds -- confirm against cjkcodecs.h */
        DECODE_SURROGATE(c)
        if (c > 0x10FFFF)
#if Py_UNICODE_SIZE == 2
            return 2; /* surrogates pair */
#else
            return 1;
#endif
        else if (c >= 0x10000) {
            /* Supplementary planes: the offset from U+10000 is split
             * into mixed-radix digits (10, 126, 10) from the last byte
             * backwards; the lead byte starts at 0x90. */
            ucs4_t tc = c - 0x10000;

            REQUIRE_OUTBUF(4)

            OUT4((unsigned char)(tc % 10) + 0x30)
            tc /= 10;
            OUT3((unsigned char)(tc % 126) + 0x81)
            tc /= 126;
            OUT2((unsigned char)(tc % 10) + 0x30)
            tc /= 10;
            OUT1((unsigned char)(tc + 0x90))

#if Py_UNICODE_SIZE == 2
            NEXT(2, 4) /* surrogates pair */
#else
            NEXT(1, 4)
#endif
            continue;
        }

        REQUIRE_OUTBUF(2)

        /* Two-byte candidates first: GBK, then the GB18030 extension
         * table.  The final else handles characters with no two-byte
         * form. */
        GBK_ENCODE(c, code)
        else TRYMAP_ENC(gb18030ext, code, c);
        else {
            const struct _gb18030_to_unibmp_ranges *utrrange;

            REQUIRE_OUTBUF(4)

            /* Scan the range table; an entry with first == 0 terminates
             * it. */
            for (utrrange = gb18030_to_unibmp_ranges;
                 utrrange->first != 0;
                 utrrange++)
                if (utrrange->first <= c &&
                    c <= utrrange->last) {
                    Py_UNICODE tc;

                    /* linear index of c within the four-byte area */
                    tc = c - utrrange->first +
                         utrrange->base;

                    /* same mixed-radix split as above, but the lead
                     * byte starts at 0x81 */
                    OUT4((unsigned char)(tc % 10) + 0x30)
                    tc /= 10;
                    OUT3((unsigned char)(tc % 126) + 0x81)
                    tc /= 126;
                    OUT2((unsigned char)(tc % 10) + 0x30)
                    tc /= 10;
                    OUT1((unsigned char)tc + 0x81)

                    NEXT(1, 4)
                    break;
                }

            /* reached the terminator: no range contains c */
            if (utrrange->first == 0)
                return 1;
            continue;
        }

        /* Two-byte output for a value found by GBK_ENCODE or in
         * gb18030ext */
        OUT1((code >> 8) | 0x80)
        if (code & 0x8000)
            OUT2((code & 0xFF)) /* MSB set: GBK or GB18030ext */
        else
            OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */

        NEXT(1, 2)
    }

    return 0;
}
    244 
    245 DECODER(gb18030)
    246 {
    247     while (inleft > 0) {
    248         unsigned char c = IN1, c2;
    249 
    250         REQUIRE_OUTBUF(1)
    251 
    252         if (c < 0x80) {
    253             OUT1(c)
    254             NEXT(1, 1)
    255             continue;
    256         }
    257 
    258         REQUIRE_INBUF(2)
    259 
    260         c2 = IN2;
    261         if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
    262             const struct _gb18030_to_unibmp_ranges *utr;
    263             unsigned char c3, c4;
    264             ucs4_t lseq;
    265 
    266             REQUIRE_INBUF(4)
    267             c3 = IN3;
    268             c4 = IN4;
    269             if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)
    270                 return 4;
    271             c -= 0x81;  c2 -= 0x30;
    272             c3 -= 0x81; c4 -= 0x30;
    273 
    274             if (c < 4) { /* U+0080 - U+FFFF */
    275                 lseq = ((ucs4_t)c * 10 + c2) * 1260 +
    276                     (ucs4_t)c3 * 10 + c4;
    277                 if (lseq < 39420) {
    278                     for (utr = gb18030_to_unibmp_ranges;
    279                          lseq >= (utr + 1)->base;
    280                          utr++) ;
    281                     OUT1(utr->first - utr->base + lseq)
    282                     NEXT(4, 1)
    283                     continue;
    284                 }
    285             }
    286             else if (c >= 15) { /* U+10000 - U+10FFFF */
    287                 lseq = 0x10000 + (((ucs4_t)c-15) * 10 + c2)
    288                     * 1260 + (ucs4_t)c3 * 10 + c4;
    289                 if (lseq <= 0x10FFFF) {
    290                     WRITEUCS4(lseq);
    291                     NEXT_IN(4)
    292                     continue;
    293                 }
    294             }
    295             return 4;
    296         }
    297 
    298         GBK_DECODE(c, c2, **outbuf)
    299         else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
    300         else return 2;
    301 
    302         NEXT(2, 1)
    303     }
    304 
    305     return 0;
    306 }
    307 
    308 
    309 /*
    310  * HZ codec
    311  */
    312 
/*
 * Initialise the HZ encoder state: state->i == 0 means ASCII mode
 * (no "~{" shift sequence is pending).  Always returns 0.
 */
ENCODER_INIT(hz)
{
    state->i = 0;
    return 0;
}
    318 
    319 ENCODER_RESET(hz)
    320 {
    321     if (state->i != 0) {
    322         WRITE2('~', '}')
    323         state->i = 0;
    324         NEXT_OUT(2)
    325     }
    326     return 0;
    327 }
    328 
    329 ENCODER(hz)
    330 {
    331     while (inleft > 0) {
    332         Py_UNICODE c = IN1;
    333         DBCHAR code;
    334 
    335         if (c < 0x80) {
    336             if (state->i == 0) {
    337                 WRITE1((unsigned char)c)
    338                 NEXT(1, 1)
    339             }
    340             else {
    341                 WRITE3('~', '}', (unsigned char)c)
    342                 NEXT(1, 3)
    343                 state->i = 0;
    344             }
    345             continue;
    346         }
    347 
    348         UCS4INVALID(c)
    349 
    350         TRYMAP_ENC(gbcommon, code, c);
    351         else return 1;
    352 
    353         if (code & 0x8000) /* MSB set: GBK */
    354             return 1;
    355 
    356         if (state->i == 0) {
    357             WRITE4('~', '{', code >> 8, code & 0xff)
    358             NEXT(1, 4)
    359             state->i = 1;
    360         }
    361         else {
    362             WRITE2(code >> 8, code & 0xff)
    363             NEXT(1, 2)
    364         }
    365     }
    366 
    367     return 0;
    368 }
    369 
/*
 * Initialise the HZ decoder state: state->i == 0 selects ASCII mode,
 * the mode in which decoding starts.  Always returns 0.
 */
DECODER_INIT(hz)
{
    state->i = 0;
    return 0;
}
    375 
/*
 * Reset the HZ decoder to ASCII mode (state->i == 0), discarding any
 * GB-mode shift that was in effect.  Always returns 0.
 */
DECODER_RESET(hz)
{
    state->i = 0;
    return 0;
}
    381 
    382 DECODER(hz)
    383 {
    384     while (inleft > 0) {
    385         unsigned char c = IN1;
    386 
    387         if (c == '~') {
    388             unsigned char c2 = IN2;
    389 
    390             REQUIRE_INBUF(2)
    391             if (c2 == '~') {
    392                 WRITE1('~')
    393                 NEXT(2, 1)
    394                 continue;
    395             }
    396             else if (c2 == '{' && state->i == 0)
    397                 state->i = 1; /* set GB */
    398             else if (c2 == '}' && state->i == 1)
    399                 state->i = 0; /* set ASCII */
    400             else if (c2 == '\n')
    401                 ; /* line-continuation */
    402             else
    403                 return 2;
    404             NEXT(2, 0);
    405             continue;
    406         }
    407 
    408         if (c & 0x80)
    409             return 1;
    410 
    411         if (state->i == 0) { /* ASCII mode */
    412             WRITE1(c)
    413             NEXT(1, 1)
    414         }
    415         else { /* GB mode */
    416             REQUIRE_INBUF(2)
    417             REQUIRE_OUTBUF(1)
    418             TRYMAP_DEC(gb2312, **outbuf, c, IN2) {
    419                 NEXT(2, 1)
    420             }
    421             else
    422                 return 2;
    423         }
    424     }
    425 
    426     return 0;
    427 }
    428 
    429 
/*
 * Mapping tables used by the codecs in this file.  In this file gb2312
 * and gbkext are only used for decoding, gbcommon only for encoding, and
 * gb18030ext for both directions.
 * NOTE(review): the list macros are defined outside this file,
 * presumably in cjkcodecs.h -- confirm what they expand to.
 */
BEGIN_MAPPINGS_LIST
  MAPPING_DECONLY(gb2312)
  MAPPING_DECONLY(gbkext)
  MAPPING_ENCONLY(gbcommon)
  MAPPING_ENCDEC(gb18030ext)
END_MAPPINGS_LIST
    436 
/*
 * Codecs provided by this file.  gb2312, gbk and gb18030 keep no state
 * between calls; hz carries its ASCII/GB mode in state->i and therefore
 * also defines init and reset functions.
 */
BEGIN_CODECS_LIST
  CODEC_STATELESS(gb2312)
  CODEC_STATELESS(gbk)
  CODEC_STATELESS(gb18030)
  CODEC_STATEFUL(hz)
END_CODECS_LIST
    443 
/* NOTE(review): presumably expands to the module initialisation code for
 * the "cn" codec collection -- confirm against cjkcodecs.h. */
I_AM_A_MODULE_FOR(cn)
    445