Home | History | Annotate | Download | only in cjkcodecs
      1 /*
      2  * _codecs_kr.c: Codecs collection for Korean encodings
      3  *
      4  * Written by Hye-Shik Chang <perky (at) FreeBSD.org>
      5  */
      6 
      7 #include "cjkcodecs.h"
      8 #include "mappings_kr.h"
      9 
     10 /*
     11  * EUC-KR codec
     12  */
     13 
     14 #define EUCKR_JAMO_FIRSTBYTE    0xA4
     15 #define EUCKR_JAMO_FILLER       0xD4
     16 
     17 static const unsigned char u2cgk_choseong[19] = {
     18     0xa1, 0xa2, 0xa4, 0xa7, 0xa8, 0xa9, 0xb1, 0xb2,
     19     0xb3, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb,
     20     0xbc, 0xbd, 0xbe
     21 };
     22 static const unsigned char u2cgk_jungseong[21] = {
     23     0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6,
     24     0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce,
     25     0xcf, 0xd0, 0xd1, 0xd2, 0xd3
     26 };
     27 static const unsigned char u2cgk_jongseong[28] = {
     28     0xd4, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
     29     0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
     30     0xb1, 0xb2, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xba,
     31     0xbb, 0xbc, 0xbd, 0xbe
     32 };
     33 
     34 ENCODER(euc_kr)
     35 {
     36     while (inleft > 0) {
     37         Py_UNICODE c = IN1;
     38         DBCHAR code;
     39 
     40         if (c < 0x80) {
     41             WRITE1((unsigned char)c)
     42             NEXT(1, 1)
     43             continue;
     44         }
     45         UCS4INVALID(c)
     46 
     47         REQUIRE_OUTBUF(2)
     48         TRYMAP_ENC(cp949, code, c);
     49         else return 1;
     50 
     51         if ((code & 0x8000) == 0) {
     52             /* KS X 1001 coded character */
     53             OUT1((code >> 8) | 0x80)
     54             OUT2((code & 0xFF) | 0x80)
     55             NEXT(1, 2)
     56         }
     57         else {          /* Mapping is found in CP949 extension,
     58                  * but we encode it in KS X 1001:1998 Annex 3,
     59                  * make-up sequence for EUC-KR. */
     60 
     61             REQUIRE_OUTBUF(8)
     62 
     63             /* syllable composition precedence */
     64             OUT1(EUCKR_JAMO_FIRSTBYTE)
     65             OUT2(EUCKR_JAMO_FILLER)
     66 
     67             /* All code points in CP949 extension are in unicode
     68              * Hangul Syllable area. */
     69             assert(0xac00 <= c && c <= 0xd7a3);
     70             c -= 0xac00;
     71 
     72             OUT3(EUCKR_JAMO_FIRSTBYTE)
     73             OUT4(u2cgk_choseong[c / 588])
     74             NEXT_OUT(4)
     75 
     76             OUT1(EUCKR_JAMO_FIRSTBYTE)
     77             OUT2(u2cgk_jungseong[(c / 28) % 21])
     78             OUT3(EUCKR_JAMO_FIRSTBYTE)
     79             OUT4(u2cgk_jongseong[c % 28])
     80             NEXT(1, 4)
     81         }
     82     }
     83 
     84     return 0;
     85 }
     86 
     87 #define NONE    127
     88 
     89 static const unsigned char cgk2u_choseong[] = { /* [A1, BE] */
     90        0,    1, NONE,    2, NONE, NONE,    3,    4,
     91        5, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
     92        6,    7,    8, NONE,    9,   10,   11,   12,
     93       13,   14,   15,   16,   17,   18
     94 };
     95 static const unsigned char cgk2u_jongseong[] = { /* [A1, BE] */
     96        1,    2,    3,    4,    5,    6,    7, NONE,
     97        8,    9,   10,   11,   12,   13,   14,   15,
     98       16,   17, NONE,   18,   19,   20,   21,   22,
     99     NONE,   23,   24,   25,   26,   27
    100 };
    101 
    102 DECODER(euc_kr)
    103 {
    104     while (inleft > 0) {
    105         unsigned char c = IN1;
    106 
    107         REQUIRE_OUTBUF(1)
    108 
    109         if (c < 0x80) {
    110             OUT1(c)
    111             NEXT(1, 1)
    112             continue;
    113         }
    114 
    115         REQUIRE_INBUF(2)
    116 
    117         if (c == EUCKR_JAMO_FIRSTBYTE &&
    118             IN2 == EUCKR_JAMO_FILLER) {
    119             /* KS X 1001:1998 Annex 3 make-up sequence */
    120             DBCHAR cho, jung, jong;
    121 
    122             REQUIRE_INBUF(8)
    123             if ((*inbuf)[2] != EUCKR_JAMO_FIRSTBYTE ||
    124                 (*inbuf)[4] != EUCKR_JAMO_FIRSTBYTE ||
    125                 (*inbuf)[6] != EUCKR_JAMO_FIRSTBYTE)
    126                 return 8;
    127 
    128             c = (*inbuf)[3];
    129             if (0xa1 <= c && c <= 0xbe)
    130                 cho = cgk2u_choseong[c - 0xa1];
    131             else
    132                 cho = NONE;
    133 
    134             c = (*inbuf)[5];
    135             jung = (0xbf <= c && c <= 0xd3) ? c - 0xbf : NONE;
    136 
    137             c = (*inbuf)[7];
    138             if (c == EUCKR_JAMO_FILLER)
    139                 jong = 0;
    140             else if (0xa1 <= c && c <= 0xbe)
    141                 jong = cgk2u_jongseong[c - 0xa1];
    142             else
    143                 jong = NONE;
    144 
    145             if (cho == NONE || jung == NONE || jong == NONE)
    146                 return 8;
    147 
    148             OUT1(0xac00 + cho*588 + jung*28 + jong);
    149             NEXT(8, 1)
    150         }
    151         else TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
    152             NEXT(2, 1)
    153         }
    154         else
    155             return 2;
    156     }
    157 
    158     return 0;
    159 }
    160 #undef NONE
    161 
    162 
    163 /*
    164  * CP949 codec
    165  */
    166 
    167 ENCODER(cp949)
    168 {
    169     while (inleft > 0) {
    170         Py_UNICODE c = IN1;
    171         DBCHAR code;
    172 
    173         if (c < 0x80) {
    174             WRITE1((unsigned char)c)
    175             NEXT(1, 1)
    176             continue;
    177         }
    178         UCS4INVALID(c)
    179 
    180         REQUIRE_OUTBUF(2)
    181         TRYMAP_ENC(cp949, code, c);
    182         else return 1;
    183 
    184         OUT1((code >> 8) | 0x80)
    185         if (code & 0x8000)
    186             OUT2(code & 0xFF) /* MSB set: CP949 */
    187         else
    188             OUT2((code & 0xFF) | 0x80) /* MSB unset: ks x 1001 */
    189         NEXT(1, 2)
    190     }
    191 
    192     return 0;
    193 }
    194 
    195 DECODER(cp949)
    196 {
    197     while (inleft > 0) {
    198         unsigned char c = IN1;
    199 
    200         REQUIRE_OUTBUF(1)
    201 
    202         if (c < 0x80) {
    203             OUT1(c)
    204             NEXT(1, 1)
    205             continue;
    206         }
    207 
    208         REQUIRE_INBUF(2)
    209         TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80);
    210         else TRYMAP_DEC(cp949ext, **outbuf, c, IN2);
    211         else return 2;
    212 
    213         NEXT(2, 1)
    214     }
    215 
    216     return 0;
    217 }
    218 
    219 
    220 /*
    221  * JOHAB codec
    222  */
    223 
    224 static const unsigned char u2johabidx_choseong[32] = {
    225                 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    226     0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    227     0x10, 0x11, 0x12, 0x13, 0x14,
    228 };
    229 static const unsigned char u2johabidx_jungseong[32] = {
    230                       0x03, 0x04, 0x05, 0x06, 0x07,
    231                 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    232                 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    233                 0x1a, 0x1b, 0x1c, 0x1d,
    234 };
    235 static const unsigned char u2johabidx_jongseong[32] = {
    236           0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    237     0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    238     0x10, 0x11,       0x13, 0x14, 0x15, 0x16, 0x17,
    239     0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d,
    240 };
    241 static const DBCHAR u2johabjamo[] = {
    242             0x8841, 0x8c41, 0x8444, 0x9041, 0x8446, 0x8447, 0x9441,
    243     0x9841, 0x9c41, 0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f,
    244     0x8450, 0xa041, 0xa441, 0xa841, 0x8454, 0xac41, 0xb041, 0xb441,
    245     0xb841, 0xbc41, 0xc041, 0xc441, 0xc841, 0xcc41, 0xd041, 0x8461,
    246     0x8481, 0x84a1, 0x84c1, 0x84e1, 0x8541, 0x8561, 0x8581, 0x85a1,
    247     0x85c1, 0x85e1, 0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1,
    248     0x8741, 0x8761, 0x8781, 0x87a1,
    249 };
    250 
    251 ENCODER(johab)
    252 {
    253     while (inleft > 0) {
    254         Py_UNICODE c = IN1;
    255         DBCHAR code;
    256 
    257         if (c < 0x80) {
    258             WRITE1((unsigned char)c)
    259             NEXT(1, 1)
    260             continue;
    261         }
    262         UCS4INVALID(c)
    263 
    264         REQUIRE_OUTBUF(2)
    265 
    266         if (c >= 0xac00 && c <= 0xd7a3) {
    267             c -= 0xac00;
    268             code = 0x8000 |
    269                 (u2johabidx_choseong[c / 588] << 10) |
    270                 (u2johabidx_jungseong[(c / 28) % 21] << 5) |
    271                 u2johabidx_jongseong[c % 28];
    272         }
    273         else if (c >= 0x3131 && c <= 0x3163)
    274             code = u2johabjamo[c - 0x3131];
    275         else TRYMAP_ENC(cp949, code, c) {
    276             unsigned char c1, c2, t2;
    277             unsigned short t1;
    278 
    279             assert((code & 0x8000) == 0);
    280             c1 = code >> 8;
    281             c2 = code & 0xff;
    282             if (((c1 >= 0x21 && c1 <= 0x2c) ||
    283                 (c1 >= 0x4a && c1 <= 0x7d)) &&
    284                 (c2 >= 0x21 && c2 <= 0x7e)) {
    285                 t1 = (c1 < 0x4a ? (c1 - 0x21 + 0x1b2) :
    286                           (c1 - 0x21 + 0x197));
    287                 t2 = ((t1 & 1) ? 0x5e : 0) + (c2 - 0x21);
    288                 OUT1(t1 >> 1)
    289                 OUT2(t2 < 0x4e ? t2 + 0x31 : t2 + 0x43)
    290                 NEXT(1, 2)
    291                 continue;
    292             }
    293             else
    294                 return 1;
    295         }
    296         else
    297             return 1;
    298 
    299         OUT1(code >> 8)
    300         OUT2(code & 0xff)
    301         NEXT(1, 2)
    302     }
    303 
    304     return 0;
    305 }
    306 
    307 #define FILL 0xfd
    308 #define NONE 0xff
    309 
    310 static const unsigned char johabidx_choseong[32] = {
    311     NONE, FILL, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
    312     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
    313     0x0e, 0x0f, 0x10, 0x11, 0x12, NONE, NONE, NONE,
    314     NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
    315 };
    316 static const unsigned char johabidx_jungseong[32] = {
    317     NONE, NONE, FILL, 0x00, 0x01, 0x02, 0x03, 0x04,
    318     NONE, NONE, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
    319     NONE, NONE, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
    320     NONE, NONE, 0x11, 0x12, 0x13, 0x14, NONE, NONE,
    321 };
    322 static const unsigned char johabidx_jongseong[32] = {
    323     NONE, FILL, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    324     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    325     0x0f, 0x10, NONE, 0x11, 0x12, 0x13, 0x14, 0x15,
    326     0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, NONE, NONE,
    327 };
    328 
    329 static const unsigned char johabjamo_choseong[32] = {
    330     NONE, FILL, 0x31, 0x32, 0x34, 0x37, 0x38, 0x39,
    331     0x41, 0x42, 0x43, 0x45, 0x46, 0x47, 0x48, 0x49,
    332     0x4a, 0x4b, 0x4c, 0x4d, 0x4e, NONE, NONE, NONE,
    333     NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
    334 };
    335 static const unsigned char johabjamo_jungseong[32] = {
    336     NONE, NONE, FILL, 0x4f, 0x50, 0x51, 0x52, 0x53,
    337     NONE, NONE, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
    338     NONE, NONE, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    339     NONE, NONE, 0x60, 0x61, 0x62, 0x63, NONE, NONE,
    340 };
    341 static const unsigned char johabjamo_jongseong[32] = {
    342     NONE, FILL, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
    343     0x37, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    344     0x40, 0x41, NONE, 0x42, 0x44, 0x45, 0x46, 0x47,
    345     0x48, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, NONE, NONE,
    346 };
    347 
    348 DECODER(johab)
    349 {
    350     while (inleft > 0) {
    351         unsigned char    c = IN1, c2;
    352 
    353         REQUIRE_OUTBUF(1)
    354 
    355         if (c < 0x80) {
    356             OUT1(c)
    357             NEXT(1, 1)
    358             continue;
    359         }
    360 
    361         REQUIRE_INBUF(2)
    362         c2 = IN2;
    363 
    364         if (c < 0xd8) {
    365             /* johab hangul */
    366             unsigned char c_cho, c_jung, c_jong;
    367             unsigned char i_cho, i_jung, i_jong;
    368 
    369             c_cho = (c >> 2) & 0x1f;
    370             c_jung = ((c << 3) | c2 >> 5) & 0x1f;
    371             c_jong = c2 & 0x1f;
    372 
    373             i_cho = johabidx_choseong[c_cho];
    374             i_jung = johabidx_jungseong[c_jung];
    375             i_jong = johabidx_jongseong[c_jong];
    376 
    377             if (i_cho == NONE || i_jung == NONE || i_jong == NONE)
    378                 return 2;
    379 
    380             /* we don't use U+1100 hangul jamo yet. */
    381             if (i_cho == FILL) {
    382                 if (i_jung == FILL) {
    383                     if (i_jong == FILL)
    384                         OUT1(0x3000)
    385                     else
    386                         OUT1(0x3100 |
    387                           johabjamo_jongseong[c_jong])
    388                 }
    389                 else {
    390                     if (i_jong == FILL)
    391                         OUT1(0x3100 |
    392                           johabjamo_jungseong[c_jung])
    393                     else
    394                         return 2;
    395                 }
    396             } else {
    397                 if (i_jung == FILL) {
    398                     if (i_jong == FILL)
    399                         OUT1(0x3100 |
    400                           johabjamo_choseong[c_cho])
    401                     else
    402                         return 2;
    403                 }
    404                 else
    405                     OUT1(0xac00 +
    406                          i_cho * 588 +
    407                          i_jung * 28 +
    408                          (i_jong == FILL ? 0 : i_jong))
    409             }
    410             NEXT(2, 1)
    411         } else {
    412             /* KS X 1001 except hangul jamos and syllables */
    413             if (c == 0xdf || c > 0xf9 ||
    414                 c2 < 0x31 || (c2 >= 0x80 && c2 < 0x91) ||
    415                 (c2 & 0x7f) == 0x7f ||
    416                 (c == 0xda && (c2 >= 0xa1 && c2 <= 0xd3)))
    417                 return 2;
    418             else {
    419                 unsigned char t1, t2;
    420 
    421                 t1 = (c < 0xe0 ? 2 * (c - 0xd9) :
    422                          2 * c - 0x197);
    423                 t2 = (c2 < 0x91 ? c2 - 0x31 : c2 - 0x43);
    424                 t1 = t1 + (t2 < 0x5e ? 0 : 1) + 0x21;
    425                 t2 = (t2 < 0x5e ? t2 : t2 - 0x5e) + 0x21;
    426 
    427                 TRYMAP_DEC(ksx1001, **outbuf, t1, t2);
    428                 else return 2;
    429                 NEXT(2, 1)
    430             }
    431         }
    432     }
    433 
    434     return 0;
    435 }
    436 #undef NONE
    437 #undef FILL
    438 
    439 
    440 BEGIN_MAPPINGS_LIST
    441   MAPPING_DECONLY(ksx1001)
    442   MAPPING_ENCONLY(cp949)
    443   MAPPING_DECONLY(cp949ext)
    444 END_MAPPINGS_LIST
    445 
    446 BEGIN_CODECS_LIST
    447   CODEC_STATELESS(euc_kr)
    448   CODEC_STATELESS(cp949)
    449   CODEC_STATELESS(johab)
    450 END_CODECS_LIST
    451 
    452 I_AM_A_MODULE_FOR(kr)
    453