1 /* 2 * cjkcodecs.h: common header for cjkcodecs 3 * 4 * Written by Hye-Shik Chang <perky (at) FreeBSD.org> 5 */ 6 7 #ifndef _CJKCODECS_H_ 8 #define _CJKCODECS_H_ 9 10 #define PY_SSIZE_T_CLEAN 11 #include "Python.h" 12 #include "multibytecodec.h" 13 14 15 /* a unicode "undefined" code point */ 16 #define UNIINV 0xFFFE 17 18 /* internal-use DBCS code points which aren't used by any charsets */ 19 #define NOCHAR 0xFFFF 20 #define MULTIC 0xFFFE 21 #define DBCINV 0xFFFD 22 23 /* shorter macros to save source size of mapping tables */ 24 #define U UNIINV 25 #define N NOCHAR 26 #define M MULTIC 27 #define D DBCINV 28 29 struct dbcs_index { 30 const ucs2_t *map; 31 unsigned char bottom, top; 32 }; 33 typedef struct dbcs_index decode_map; 34 35 struct widedbcs_index { 36 const ucs4_t *map; 37 unsigned char bottom, top; 38 }; 39 typedef struct widedbcs_index widedecode_map; 40 41 struct unim_index { 42 const DBCHAR *map; 43 unsigned char bottom, top; 44 }; 45 typedef struct unim_index encode_map; 46 47 struct unim_index_bytebased { 48 const unsigned char *map; 49 unsigned char bottom, top; 50 }; 51 52 struct dbcs_map { 53 const char *charset; 54 const struct unim_index *encmap; 55 const struct dbcs_index *decmap; 56 }; 57 58 struct pair_encodemap { 59 ucs4_t uniseq; 60 DBCHAR code; 61 }; 62 63 static const MultibyteCodec *codec_list; 64 static const struct dbcs_map *mapping_list; 65 66 #define CODEC_INIT(encoding) \ 67 static int encoding##_codec_init(const void *config) 68 69 #define ENCODER_INIT(encoding) \ 70 static int encoding##_encode_init( \ 71 MultibyteCodec_State *state, const void *config) 72 #define ENCODER(encoding) \ 73 static Py_ssize_t encoding##_encode( \ 74 MultibyteCodec_State *state, const void *config, \ 75 const Py_UNICODE **inbuf, Py_ssize_t inleft, \ 76 unsigned char **outbuf, Py_ssize_t outleft, int flags) 77 #define ENCODER_RESET(encoding) \ 78 static Py_ssize_t encoding##_encode_reset( \ 79 MultibyteCodec_State *state, const void *config, \ 80 unsigned char **outbuf, Py_ssize_t outleft) 81 82 #define DECODER_INIT(encoding) \ 83 static int encoding##_decode_init( \ 84 MultibyteCodec_State *state, const void *config) 85 #define DECODER(encoding) \ 86 static Py_ssize_t encoding##_decode( \ 87 MultibyteCodec_State *state, const void *config, \ 88 const unsigned char **inbuf, Py_ssize_t inleft, \ 89 Py_UNICODE **outbuf, Py_ssize_t outleft) 90 #define DECODER_RESET(encoding) \ 91 static Py_ssize_t encoding##_decode_reset( \ 92 MultibyteCodec_State *state, const void *config) 93 94 #if Py_UNICODE_SIZE == 4 95 #define UCS4INVALID(code) \ 96 if ((code) > 0xFFFF) \ 97 return 1; 98 #else 99 #define UCS4INVALID(code) \ 100 if (0) ; 101 #endif 102 103 #define NEXT_IN(i) \ 104 (*inbuf) += (i); \ 105 (inleft) -= (i); 106 #define NEXT_OUT(o) \ 107 (*outbuf) += (o); \ 108 (outleft) -= (o); 109 #define NEXT(i, o) \ 110 NEXT_IN(i) NEXT_OUT(o) 111 112 #define REQUIRE_INBUF(n) \ 113 if (inleft < (n)) \ 114 return MBERR_TOOFEW; 115 #define REQUIRE_OUTBUF(n) \ 116 if (outleft < (n)) \ 117 return MBERR_TOOSMALL; 118 119 #define IN1 ((*inbuf)[0]) 120 #define IN2 ((*inbuf)[1]) 121 #define IN3 ((*inbuf)[2]) 122 #define IN4 ((*inbuf)[3]) 123 124 #define OUT1(c) ((*outbuf)[0]) = (c); 125 #define OUT2(c) ((*outbuf)[1]) = (c); 126 #define OUT3(c) ((*outbuf)[2]) = (c); 127 #define OUT4(c) ((*outbuf)[3]) = (c); 128 129 #define WRITE1(c1) \ 130 REQUIRE_OUTBUF(1) \ 131 (*outbuf)[0] = (c1); 132 #define WRITE2(c1, c2) \ 133 REQUIRE_OUTBUF(2) \ 134 (*outbuf)[0] = (c1); \ 135 (*outbuf)[1] = (c2); 136 #define WRITE3(c1, c2, c3) \ 137 REQUIRE_OUTBUF(3) \ 138 (*outbuf)[0] = (c1); \ 139 (*outbuf)[1] = (c2); \ 140 (*outbuf)[2] = (c3); 141 #define WRITE4(c1, c2, c3, c4) \ 142 REQUIRE_OUTBUF(4) \ 143 (*outbuf)[0] = (c1); \ 144 (*outbuf)[1] = (c2); \ 145 (*outbuf)[2] = (c3); \ 146 (*outbuf)[3] = (c4); 147 148 #if Py_UNICODE_SIZE == 2 149 # define WRITEUCS4(c) \ 150 REQUIRE_OUTBUF(2) \ 151 (*outbuf)[0] = 0xd800 + (((c) - 0x10000) >> 10); \ 152 (*outbuf)[1] = 0xdc00 + (((c) - 0x10000) & 0x3ff); \ 153 NEXT_OUT(2) 154 #else 155 # define WRITEUCS4(c) \ 156 REQUIRE_OUTBUF(1) \ 157 **outbuf = (Py_UNICODE)(c); \ 158 NEXT_OUT(1) 159 #endif 160 161 #define _TRYMAP_ENC(m, assi, val) \ 162 ((m)->map != NULL && (val) >= (m)->bottom && \ 163 (val)<= (m)->top && ((assi) = (m)->map[(val) - \ 164 (m)->bottom]) != NOCHAR) 165 #define TRYMAP_ENC_COND(charset, assi, uni) \ 166 _TRYMAP_ENC(&charset##_encmap[(uni) >> 8], assi, (uni) & 0xff) 167 #define TRYMAP_ENC(charset, assi, uni) \ 168 if TRYMAP_ENC_COND(charset, assi, uni) 169 170 #define _TRYMAP_DEC(m, assi, val) \ 171 ((m)->map != NULL && (val) >= (m)->bottom && \ 172 (val)<= (m)->top && ((assi) = (m)->map[(val) - \ 173 (m)->bottom]) != UNIINV) 174 #define TRYMAP_DEC(charset, assi, c1, c2) \ 175 if _TRYMAP_DEC(&charset##_decmap[c1], assi, c2) 176 177 #define _TRYMAP_ENC_MPLANE(m, assplane, asshi, asslo, val) \ 178 ((m)->map != NULL && (val) >= (m)->bottom && \ 179 (val)<= (m)->top && \ 180 ((assplane) = (m)->map[((val) - (m)->bottom)*3]) != 0 && \ 181 (((asshi) = (m)->map[((val) - (m)->bottom)*3 + 1]), 1) && \ 182 (((asslo) = (m)->map[((val) - (m)->bottom)*3 + 2]), 1)) 183 #define TRYMAP_ENC_MPLANE(charset, assplane, asshi, asslo, uni) \ 184 if _TRYMAP_ENC_MPLANE(&charset##_encmap[(uni) >> 8], \ 185 assplane, asshi, asslo, (uni) & 0xff) 186 #define TRYMAP_DEC_MPLANE(charset, assi, plane, c1, c2) \ 187 if _TRYMAP_DEC(&charset##_decmap[plane][c1], assi, c2) 188 189 #if Py_UNICODE_SIZE == 2 190 #define DECODE_SURROGATE(c) \ 191 if (c >> 10 == 0xd800 >> 10) { /* high surrogate */ \ 192 REQUIRE_INBUF(2) \ 193 if (IN2 >> 10 == 0xdc00 >> 10) { /* low surrogate */ \ 194 c = 0x10000 + ((ucs4_t)(c - 0xd800) << 10) + \ 195 ((ucs4_t)(IN2) - 0xdc00); \ 196 } \ 197 } 198 #define GET_INSIZE(c) ((c) > 0xffff ? 2 : 1) 199 #else 200 #define DECODE_SURROGATE(c) {;} 201 #define GET_INSIZE(c) 1 202 #endif 203 204 #define BEGIN_MAPPINGS_LIST static const struct dbcs_map _mapping_list[] = { 205 #define MAPPING_ENCONLY(enc) {#enc, (void*)enc##_encmap, NULL}, 206 #define MAPPING_DECONLY(enc) {#enc, NULL, (void*)enc##_decmap}, 207 #define MAPPING_ENCDEC(enc) {#enc, (void*)enc##_encmap, (void*)enc##_decmap}, 208 #define END_MAPPINGS_LIST \ 209 {"", NULL, NULL} }; \ 210 static const struct dbcs_map *mapping_list = \ 211 (const struct dbcs_map *)_mapping_list; 212 213 #define BEGIN_CODECS_LIST static const MultibyteCodec _codec_list[] = { 214 #define _STATEFUL_METHODS(enc) \ 215 enc##_encode, \ 216 enc##_encode_init, \ 217 enc##_encode_reset, \ 218 enc##_decode, \ 219 enc##_decode_init, \ 220 enc##_decode_reset, 221 #define _STATELESS_METHODS(enc) \ 222 enc##_encode, NULL, NULL, \ 223 enc##_decode, NULL, NULL, 224 #define CODEC_STATEFUL(enc) { \ 225 #enc, NULL, NULL, \ 226 _STATEFUL_METHODS(enc) \ 227 }, 228 #define CODEC_STATELESS(enc) { \ 229 #enc, NULL, NULL, \ 230 _STATELESS_METHODS(enc) \ 231 }, 232 #define CODEC_STATELESS_WINIT(enc) { \ 233 #enc, NULL, \ 234 enc##_codec_init, \ 235 _STATELESS_METHODS(enc) \ 236 }, 237 #define END_CODECS_LIST \ 238 {"", NULL,} }; \ 239 static const MultibyteCodec *codec_list = \ 240 (const MultibyteCodec *)_codec_list; 241 242 static PyObject * 243 getmultibytecodec(void) 244 { 245 static PyObject *cofunc = NULL; 246 247 if (cofunc == NULL) { 248 PyObject *mod = PyImport_ImportModuleNoBlock("_multibytecodec"); 249 if (mod == NULL) 250 return NULL; 251 cofunc = PyObject_GetAttrString(mod, "__create_codec"); 252 Py_DECREF(mod); 253 } 254 return cofunc; 255 } 256 257 static PyObject * 258 getcodec(PyObject *self, PyObject *encoding) 259 { 260 PyObject *codecobj, *r, *cofunc; 261 const MultibyteCodec *codec; 262 const char *enc; 263 264 if (!PyString_Check(encoding)) { 265 PyErr_SetString(PyExc_TypeError, 266 "encoding name must be a string."); 267 return NULL; 268 } 269 270 cofunc = getmultibytecodec(); 271 if (cofunc == NULL) 272 return NULL; 273 274 enc = PyString_AS_STRING(encoding); 275 for (codec = codec_list; codec->encoding[0]; codec++) 276 if (strcmp(codec->encoding, enc) == 0) 277 break; 278 279 if (codec->encoding[0] == '\0') { 280 PyErr_SetString(PyExc_LookupError, 281 "no such codec is supported."); 282 return NULL; 283 } 284 285 codecobj = PyCapsule_New((void *)codec, PyMultibyteCodec_CAPSULE_NAME, NULL); 286 if (codecobj == NULL) 287 return NULL; 288 289 r = PyObject_CallFunctionObjArgs(cofunc, codecobj, NULL); 290 Py_DECREF(codecobj); 291 292 return r; 293 } 294 295 static struct PyMethodDef __methods[] = { 296 {"getcodec", (PyCFunction)getcodec, METH_O, ""}, 297 {NULL, NULL}, 298 }; 299 300 static int 301 register_maps(PyObject *module) 302 { 303 const struct dbcs_map *h; 304 305 for (h = mapping_list; h->charset[0] != '\0'; h++) { 306 char mhname[256] = "__map_"; 307 int r; 308 strcpy(mhname + sizeof("__map_") - 1, h->charset); 309 r = PyModule_AddObject(module, mhname, 310 PyCapsule_New((void *)h, PyMultibyteCodec_CAPSULE_NAME, NULL)); 311 if (r == -1) 312 return -1; 313 } 314 return 0; 315 } 316 317 #ifdef USING_BINARY_PAIR_SEARCH 318 static DBCHAR 319 find_pairencmap(ucs2_t body, ucs2_t modifier, 320 const struct pair_encodemap *haystack, int haystacksize) 321 { 322 int pos, min, max; 323 ucs4_t value = body << 16 | modifier; 324 325 min = 0; 326 max = haystacksize; 327 328 for (pos = haystacksize >> 1; min != max; pos = (min + max) >> 1) { 329 if (value < haystack[pos].uniseq) { 330 if (max != pos) { 331 max = pos; 332 continue; 333 } 334 } 335 else if (value > haystack[pos].uniseq) { 336 if (min != pos) { 337 min = pos; 338 continue; 339 } 340 } 341 break; 342 } 343 344 if (value == haystack[pos].uniseq) { 345 return haystack[pos].code; 346 } 347 return DBCINV; 348 } 349 #endif 350 351 #ifdef USING_IMPORTED_MAPS 352 #define IMPORT_MAP(locale, charset, encmap, decmap) \ 353 importmap("_codecs_" #locale, "__map_" #charset, \ 354 (const void**)encmap, (const void**)decmap) 355 356 static int 357 importmap(const char *modname, const char *symbol, 358 const void **encmap, const void **decmap) 359 { 360 PyObject *o, *mod; 361 362 mod = PyImport_ImportModule((char *)modname); 363 if (mod == NULL) 364 return -1; 365 366 o = PyObject_GetAttrString(mod, (char*)symbol); 367 if (o == NULL) 368 goto errorexit; 369 else if (!PyCapsule_IsValid(o, PyMultibyteCodec_CAPSULE_NAME)) { 370 PyErr_SetString(PyExc_ValueError, 371 "map data must be a Capsule."); 372 goto errorexit; 373 } 374 else { 375 struct dbcs_map *map; 376 map = PyCapsule_GetPointer(o, PyMultibyteCodec_CAPSULE_NAME); 377 if (encmap != NULL) 378 *encmap = map->encmap; 379 if (decmap != NULL) 380 *decmap = map->decmap; 381 Py_DECREF(o); 382 } 383 384 Py_DECREF(mod); 385 return 0; 386 387 errorexit: 388 Py_DECREF(mod); 389 return -1; 390 } 391 #endif 392 393 #define I_AM_A_MODULE_FOR(loc) \ 394 void \ 395 init_codecs_##loc(void) \ 396 { \ 397 PyObject *m = Py_InitModule("_codecs_" #loc, __methods);\ 398 if (m != NULL) \ 399 (void)register_maps(m); \ 400 } 401 402 #endif 403