1 /* 2 * multibytecodec.c: Common Multibyte Codec Implementation 3 * 4 * Written by Hye-Shik Chang <perky (at) FreeBSD.org> 5 */ 6 7 #define PY_SSIZE_T_CLEAN 8 #include "Python.h" 9 #include "structmember.h" 10 #include "multibytecodec.h" 11 12 typedef struct { 13 const Py_UNICODE *inbuf, *inbuf_top, *inbuf_end; 14 unsigned char *outbuf, *outbuf_end; 15 PyObject *excobj, *outobj; 16 } MultibyteEncodeBuffer; 17 18 typedef struct { 19 const unsigned char *inbuf, *inbuf_top, *inbuf_end; 20 Py_UNICODE *outbuf, *outbuf_end; 21 PyObject *excobj, *outobj; 22 } MultibyteDecodeBuffer; 23 24 PyDoc_STRVAR(MultibyteCodec_Encode__doc__, 25 "I.encode(unicode[, errors]) -> (string, length consumed)\n\ 26 \n\ 27 Return an encoded string version of `unicode'. errors may be given to\n\ 28 set a different error handling scheme. Default is 'strict' meaning that\n\ 29 encoding errors raise a UnicodeEncodeError. Other possible values are\n\ 30 'ignore', 'replace' and 'xmlcharrefreplace' as well as any other name\n\ 31 registered with codecs.register_error that can handle UnicodeEncodeErrors."); 32 33 PyDoc_STRVAR(MultibyteCodec_Decode__doc__, 34 "I.decode(string[, errors]) -> (unicodeobject, length consumed)\n\ 35 \n\ 36 Decodes `string' using I, an MultibyteCodec instance. errors may be given\n\ 37 to set a different error handling scheme. Default is 'strict' meaning\n\ 38 that encoding errors raise a UnicodeDecodeError. Other possible values\n\ 39 are 'ignore' and 'replace' as well as any other name registered with\n\ 40 codecs.register_error that is able to handle UnicodeDecodeErrors."); 41 42 static char *codeckwarglist[] = {"input", "errors", NULL}; 43 static char *incnewkwarglist[] = {"errors", NULL}; 44 static char *incrementalkwarglist[] = {"input", "final", NULL}; 45 static char *streamkwarglist[] = {"stream", "errors", NULL}; 46 47 static PyObject *multibytecodec_encode(MultibyteCodec *, 48 MultibyteCodec_State *, const Py_UNICODE **, Py_ssize_t, 49 PyObject *, int); 50 51 #define MBENC_RESET MBENC_MAX<<1 /* reset after an encoding session */ 52 53 static PyObject * 54 make_tuple(PyObject *object, Py_ssize_t len) 55 { 56 PyObject *v, *w; 57 58 if (object == NULL) 59 return NULL; 60 61 v = PyTuple_New(2); 62 if (v == NULL) { 63 Py_DECREF(object); 64 return NULL; 65 } 66 PyTuple_SET_ITEM(v, 0, object); 67 68 w = PyInt_FromSsize_t(len); 69 if (w == NULL) { 70 Py_DECREF(v); 71 return NULL; 72 } 73 PyTuple_SET_ITEM(v, 1, w); 74 75 return v; 76 } 77 78 static PyObject * 79 internal_error_callback(const char *errors) 80 { 81 if (errors == NULL || strcmp(errors, "strict") == 0) 82 return ERROR_STRICT; 83 else if (strcmp(errors, "ignore") == 0) 84 return ERROR_IGNORE; 85 else if (strcmp(errors, "replace") == 0) 86 return ERROR_REPLACE; 87 else 88 return PyString_FromString(errors); 89 } 90 91 static PyObject * 92 call_error_callback(PyObject *errors, PyObject *exc) 93 { 94 PyObject *args, *cb, *r; 95 96 assert(PyString_Check(errors)); 97 cb = PyCodec_LookupError(PyString_AS_STRING(errors)); 98 if (cb == NULL) 99 return NULL; 100 101 args = PyTuple_New(1); 102 if (args == NULL) { 103 Py_DECREF(cb); 104 return NULL; 105 } 106 107 PyTuple_SET_ITEM(args, 0, exc); 108 Py_INCREF(exc); 109 110 r = PyObject_CallObject(cb, args); 111 Py_DECREF(args); 112 Py_DECREF(cb); 113 return r; 114 } 115 116 static PyObject * 117 codecctx_errors_get(MultibyteStatefulCodecContext *self) 118 { 119 const char *errors; 120 121 if (self->errors == ERROR_STRICT) 122 errors = "strict"; 123 else if (self->errors == ERROR_IGNORE) 124 errors = "ignore"; 125 else if (self->errors == ERROR_REPLACE) 126 errors = "replace"; 127 else { 128 Py_INCREF(self->errors); 129 return self->errors; 130 } 131 132 return PyString_FromString(errors); 133 } 134 135 static int 136 codecctx_errors_set(MultibyteStatefulCodecContext *self, PyObject *value, 137 void *closure) 138 { 139 PyObject *cb; 140 141 if (!PyString_Check(value)) { 142 PyErr_SetString(PyExc_TypeError, "errors must be a string"); 143 return -1; 144 } 145 146 cb = internal_error_callback(PyString_AS_STRING(value)); 147 if (cb == NULL) 148 return -1; 149 150 ERROR_DECREF(self->errors); 151 self->errors = cb; 152 return 0; 153 } 154 155 /* This getset handlers list is used by all the stateful codec objects */ 156 static PyGetSetDef codecctx_getsets[] = { 157 {"errors", (getter)codecctx_errors_get, 158 (setter)codecctx_errors_set, 159 PyDoc_STR("how to treat errors")}, 160 {NULL,} 161 }; 162 163 static int 164 expand_encodebuffer(MultibyteEncodeBuffer *buf, Py_ssize_t esize) 165 { 166 Py_ssize_t orgpos, orgsize, incsize; 167 168 orgpos = (Py_ssize_t)((char *)buf->outbuf - 169 PyString_AS_STRING(buf->outobj)); 170 orgsize = PyString_GET_SIZE(buf->outobj); 171 incsize = (esize < (orgsize >> 1) ? (orgsize >> 1) | 1 : esize); 172 173 if (orgsize > PY_SSIZE_T_MAX - incsize) 174 return -1; 175 176 if (_PyString_Resize(&buf->outobj, orgsize + incsize) == -1) 177 return -1; 178 179 buf->outbuf = (unsigned char *)PyString_AS_STRING(buf->outobj) +orgpos; 180 buf->outbuf_end = (unsigned char *)PyString_AS_STRING(buf->outobj) 181 + PyString_GET_SIZE(buf->outobj); 182 183 return 0; 184 } 185 #define REQUIRE_ENCODEBUFFER(buf, s) { \ 186 if ((s) < 1 || (buf)->outbuf + (s) > (buf)->outbuf_end) \ 187 if (expand_encodebuffer(buf, s) == -1) \ 188 goto errorexit; \ 189 } 190 191 static int 192 expand_decodebuffer(MultibyteDecodeBuffer *buf, Py_ssize_t esize) 193 { 194 Py_ssize_t orgpos, orgsize; 195 196 orgpos = (Py_ssize_t)(buf->outbuf - PyUnicode_AS_UNICODE(buf->outobj)); 197 orgsize = PyUnicode_GET_SIZE(buf->outobj); 198 if (PyUnicode_Resize(&buf->outobj, orgsize + ( 199 esize < (orgsize >> 1) ? (orgsize >> 1) | 1 : esize)) == -1) 200 return -1; 201 202 buf->outbuf = PyUnicode_AS_UNICODE(buf->outobj) + orgpos; 203 buf->outbuf_end = PyUnicode_AS_UNICODE(buf->outobj) 204 + PyUnicode_GET_SIZE(buf->outobj); 205 206 return 0; 207 } 208 #define REQUIRE_DECODEBUFFER(buf, s) { \ 209 if ((s) < 1 || (buf)->outbuf + (s) > (buf)->outbuf_end) \ 210 if (expand_decodebuffer(buf, s) == -1) \ 211 goto errorexit; \ 212 } 213 214 215 /** 216 * MultibyteCodec object 217 */ 218 219 static int 220 multibytecodec_encerror(MultibyteCodec *codec, 221 MultibyteCodec_State *state, 222 MultibyteEncodeBuffer *buf, 223 PyObject *errors, Py_ssize_t e) 224 { 225 PyObject *retobj = NULL, *retstr = NULL, *tobj; 226 Py_ssize_t retstrsize, newpos; 227 Py_ssize_t esize, start, end; 228 const char *reason; 229 230 if (e > 0) { 231 reason = "illegal multibyte sequence"; 232 esize = e; 233 } 234 else { 235 switch (e) { 236 case MBERR_TOOSMALL: 237 REQUIRE_ENCODEBUFFER(buf, -1); 238 return 0; /* retry it */ 239 case MBERR_TOOFEW: 240 reason = "incomplete multibyte sequence"; 241 esize = (Py_ssize_t)(buf->inbuf_end - buf->inbuf); 242 break; 243 case MBERR_INTERNAL: 244 PyErr_SetString(PyExc_RuntimeError, 245 "internal codec error"); 246 return -1; 247 default: 248 PyErr_SetString(PyExc_RuntimeError, 249 "unknown runtime error"); 250 return -1; 251 } 252 } 253 254 if (errors == ERROR_REPLACE) { 255 const Py_UNICODE replchar = '?', *inbuf = &replchar; 256 Py_ssize_t r; 257 258 for (;;) { 259 Py_ssize_t outleft; 260 261 outleft = (Py_ssize_t)(buf->outbuf_end - buf->outbuf); 262 r = codec->encode(state, codec->config, &inbuf, 1, 263 &buf->outbuf, outleft, 0); 264 if (r == MBERR_TOOSMALL) { 265 REQUIRE_ENCODEBUFFER(buf, -1); 266 continue; 267 } 268 else 269 break; 270 } 271 272 if (r != 0) { 273 REQUIRE_ENCODEBUFFER(buf, 1); 274 *buf->outbuf++ = '?'; 275 } 276 } 277 if (errors == ERROR_IGNORE || errors == ERROR_REPLACE) { 278 buf->inbuf += esize; 279 return 0; 280 } 281 282 start = (Py_ssize_t)(buf->inbuf - buf->inbuf_top); 283 end = start + esize; 284 285 /* use cached exception object if available */ 286 if (buf->excobj == NULL) { 287 buf->excobj = PyUnicodeEncodeError_Create(codec->encoding, 288 buf->inbuf_top, 289 buf->inbuf_end - buf->inbuf_top, 290 start, end, reason); 291 if (buf->excobj == NULL) 292 goto errorexit; 293 } 294 else 295 if (PyUnicodeEncodeError_SetStart(buf->excobj, start) != 0 || 296 PyUnicodeEncodeError_SetEnd(buf->excobj, end) != 0 || 297 PyUnicodeEncodeError_SetReason(buf->excobj, reason) != 0) 298 goto errorexit; 299 300 if (errors == ERROR_STRICT) { 301 PyCodec_StrictErrors(buf->excobj); 302 goto errorexit; 303 } 304 305 retobj = call_error_callback(errors, buf->excobj); 306 if (retobj == NULL) 307 goto errorexit; 308 309 if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 || 310 !PyUnicode_Check((tobj = PyTuple_GET_ITEM(retobj, 0))) || 311 !(PyInt_Check(PyTuple_GET_ITEM(retobj, 1)) || 312 PyLong_Check(PyTuple_GET_ITEM(retobj, 1)))) { 313 PyErr_SetString(PyExc_TypeError, 314 "encoding error handler must return " 315 "(unicode, int) tuple"); 316 goto errorexit; 317 } 318 319 { 320 const Py_UNICODE *uraw = PyUnicode_AS_UNICODE(tobj); 321 322 retstr = multibytecodec_encode(codec, state, &uraw, 323 PyUnicode_GET_SIZE(tobj), ERROR_STRICT, 324 MBENC_FLUSH); 325 if (retstr == NULL) 326 goto errorexit; 327 } 328 329 retstrsize = PyString_GET_SIZE(retstr); 330 REQUIRE_ENCODEBUFFER(buf, retstrsize); 331 332 memcpy(buf->outbuf, PyString_AS_STRING(retstr), retstrsize); 333 buf->outbuf += retstrsize; 334 335 newpos = PyInt_AsSsize_t(PyTuple_GET_ITEM(retobj, 1)); 336 if (newpos < 0 && !PyErr_Occurred()) 337 newpos += (Py_ssize_t)(buf->inbuf_end - buf->inbuf_top); 338 if (newpos < 0 || buf->inbuf_top + newpos > buf->inbuf_end) { 339 PyErr_Clear(); 340 PyErr_Format(PyExc_IndexError, 341 "position %zd from error handler out of bounds", 342 newpos); 343 goto errorexit; 344 } 345 buf->inbuf = buf->inbuf_top + newpos; 346 347 Py_DECREF(retobj); 348 Py_DECREF(retstr); 349 return 0; 350 351 errorexit: 352 Py_XDECREF(retobj); 353 Py_XDECREF(retstr); 354 return -1; 355 } 356 357 static int 358 multibytecodec_decerror(MultibyteCodec *codec, 359 MultibyteCodec_State *state, 360 MultibyteDecodeBuffer *buf, 361 PyObject *errors, Py_ssize_t e) 362 { 363 PyObject *retobj = NULL, *retuni = NULL; 364 Py_ssize_t retunisize, newpos; 365 const char *reason; 366 Py_ssize_t esize, start, end; 367 368 if (e > 0) { 369 reason = "illegal multibyte sequence"; 370 esize = e; 371 } 372 else { 373 switch (e) { 374 case MBERR_TOOSMALL: 375 REQUIRE_DECODEBUFFER(buf, -1); 376 return 0; /* retry it */ 377 case MBERR_TOOFEW: 378 reason = "incomplete multibyte sequence"; 379 esize = (Py_ssize_t)(buf->inbuf_end - buf->inbuf); 380 break; 381 case MBERR_INTERNAL: 382 PyErr_SetString(PyExc_RuntimeError, 383 "internal codec error"); 384 return -1; 385 default: 386 PyErr_SetString(PyExc_RuntimeError, 387 "unknown runtime error"); 388 return -1; 389 } 390 } 391 392 if (errors == ERROR_REPLACE) { 393 REQUIRE_DECODEBUFFER(buf, 1); 394 *buf->outbuf++ = Py_UNICODE_REPLACEMENT_CHARACTER; 395 } 396 if (errors == ERROR_IGNORE || errors == ERROR_REPLACE) { 397 buf->inbuf += esize; 398 return 0; 399 } 400 401 start = (Py_ssize_t)(buf->inbuf - buf->inbuf_top); 402 end = start + esize; 403 404 /* use cached exception object if available */ 405 if (buf->excobj == NULL) { 406 buf->excobj = PyUnicodeDecodeError_Create(codec->encoding, 407 (const char *)buf->inbuf_top, 408 (Py_ssize_t)(buf->inbuf_end - buf->inbuf_top), 409 start, end, reason); 410 if (buf->excobj == NULL) 411 goto errorexit; 412 } 413 else 414 if (PyUnicodeDecodeError_SetStart(buf->excobj, start) || 415 PyUnicodeDecodeError_SetEnd(buf->excobj, end) || 416 PyUnicodeDecodeError_SetReason(buf->excobj, reason)) 417 goto errorexit; 418 419 if (errors == ERROR_STRICT) { 420 PyCodec_StrictErrors(buf->excobj); 421 goto errorexit; 422 } 423 424 retobj = call_error_callback(errors, buf->excobj); 425 if (retobj == NULL) 426 goto errorexit; 427 428 if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 || 429 !PyUnicode_Check((retuni = PyTuple_GET_ITEM(retobj, 0))) || 430 !(PyInt_Check(PyTuple_GET_ITEM(retobj, 1)) || 431 PyLong_Check(PyTuple_GET_ITEM(retobj, 1)))) { 432 PyErr_SetString(PyExc_TypeError, 433 "decoding error handler must return " 434 "(unicode, int) tuple"); 435 goto errorexit; 436 } 437 438 retunisize = PyUnicode_GET_SIZE(retuni); 439 if (retunisize > 0) { 440 REQUIRE_DECODEBUFFER(buf, retunisize); 441 memcpy((char *)buf->outbuf, PyUnicode_AS_DATA(retuni), 442 retunisize * Py_UNICODE_SIZE); 443 buf->outbuf += retunisize; 444 } 445 446 newpos = PyInt_AsSsize_t(PyTuple_GET_ITEM(retobj, 1)); 447 if (newpos < 0 && !PyErr_Occurred()) 448 newpos += (Py_ssize_t)(buf->inbuf_end - buf->inbuf_top); 449 if (newpos < 0 || buf->inbuf_top + newpos > buf->inbuf_end) { 450 PyErr_Clear(); 451 PyErr_Format(PyExc_IndexError, 452 "position %zd from error handler out of bounds", 453 newpos); 454 goto errorexit; 455 } 456 buf->inbuf = buf->inbuf_top + newpos; 457 Py_DECREF(retobj); 458 return 0; 459 460 errorexit: 461 Py_XDECREF(retobj); 462 return -1; 463 } 464 465 static PyObject * 466 multibytecodec_encode(MultibyteCodec *codec, 467 MultibyteCodec_State *state, 468 const Py_UNICODE **data, Py_ssize_t datalen, 469 PyObject *errors, int flags) 470 { 471 MultibyteEncodeBuffer buf; 472 Py_ssize_t finalsize, r = 0; 473 474 if (datalen == 0 && !(flags & MBENC_RESET)) 475 return PyString_FromString(""); 476 477 buf.excobj = NULL; 478 buf.inbuf = buf.inbuf_top = *data; 479 buf.inbuf_end = buf.inbuf_top + datalen; 480 481 if (datalen > (PY_SSIZE_T_MAX - 16) / 2) { 482 PyErr_NoMemory(); 483 goto errorexit; 484 } 485 486 buf.outobj = PyString_FromStringAndSize(NULL, datalen * 2 + 16); 487 if (buf.outobj == NULL) 488 goto errorexit; 489 buf.outbuf = (unsigned char *)PyString_AS_STRING(buf.outobj); 490 buf.outbuf_end = buf.outbuf + PyString_GET_SIZE(buf.outobj); 491 492 while (buf.inbuf < buf.inbuf_end) { 493 Py_ssize_t inleft, outleft; 494 495 /* we don't reuse inleft and outleft here. 496 * error callbacks can relocate the cursor anywhere on buffer*/ 497 inleft = (Py_ssize_t)(buf.inbuf_end - buf.inbuf); 498 outleft = (Py_ssize_t)(buf.outbuf_end - buf.outbuf); 499 r = codec->encode(state, codec->config, &buf.inbuf, inleft, 500 &buf.outbuf, outleft, flags); 501 if ((r == 0) || (r == MBERR_TOOFEW && !(flags & MBENC_FLUSH))) 502 break; 503 else if (multibytecodec_encerror(codec, state, &buf, errors,r)) 504 goto errorexit; 505 else if (r == MBERR_TOOFEW) 506 break; 507 } 508 509 if (codec->encreset != NULL && (flags & MBENC_RESET)) 510 for (;;) { 511 Py_ssize_t outleft; 512 513 outleft = (Py_ssize_t)(buf.outbuf_end - buf.outbuf); 514 r = codec->encreset(state, codec->config, &buf.outbuf, 515 outleft); 516 if (r == 0) 517 break; 518 else if (multibytecodec_encerror(codec, state, 519 &buf, errors, r)) 520 goto errorexit; 521 } 522 523 finalsize = (Py_ssize_t)((char *)buf.outbuf - 524 PyString_AS_STRING(buf.outobj)); 525 526 if (finalsize != PyString_GET_SIZE(buf.outobj)) 527 if (_PyString_Resize(&buf.outobj, finalsize) == -1) 528 goto errorexit; 529 530 *data = buf.inbuf; 531 Py_XDECREF(buf.excobj); 532 return buf.outobj; 533 534 errorexit: 535 Py_XDECREF(buf.excobj); 536 Py_XDECREF(buf.outobj); 537 return NULL; 538 } 539 540 static PyObject * 541 MultibyteCodec_Encode(MultibyteCodecObject *self, 542 PyObject *args, PyObject *kwargs) 543 { 544 MultibyteCodec_State state; 545 Py_UNICODE *data; 546 PyObject *errorcb, *r, *arg, *ucvt; 547 const char *errors = NULL; 548 Py_ssize_t datalen; 549 550 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|z:encode", 551 codeckwarglist, &arg, &errors)) 552 return NULL; 553 554 if (PyUnicode_Check(arg)) 555 ucvt = NULL; 556 else { 557 arg = ucvt = PyObject_Unicode(arg); 558 if (arg == NULL) 559 return NULL; 560 else if (!PyUnicode_Check(arg)) { 561 PyErr_SetString(PyExc_TypeError, 562 "couldn't convert the object to unicode."); 563 Py_DECREF(ucvt); 564 return NULL; 565 } 566 } 567 568 data = PyUnicode_AS_UNICODE(arg); 569 datalen = PyUnicode_GET_SIZE(arg); 570 571 errorcb = internal_error_callback(errors); 572 if (errorcb == NULL) { 573 Py_XDECREF(ucvt); 574 return NULL; 575 } 576 577 if (self->codec->encinit != NULL && 578 self->codec->encinit(&state, self->codec->config) != 0) 579 goto errorexit; 580 r = multibytecodec_encode(self->codec, &state, 581 (const Py_UNICODE **)&data, datalen, errorcb, 582 MBENC_FLUSH | MBENC_RESET); 583 if (r == NULL) 584 goto errorexit; 585 586 ERROR_DECREF(errorcb); 587 Py_XDECREF(ucvt); 588 return make_tuple(r, datalen); 589 590 errorexit: 591 ERROR_DECREF(errorcb); 592 Py_XDECREF(ucvt); 593 return NULL; 594 } 595 596 static PyObject * 597 MultibyteCodec_Decode(MultibyteCodecObject *self, 598 PyObject *args, PyObject *kwargs) 599 { 600 MultibyteCodec_State state; 601 MultibyteDecodeBuffer buf; 602 PyObject *errorcb; 603 Py_buffer pdata; 604 const char *data, *errors = NULL; 605 Py_ssize_t datalen, finalsize; 606 607 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*|z:decode", 608 codeckwarglist, &pdata, &errors)) 609 return NULL; 610 data = pdata.buf; 611 datalen = pdata.len; 612 613 errorcb = internal_error_callback(errors); 614 if (errorcb == NULL) { 615 PyBuffer_Release(&pdata); 616 return NULL; 617 } 618 619 if (datalen == 0) { 620 PyBuffer_Release(&pdata); 621 ERROR_DECREF(errorcb); 622 return make_tuple(PyUnicode_FromUnicode(NULL, 0), 0); 623 } 624 625 buf.excobj = NULL; 626 buf.inbuf = buf.inbuf_top = (unsigned char *)data; 627 buf.inbuf_end = buf.inbuf_top + datalen; 628 buf.outobj = PyUnicode_FromUnicode(NULL, datalen); 629 if (buf.outobj == NULL) 630 goto errorexit; 631 buf.outbuf = PyUnicode_AS_UNICODE(buf.outobj); 632 buf.outbuf_end = buf.outbuf + PyUnicode_GET_SIZE(buf.outobj); 633 634 if (self->codec->decinit != NULL && 635 self->codec->decinit(&state, self->codec->config) != 0) 636 goto errorexit; 637 638 while (buf.inbuf < buf.inbuf_end) { 639 Py_ssize_t inleft, outleft, r; 640 641 inleft = (Py_ssize_t)(buf.inbuf_end - buf.inbuf); 642 outleft = (Py_ssize_t)(buf.outbuf_end - buf.outbuf); 643 644 r = self->codec->decode(&state, self->codec->config, 645 &buf.inbuf, inleft, &buf.outbuf, outleft); 646 if (r == 0) 647 break; 648 else if (multibytecodec_decerror(self->codec, &state, 649 &buf, errorcb, r)) 650 goto errorexit; 651 } 652 653 finalsize = (Py_ssize_t)(buf.outbuf - 654 PyUnicode_AS_UNICODE(buf.outobj)); 655 656 if (finalsize != PyUnicode_GET_SIZE(buf.outobj)) 657 if (PyUnicode_Resize(&buf.outobj, finalsize) == -1) 658 goto errorexit; 659 660 PyBuffer_Release(&pdata); 661 Py_XDECREF(buf.excobj); 662 ERROR_DECREF(errorcb); 663 return make_tuple(buf.outobj, datalen); 664 665 errorexit: 666 PyBuffer_Release(&pdata); 667 ERROR_DECREF(errorcb); 668 Py_XDECREF(buf.excobj); 669 Py_XDECREF(buf.outobj); 670 671 return NULL; 672 } 673 674 static struct PyMethodDef multibytecodec_methods[] = { 675 {"encode", (PyCFunction)MultibyteCodec_Encode, 676 METH_VARARGS | METH_KEYWORDS, 677 MultibyteCodec_Encode__doc__}, 678 {"decode", (PyCFunction)MultibyteCodec_Decode, 679 METH_VARARGS | METH_KEYWORDS, 680 MultibyteCodec_Decode__doc__}, 681 {NULL, NULL}, 682 }; 683 684 static void 685 multibytecodec_dealloc(MultibyteCodecObject *self) 686 { 687 PyObject_Del(self); 688 } 689 690 static PyTypeObject MultibyteCodec_Type = { 691 PyVarObject_HEAD_INIT(NULL, 0) 692 "MultibyteCodec", /* tp_name */ 693 sizeof(MultibyteCodecObject), /* tp_basicsize */ 694 0, /* tp_itemsize */ 695 /* methods */ 696 (destructor)multibytecodec_dealloc, /* tp_dealloc */ 697 0, /* tp_print */ 698 0, /* tp_getattr */ 699 0, /* tp_setattr */ 700 0, /* tp_compare */ 701 0, /* tp_repr */ 702 0, /* tp_as_number */ 703 0, /* tp_as_sequence */ 704 0, /* tp_as_mapping */ 705 0, /* tp_hash */ 706 0, /* tp_call */ 707 0, /* tp_str */ 708 PyObject_GenericGetAttr, /* tp_getattro */ 709 0, /* tp_setattro */ 710 0, /* tp_as_buffer */ 711 Py_TPFLAGS_DEFAULT, /* tp_flags */ 712 0, /* tp_doc */ 713 0, /* tp_traverse */ 714 0, /* tp_clear */ 715 0, /* tp_richcompare */ 716 0, /* tp_weaklistoffset */ 717 0, /* tp_iter */ 718 0, /* tp_iterext */ 719 multibytecodec_methods, /* tp_methods */ 720 }; 721 722 723 /** 724 * Utility functions for stateful codec mechanism 725 */ 726 727 #define STATEFUL_DCTX(o) ((MultibyteStatefulDecoderContext *)(o)) 728 #define STATEFUL_ECTX(o) ((MultibyteStatefulEncoderContext *)(o)) 729 730 static PyObject * 731 encoder_encode_stateful(MultibyteStatefulEncoderContext *ctx, 732 PyObject *unistr, int final) 733 { 734 PyObject *ucvt, *r = NULL; 735 Py_UNICODE *inbuf, *inbuf_end, *inbuf_tmp = NULL; 736 Py_ssize_t datalen, origpending; 737 738 if (PyUnicode_Check(unistr)) 739 ucvt = NULL; 740 else { 741 unistr = ucvt = PyObject_Unicode(unistr); 742 if (unistr == NULL) 743 return NULL; 744 else if (!PyUnicode_Check(unistr)) { 745 PyErr_SetString(PyExc_TypeError, 746 "couldn't convert the object to unicode."); 747 Py_DECREF(ucvt); 748 return NULL; 749 } 750 } 751 752 datalen = PyUnicode_GET_SIZE(unistr); 753 origpending = ctx->pendingsize; 754 755 if (origpending > 0) { 756 if (datalen > PY_SSIZE_T_MAX - ctx->pendingsize) { 757 PyErr_NoMemory(); 758 /* inbuf_tmp == NULL */ 759 goto errorexit; 760 } 761 inbuf_tmp = PyMem_New(Py_UNICODE, datalen + ctx->pendingsize); 762 if (inbuf_tmp == NULL) 763 goto errorexit; 764 memcpy(inbuf_tmp, ctx->pending, 765 Py_UNICODE_SIZE * ctx->pendingsize); 766 memcpy(inbuf_tmp + ctx->pendingsize, 767 PyUnicode_AS_UNICODE(unistr), 768 Py_UNICODE_SIZE * datalen); 769 datalen += ctx->pendingsize; 770 ctx->pendingsize = 0; 771 inbuf = inbuf_tmp; 772 } 773 else 774 inbuf = (Py_UNICODE *)PyUnicode_AS_UNICODE(unistr); 775 776 inbuf_end = inbuf + datalen; 777 778 r = multibytecodec_encode(ctx->codec, &ctx->state, 779 (const Py_UNICODE **)&inbuf, datalen, 780 ctx->errors, final ? MBENC_FLUSH | MBENC_RESET : 0); 781 if (r == NULL) { 782 /* recover the original pending buffer */ 783 if (origpending > 0) 784 memcpy(ctx->pending, inbuf_tmp, 785 Py_UNICODE_SIZE * origpending); 786 ctx->pendingsize = origpending; 787 goto errorexit; 788 } 789 790 if (inbuf < inbuf_end) { 791 ctx->pendingsize = (Py_ssize_t)(inbuf_end - inbuf); 792 if (ctx->pendingsize > MAXENCPENDING) { 793 /* normal codecs can't reach here */ 794 ctx->pendingsize = 0; 795 PyErr_SetString(PyExc_UnicodeError, 796 "pending buffer overflow"); 797 goto errorexit; 798 } 799 memcpy(ctx->pending, inbuf, 800 ctx->pendingsize * Py_UNICODE_SIZE); 801 } 802 803 if (inbuf_tmp != NULL) 804 PyMem_Del(inbuf_tmp); 805 Py_XDECREF(ucvt); 806 return r; 807 808 errorexit: 809 if (inbuf_tmp != NULL) 810 PyMem_Del(inbuf_tmp); 811 Py_XDECREF(r); 812 Py_XDECREF(ucvt); 813 return NULL; 814 } 815 816 static int 817 decoder_append_pending(MultibyteStatefulDecoderContext *ctx, 818 MultibyteDecodeBuffer *buf) 819 { 820 Py_ssize_t npendings; 821 822 npendings = (Py_ssize_t)(buf->inbuf_end - buf->inbuf); 823 if (npendings + ctx->pendingsize > MAXDECPENDING || 824 npendings > PY_SSIZE_T_MAX - ctx->pendingsize) { 825 PyErr_SetString(PyExc_UnicodeError, "pending buffer overflow"); 826 return -1; 827 } 828 memcpy(ctx->pending + ctx->pendingsize, buf->inbuf, npendings); 829 ctx->pendingsize += npendings; 830 return 0; 831 } 832 833 static int 834 decoder_prepare_buffer(MultibyteDecodeBuffer *buf, const char *data, 835 Py_ssize_t size) 836 { 837 buf->inbuf = buf->inbuf_top = (const unsigned char *)data; 838 buf->inbuf_end = buf->inbuf_top + size; 839 if (buf->outobj == NULL) { /* only if outobj is not allocated yet */ 840 buf->outobj = PyUnicode_FromUnicode(NULL, size); 841 if (buf->outobj == NULL) 842 return -1; 843 buf->outbuf = PyUnicode_AS_UNICODE(buf->outobj); 844 buf->outbuf_end = buf->outbuf + 845 PyUnicode_GET_SIZE(buf->outobj); 846 } 847 848 return 0; 849 } 850 851 static int 852 decoder_feed_buffer(MultibyteStatefulDecoderContext *ctx, 853 MultibyteDecodeBuffer *buf) 854 { 855 while (buf->inbuf < buf->inbuf_end) { 856 Py_ssize_t inleft, outleft; 857 Py_ssize_t r; 858 859 inleft = (Py_ssize_t)(buf->inbuf_end - buf->inbuf); 860 outleft = (Py_ssize_t)(buf->outbuf_end - buf->outbuf); 861 862 r = ctx->codec->decode(&ctx->state, ctx->codec->config, 863 &buf->inbuf, inleft, &buf->outbuf, outleft); 864 if (r == 0 || r == MBERR_TOOFEW) 865 break; 866 else if (multibytecodec_decerror(ctx->codec, &ctx->state, 867 buf, ctx->errors, r)) 868 return -1; 869 } 870 return 0; 871 } 872 873 874 /** 875 * MultibyteIncrementalEncoder object 876 */ 877 878 static PyObject * 879 mbiencoder_encode(MultibyteIncrementalEncoderObject *self, 880 PyObject *args, PyObject *kwargs) 881 { 882 PyObject *data; 883 int final = 0; 884 885 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|i:encode", 886 incrementalkwarglist, &data, &final)) 887 return NULL; 888 889 return encoder_encode_stateful(STATEFUL_ECTX(self), data, final); 890 } 891 892 static PyObject * 893 mbiencoder_reset(MultibyteIncrementalEncoderObject *self) 894 { 895 if (self->codec->decreset != NULL && 896 self->codec->decreset(&self->state, self->codec->config) != 0) 897 return NULL; 898 self->pendingsize = 0; 899 900 Py_RETURN_NONE; 901 } 902 903 static struct PyMethodDef mbiencoder_methods[] = { 904 {"encode", (PyCFunction)mbiencoder_encode, 905 METH_VARARGS | METH_KEYWORDS, NULL}, 906 {"reset", (PyCFunction)mbiencoder_reset, 907 METH_NOARGS, NULL}, 908 {NULL, NULL}, 909 }; 910 911 static PyObject * 912 mbiencoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 913 { 914 MultibyteIncrementalEncoderObject *self; 915 PyObject *codec = NULL; 916 char *errors = NULL; 917 918 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|s:IncrementalEncoder", 919 incnewkwarglist, &errors)) 920 return NULL; 921 922 self = (MultibyteIncrementalEncoderObject *)type->tp_alloc(type, 0); 923 if (self == NULL) 924 return NULL; 925 926 codec = PyObject_GetAttrString((PyObject *)type, "codec"); 927 if (codec == NULL) 928 goto errorexit; 929 if (!MultibyteCodec_Check(codec)) { 930 PyErr_SetString(PyExc_TypeError, "codec is unexpected type"); 931 goto errorexit; 932 } 933 934 self->codec = ((MultibyteCodecObject *)codec)->codec; 935 self->pendingsize = 0; 936 self->errors = internal_error_callback(errors); 937 if (self->errors == NULL) 938 goto errorexit; 939 if (self->codec->encinit != NULL && 940 self->codec->encinit(&self->state, self->codec->config) != 0) 941 goto errorexit; 942 943 Py_DECREF(codec); 944 return (PyObject *)self; 945 946 errorexit: 947 Py_XDECREF(self); 948 Py_XDECREF(codec); 949 return NULL; 950 } 951 952 static int 953 mbiencoder_init(PyObject *self, PyObject *args, PyObject *kwds) 954 { 955 return 0; 956 } 957 958 static int 959 mbiencoder_traverse(MultibyteIncrementalEncoderObject *self, 960 visitproc visit, void *arg) 961 { 962 if (ERROR_ISCUSTOM(self->errors)) 963 Py_VISIT(self->errors); 964 return 0; 965 } 966 967 static void 968 mbiencoder_dealloc(MultibyteIncrementalEncoderObject *self) 969 { 970 PyObject_GC_UnTrack(self); 971 ERROR_DECREF(self->errors); 972 Py_TYPE(self)->tp_free(self); 973 } 974 975 static PyTypeObject MultibyteIncrementalEncoder_Type = { 976 PyVarObject_HEAD_INIT(NULL, 0) 977 "MultibyteIncrementalEncoder", /* tp_name */ 978 sizeof(MultibyteIncrementalEncoderObject), /* tp_basicsize */ 979 0, /* tp_itemsize */ 980 /* methods */ 981 (destructor)mbiencoder_dealloc, /* tp_dealloc */ 982 0, /* tp_print */ 983 0, /* tp_getattr */ 984 0, /* tp_setattr */ 985 0, /* tp_compare */ 986 0, /* tp_repr */ 987 0, /* tp_as_number */ 988 0, /* tp_as_sequence */ 989 0, /* tp_as_mapping */ 990 0, /* tp_hash */ 991 0, /* tp_call */ 992 0, /* tp_str */ 993 PyObject_GenericGetAttr, /* tp_getattro */ 994 0, /* tp_setattro */ 995 0, /* tp_as_buffer */ 996 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC 997 | Py_TPFLAGS_BASETYPE, /* tp_flags */ 998 0, /* tp_doc */ 999 (traverseproc)mbiencoder_traverse, /* tp_traverse */ 1000 0, /* tp_clear */ 1001 0, /* tp_richcompare */ 1002 0, /* tp_weaklistoffset */ 1003 0, /* tp_iter */ 1004 0, /* tp_iterext */ 1005 mbiencoder_methods, /* tp_methods */ 1006 0, /* tp_members */ 1007 codecctx_getsets, /* tp_getset */ 1008 0, /* tp_base */ 1009 0, /* tp_dict */ 1010 0, /* tp_descr_get */ 1011 0, /* tp_descr_set */ 1012 0, /* tp_dictoffset */ 1013 mbiencoder_init, /* tp_init */ 1014 0, /* tp_alloc */ 1015 mbiencoder_new, /* tp_new */ 1016 }; 1017 1018 1019 /** 1020 * MultibyteIncrementalDecoder object 1021 */ 1022 1023 static PyObject * 1024 mbidecoder_decode(MultibyteIncrementalDecoderObject *self, 1025 PyObject *args, PyObject *kwargs) 1026 { 1027 MultibyteDecodeBuffer buf; 1028 char *data, *wdata = NULL; 1029 Py_buffer pdata; 1030 Py_ssize_t wsize, finalsize = 0, size, origpending; 1031 int final = 0; 1032 1033 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*|i:decode", 1034 incrementalkwarglist, &pdata, &final)) 1035 return NULL; 1036 data = pdata.buf; 1037 size = pdata.len; 1038 1039 buf.outobj = buf.excobj = NULL; 1040 origpending = self->pendingsize; 1041 1042 if (self->pendingsize == 0) { 1043 wsize = size; 1044 wdata = data; 1045 } 1046 else { 1047 if (size > PY_SSIZE_T_MAX - self->pendingsize) { 1048 PyErr_NoMemory(); 1049 goto errorexit; 1050 } 1051 wsize = size + self->pendingsize; 1052 wdata = PyMem_Malloc(wsize); 1053 if (wdata == NULL) 1054 goto errorexit; 1055 memcpy(wdata, self->pending, self->pendingsize); 1056 memcpy(wdata + self->pendingsize, data, size); 1057 self->pendingsize = 0; 1058 } 1059 1060 if (decoder_prepare_buffer(&buf, wdata, wsize) != 0) 1061 goto errorexit; 1062 1063 if (decoder_feed_buffer(STATEFUL_DCTX(self), &buf)) 1064 goto errorexit; 1065 1066 if (final && buf.inbuf < buf.inbuf_end) { 1067 if (multibytecodec_decerror(self->codec, &self->state, 1068 &buf, self->errors, MBERR_TOOFEW)) { 1069 /* recover the original pending buffer */ 1070 memcpy(self->pending, wdata, origpending); 1071 self->pendingsize = origpending; 1072 goto errorexit; 1073 } 1074 } 1075 1076 if (buf.inbuf < buf.inbuf_end) { /* pending sequence still exists */ 1077 if (decoder_append_pending(STATEFUL_DCTX(self), &buf) != 0) 1078 goto errorexit; 1079 } 1080 1081 finalsize = (Py_ssize_t)(buf.outbuf - PyUnicode_AS_UNICODE(buf.outobj)); 1082 if (finalsize != PyUnicode_GET_SIZE(buf.outobj)) 1083 if (PyUnicode_Resize(&buf.outobj, finalsize) == -1) 1084 goto errorexit; 1085 1086 PyBuffer_Release(&pdata); 1087 if (wdata != data) 1088 PyMem_Del(wdata); 1089 Py_XDECREF(buf.excobj); 1090 return buf.outobj; 1091 1092 errorexit: 1093 PyBuffer_Release(&pdata); 1094 if (wdata != NULL && wdata != data) 1095 PyMem_Del(wdata); 1096 Py_XDECREF(buf.excobj); 1097 Py_XDECREF(buf.outobj); 1098 return NULL; 1099 } 1100 1101 static PyObject * 1102 mbidecoder_reset(MultibyteIncrementalDecoderObject *self) 1103 { 1104 if (self->codec->decreset != NULL && 1105 self->codec->decreset(&self->state, self->codec->config) != 0) 1106 return NULL; 1107 self->pendingsize = 0; 1108 1109 Py_RETURN_NONE; 1110 } 1111 1112 static struct PyMethodDef mbidecoder_methods[] = { 1113 {"decode", (PyCFunction)mbidecoder_decode, 1114 METH_VARARGS | METH_KEYWORDS, NULL}, 1115 {"reset", (PyCFunction)mbidecoder_reset, 1116 METH_NOARGS, NULL}, 1117 {NULL, NULL}, 1118 }; 1119 1120 static PyObject * 1121 mbidecoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 1122 { 1123 MultibyteIncrementalDecoderObject *self; 1124 PyObject *codec = NULL; 1125 char *errors = NULL; 1126 1127 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|s:IncrementalDecoder", 1128 incnewkwarglist, &errors)) 1129 return NULL; 1130 1131 self = (MultibyteIncrementalDecoderObject *)type->tp_alloc(type, 0); 1132 if (self == NULL) 1133 return NULL; 1134 1135 codec = PyObject_GetAttrString((PyObject *)type, "codec"); 1136 if (codec == NULL) 1137 goto errorexit; 1138 if (!MultibyteCodec_Check(codec)) { 1139 PyErr_SetString(PyExc_TypeError, "codec is unexpected type"); 1140 goto errorexit; 1141 } 1142 1143 self->codec = ((MultibyteCodecObject *)codec)->codec; 1144 self->pendingsize = 0; 1145 self->errors = internal_error_callback(errors); 1146 if (self->errors == NULL) 1147 goto errorexit; 1148 if (self->codec->decinit != NULL && 1149 self->codec->decinit(&self->state, self->codec->config) != 0) 1150 goto errorexit; 1151 1152 Py_DECREF(codec); 1153 return (PyObject *)self; 1154 1155 errorexit: 1156 Py_XDECREF(self); 1157 Py_XDECREF(codec); 1158 return NULL; 1159 } 1160 1161 static int 1162 mbidecoder_init(PyObject *self, PyObject *args, PyObject *kwds) 1163 { 1164 return 0; 1165 } 1166 1167 static int 1168 mbidecoder_traverse(MultibyteIncrementalDecoderObject *self, 1169 visitproc visit, void *arg) 1170 { 1171 if (ERROR_ISCUSTOM(self->errors)) 1172 Py_VISIT(self->errors); 1173 return 0; 1174 } 1175 1176 static void 1177 mbidecoder_dealloc(MultibyteIncrementalDecoderObject *self) 1178 { 1179 PyObject_GC_UnTrack(self); 1180 ERROR_DECREF(self->errors); 1181 Py_TYPE(self)->tp_free(self); 1182 } 1183 1184 static PyTypeObject MultibyteIncrementalDecoder_Type = { 1185 PyVarObject_HEAD_INIT(NULL, 0) 1186 "MultibyteIncrementalDecoder", /* tp_name */ 1187 sizeof(MultibyteIncrementalDecoderObject), /* tp_basicsize */ 1188 0, /* tp_itemsize */ 1189 /* methods */ 1190 (destructor)mbidecoder_dealloc, /* tp_dealloc */ 1191 0, /* tp_print */ 1192 0, /* tp_getattr */ 1193 0, /* tp_setattr */ 1194 0, /* tp_compare */ 1195 0, /* tp_repr */ 1196 0, /* tp_as_number */ 1197 0, /* tp_as_sequence */ 1198 0, /* tp_as_mapping */ 1199 0, /* tp_hash */ 1200 0, /* tp_call */ 1201 0, /* tp_str */ 1202 PyObject_GenericGetAttr, /* tp_getattro */ 1203 0, /* tp_setattro */ 1204 0, /* tp_as_buffer */ 1205 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC 1206 | Py_TPFLAGS_BASETYPE, /* tp_flags */ 1207 0, /* tp_doc */ 1208 (traverseproc)mbidecoder_traverse, /* tp_traverse */ 1209 0, /* tp_clear */ 1210 0, /* tp_richcompare */ 1211 0, /* tp_weaklistoffset */ 1212 0, /* tp_iter */ 1213 0, /* tp_iterext */ 1214 mbidecoder_methods, /* tp_methods */ 1215 0, /* tp_members */ 1216 codecctx_getsets, /* tp_getset */ 1217 0, /* tp_base */ 1218 0, /* tp_dict */ 1219 0, /* tp_descr_get */ 1220 0, /* tp_descr_set */ 1221 0, /* tp_dictoffset */ 1222 mbidecoder_init, /* tp_init */ 1223 0, /* tp_alloc */ 1224 mbidecoder_new, /* tp_new */ 1225 }; 1226 1227 1228 /** 1229 * MultibyteStreamReader object 1230 */ 1231 1232 static PyObject * 1233 mbstreamreader_iread(MultibyteStreamReaderObject *self, 1234 const char *method, Py_ssize_t sizehint) 1235 { 1236 MultibyteDecodeBuffer buf; 1237 PyObject *cres; 1238 Py_ssize_t rsize, finalsize = 0; 1239 1240 if (sizehint == 0) 1241 return PyUnicode_FromUnicode(NULL, 0); 1242 1243 buf.outobj = buf.excobj = NULL; 1244 cres = NULL; 1245 1246 for (;;) { 1247 int endoffile; 1248 1249 if (sizehint < 0) 1250 cres = PyObject_CallMethod(self->stream, 1251 (char *)method, NULL); 1252 else 1253 cres = PyObject_CallMethod(self->stream, 1254 (char *)method, "i", sizehint); 1255 if (cres == NULL) 1256 goto errorexit; 1257 1258 if (!PyString_Check(cres)) { 1259 PyErr_SetString(PyExc_TypeError, 1260 "stream function returned a " 1261 "non-string object"); 1262 goto errorexit; 1263 } 1264 1265 endoffile = (PyString_GET_SIZE(cres) == 0); 1266 1267 if (self->pendingsize > 0) { 1268 PyObject *ctr; 1269 char *ctrdata; 1270 1271 if (PyString_GET_SIZE(cres) > PY_SSIZE_T_MAX - self->pendingsize) { 1272 PyErr_NoMemory(); 1273 goto errorexit; 1274 } 1275 rsize = PyString_GET_SIZE(cres) + self->pendingsize; 1276 ctr = PyString_FromStringAndSize(NULL, rsize); 1277 if (ctr == NULL) 1278 goto errorexit; 1279 ctrdata = PyString_AS_STRING(ctr); 1280 memcpy(ctrdata, self->pending, self->pendingsize); 1281 memcpy(ctrdata + self->pendingsize, 1282 PyString_AS_STRING(cres), 1283 PyString_GET_SIZE(cres)); 1284 Py_DECREF(cres); 1285 cres = ctr; 1286 self->pendingsize = 0; 1287 } 1288 1289 rsize = PyString_GET_SIZE(cres); 1290 if (decoder_prepare_buffer(&buf, PyString_AS_STRING(cres), 1291 rsize) != 0) 1292 goto errorexit; 1293 1294 if (rsize > 0 && decoder_feed_buffer( 1295 (MultibyteStatefulDecoderContext *)self, &buf)) 1296 goto errorexit; 1297 1298 if (endoffile || sizehint < 0) { 1299 if (buf.inbuf < buf.inbuf_end && 1300 multibytecodec_decerror(self->codec, &self->state, 1301 &buf, self->errors, MBERR_TOOFEW)) 1302 goto errorexit; 1303 } 1304 1305 if (buf.inbuf < buf.inbuf_end) { /* pending sequence exists */ 1306 if (decoder_append_pending(STATEFUL_DCTX(self), 1307 &buf) != 0) 1308 goto errorexit; 1309 } 1310 1311 finalsize = (Py_ssize_t)(buf.outbuf - 1312 PyUnicode_AS_UNICODE(buf.outobj)); 1313 Py_DECREF(cres); 1314 cres = NULL; 1315 1316 if (sizehint < 0 || finalsize != 0 || rsize == 0) 1317 break; 1318 1319 sizehint = 1; /* read 1 more byte and retry */ 1320 } 1321 1322 if (finalsize != PyUnicode_GET_SIZE(buf.outobj)) 1323 if (PyUnicode_Resize(&buf.outobj, finalsize) == -1) 1324 goto errorexit; 1325 1326 Py_XDECREF(cres); 1327 Py_XDECREF(buf.excobj); 1328 return buf.outobj; 1329 1330 errorexit: 1331 Py_XDECREF(cres); 1332 Py_XDECREF(buf.excobj); 1333 Py_XDECREF(buf.outobj); 1334 return NULL; 1335 } 1336 1337 static PyObject * 1338 mbstreamreader_read(MultibyteStreamReaderObject *self, PyObject *args) 1339 { 1340 PyObject *sizeobj = NULL; 1341 Py_ssize_t size; 1342 1343 if (!PyArg_UnpackTuple(args, "read", 0, 1, &sizeobj)) 1344 return NULL; 1345 1346 if (sizeobj == Py_None || sizeobj == NULL) 1347 size = -1; 1348 else if (PyInt_Check(sizeobj)) 1349 size = PyInt_AsSsize_t(sizeobj); 1350 else { 1351 PyErr_SetString(PyExc_TypeError, "arg 1 must be an integer"); 1352 return NULL; 1353 } 1354 1355 return mbstreamreader_iread(self, "read", size); 1356 } 1357 1358 static PyObject * 1359 mbstreamreader_readline(MultibyteStreamReaderObject *self, PyObject *args) 1360 { 1361 PyObject *sizeobj = NULL; 1362 Py_ssize_t size; 1363 1364 if (!PyArg_UnpackTuple(args, "readline", 0, 1, &sizeobj)) 1365 return NULL; 1366 1367 if (sizeobj == Py_None || sizeobj == NULL) 1368 size = -1; 1369 else if (PyInt_Check(sizeobj)) 1370 size = PyInt_AsSsize_t(sizeobj); 1371 else { 1372 PyErr_SetString(PyExc_TypeError, "arg 1 must be an integer"); 1373 return NULL; 1374 } 1375 1376 return mbstreamreader_iread(self, "readline", size); 1377 } 1378 1379 static PyObject * 1380 mbstreamreader_readlines(MultibyteStreamReaderObject *self, PyObject *args) 1381 { 1382 PyObject *sizehintobj = NULL, *r, *sr; 1383 Py_ssize_t sizehint; 1384 1385 if (!PyArg_UnpackTuple(args, "readlines", 0, 1, &sizehintobj)) 1386 return NULL; 1387 1388 if (sizehintobj == Py_None || sizehintobj == NULL) 1389 sizehint = -1; 1390 else if (PyInt_Check(sizehintobj)) 1391 sizehint = PyInt_AsSsize_t(sizehintobj); 1392 else { 1393 PyErr_SetString(PyExc_TypeError, "arg 1 must be an integer"); 1394 return NULL; 1395 } 1396 1397 r = mbstreamreader_iread(self, "read", sizehint); 1398 if (r == NULL) 1399 return NULL; 1400 1401 sr = PyUnicode_Splitlines(r, 1); 1402 Py_DECREF(r); 1403 return sr; 1404 } 1405 1406 static PyObject * 1407 mbstreamreader_reset(MultibyteStreamReaderObject *self) 1408 { 1409 if (self->codec->decreset != NULL && 1410 self->codec->decreset(&self->state, self->codec->config) != 0) 1411 return NULL; 1412 self->pendingsize = 0; 1413 1414 Py_RETURN_NONE; 1415 } 1416 1417 static struct PyMethodDef mbstreamreader_methods[] = { 1418 {"read", (PyCFunction)mbstreamreader_read, 1419 METH_VARARGS, NULL}, 1420 {"readline", (PyCFunction)mbstreamreader_readline, 1421 METH_VARARGS, NULL}, 1422 {"readlines", (PyCFunction)mbstreamreader_readlines, 1423 METH_VARARGS, NULL}, 1424 {"reset", (PyCFunction)mbstreamreader_reset, 1425 METH_NOARGS, NULL}, 1426 {NULL, NULL}, 1427 }; 1428 1429 static PyMemberDef mbstreamreader_members[] = { 1430 {"stream", T_OBJECT, 1431 offsetof(MultibyteStreamReaderObject, stream), 1432 READONLY, NULL}, 1433 {NULL,} 1434 }; 1435 1436 static PyObject * 1437 mbstreamreader_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 1438 { 1439 MultibyteStreamReaderObject *self; 1440 PyObject *stream, *codec = NULL; 1441 char *errors = NULL; 1442 1443 if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|s:StreamReader", 1444 streamkwarglist, &stream, &errors)) 1445 return NULL; 1446 1447 self = (MultibyteStreamReaderObject *)type->tp_alloc(type, 0); 1448 if (self == NULL) 1449 return NULL; 1450 1451 codec = PyObject_GetAttrString((PyObject *)type, "codec"); 1452 if (codec == NULL) 1453 goto errorexit; 1454 if (!MultibyteCodec_Check(codec)) { 1455 PyErr_SetString(PyExc_TypeError, "codec is unexpected type"); 1456 goto errorexit; 1457 } 1458 1459 self->codec = ((MultibyteCodecObject *)codec)->codec; 1460 self->stream = stream; 1461 Py_INCREF(stream); 1462 self->pendingsize = 0; 1463 self->errors = internal_error_callback(errors); 1464 if (self->errors == NULL) 1465 goto errorexit; 1466 if (self->codec->decinit != NULL && 1467 self->codec->decinit(&self->state, self->codec->config) != 0) 1468 goto errorexit; 1469 1470 Py_DECREF(codec); 1471 return (PyObject *)self; 1472 1473 errorexit: 1474 Py_XDECREF(self); 1475 Py_XDECREF(codec); 1476 return NULL; 1477 } 1478 1479 static int 1480 mbstreamreader_init(PyObject *self, PyObject *args, PyObject *kwds) 1481 { 1482 return 0; 1483 } 1484 1485 static int 1486 mbstreamreader_traverse(MultibyteStreamReaderObject *self, 1487 visitproc visit, void *arg) 1488 { 1489 if (ERROR_ISCUSTOM(self->errors)) 1490 Py_VISIT(self->errors); 1491 Py_VISIT(self->stream); 1492 return 0; 1493 } 1494 1495 static void 1496 mbstreamreader_dealloc(MultibyteStreamReaderObject *self) 1497 { 1498 PyObject_GC_UnTrack(self); 1499 ERROR_DECREF(self->errors); 1500 Py_XDECREF(self->stream); 1501 Py_TYPE(self)->tp_free(self); 1502 } 1503 1504 static PyTypeObject MultibyteStreamReader_Type = { 1505 PyVarObject_HEAD_INIT(NULL, 0) 1506 "MultibyteStreamReader", /* tp_name */ 1507 sizeof(MultibyteStreamReaderObject), /* tp_basicsize */ 1508 0, /* tp_itemsize */ 1509 /* methods */ 1510 (destructor)mbstreamreader_dealloc, /* tp_dealloc */ 1511 0, /* tp_print */ 1512 0, /* tp_getattr */ 1513 0, /* tp_setattr */ 1514 0, /* tp_compare */ 1515 0, /* tp_repr */ 1516 0, /* tp_as_number */ 1517 0, /* tp_as_sequence */ 1518 0, /* tp_as_mapping */ 1519 0, /* tp_hash */ 1520 0, /* tp_call */ 1521 0, /* tp_str */ 1522 PyObject_GenericGetAttr, /* tp_getattro */ 1523 0, /* tp_setattro */ 1524 0, /* tp_as_buffer */ 1525 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC 1526 | Py_TPFLAGS_BASETYPE, /* tp_flags */ 1527 0, /* tp_doc */ 1528 (traverseproc)mbstreamreader_traverse, /* tp_traverse */ 1529 0, /* tp_clear */ 1530 0, /* tp_richcompare */ 1531 0, /* tp_weaklistoffset */ 1532 0, /* tp_iter */ 1533 0, /* tp_iterext */ 1534 mbstreamreader_methods, /* tp_methods */ 1535 mbstreamreader_members, /* tp_members */ 1536 codecctx_getsets, /* tp_getset */ 1537 0, /* tp_base */ 1538 0, /* tp_dict */ 1539 0, /* tp_descr_get */ 1540 0, /* tp_descr_set */ 1541 0, /* tp_dictoffset */ 1542 mbstreamreader_init, /* tp_init */ 1543 0, /* tp_alloc */ 1544 mbstreamreader_new, /* tp_new */ 1545 }; 1546 1547 1548 /** 1549 * MultibyteStreamWriter object 1550 */ 1551 1552 static int 1553 mbstreamwriter_iwrite(MultibyteStreamWriterObject *self, 1554 PyObject *unistr) 1555 { 1556 PyObject *str, *wr; 1557 1558 str = encoder_encode_stateful(STATEFUL_ECTX(self), unistr, 0); 1559 if (str == NULL) 1560 return -1; 1561 1562 wr = PyObject_CallMethod(self->stream, "write", "O", str); 1563 Py_DECREF(str); 1564 if (wr == NULL) 1565 return -1; 1566 1567 Py_DECREF(wr); 1568 return 0; 1569 } 1570 1571 static PyObject * 1572 mbstreamwriter_write(MultibyteStreamWriterObject *self, PyObject *strobj) 1573 { 1574 if (mbstreamwriter_iwrite(self, strobj)) 1575 return NULL; 1576 else 1577 Py_RETURN_NONE; 1578 } 1579 1580 static PyObject * 1581 mbstreamwriter_writelines(MultibyteStreamWriterObject *self, PyObject *lines) 1582 { 1583 PyObject *strobj; 1584 int i, r; 1585 1586 if (!PySequence_Check(lines)) { 1587 PyErr_SetString(PyExc_TypeError, 1588 "arg must be a sequence object"); 1589 return NULL; 1590 } 1591 1592 for (i = 0; i < PySequence_Length(lines); i++) { 1593 /* length can be changed even within this loop */ 1594 strobj = PySequence_GetItem(lines, i); 1595 if (strobj == NULL) 1596 return NULL; 1597 1598 r = mbstreamwriter_iwrite(self, strobj); 1599 Py_DECREF(strobj); 1600 if (r == -1) 1601 return NULL; 1602 } 1603 1604 Py_RETURN_NONE; 1605 } 1606 1607 static PyObject * 1608 mbstreamwriter_reset(MultibyteStreamWriterObject *self) 1609 { 1610 const Py_UNICODE *pending; 1611 PyObject *pwrt; 1612 1613 pending = self->pending; 1614 pwrt = multibytecodec_encode(self->codec, &self->state, 1615 &pending, self->pendingsize, self->errors, 1616 MBENC_FLUSH | MBENC_RESET); 1617 /* some pending buffer can be truncated when UnicodeEncodeError is 1618 * raised on 'strict' mode. but, 'reset' method is designed to 1619 * reset the pending buffer or states so failed string sequence 1620 * ought to be missed */ 1621 self->pendingsize = 0; 1622 if (pwrt == NULL) 1623 return NULL; 1624 1625 if (PyString_Size(pwrt) > 0) { 1626 PyObject *wr; 1627 wr = PyObject_CallMethod(self->stream, "write", "O", pwrt); 1628 if (wr == NULL) { 1629 Py_DECREF(pwrt); 1630 return NULL; 1631 } 1632 } 1633 Py_DECREF(pwrt); 1634 1635 Py_RETURN_NONE; 1636 } 1637 1638 static PyObject * 1639 mbstreamwriter_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 1640 { 1641 MultibyteStreamWriterObject *self; 1642 PyObject *stream, *codec = NULL; 1643 char *errors = NULL; 1644 1645 if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|s:StreamWriter", 1646 streamkwarglist, &stream, &errors)) 1647 return NULL; 1648 1649 self = (MultibyteStreamWriterObject *)type->tp_alloc(type, 0); 1650 if (self == NULL) 1651 return NULL; 1652 1653 codec = PyObject_GetAttrString((PyObject *)type, "codec"); 1654 if (codec == NULL) 1655 goto errorexit; 1656 if (!MultibyteCodec_Check(codec)) { 1657 PyErr_SetString(PyExc_TypeError, "codec is unexpected type"); 1658 goto errorexit; 1659 } 1660 1661 self->codec = ((MultibyteCodecObject *)codec)->codec; 1662 self->stream = stream; 1663 Py_INCREF(stream); 1664 self->pendingsize = 0; 1665 self->errors = internal_error_callback(errors); 1666 if (self->errors == NULL) 1667 goto errorexit; 1668 if (self->codec->encinit != NULL && 1669 self->codec->encinit(&self->state, self->codec->config) != 0) 1670 goto errorexit; 1671 1672 Py_DECREF(codec); 1673 return (PyObject *)self; 1674 1675 errorexit: 1676 Py_XDECREF(self); 1677 Py_XDECREF(codec); 1678 return NULL; 1679 } 1680 1681 static int 1682 mbstreamwriter_init(PyObject *self, PyObject *args, PyObject *kwds) 1683 { 1684 return 0; 1685 } 1686 1687 static int 1688 mbstreamwriter_traverse(MultibyteStreamWriterObject *self, 1689 visitproc visit, void *arg) 1690 { 1691 if (ERROR_ISCUSTOM(self->errors)) 1692 Py_VISIT(self->errors); 1693 Py_VISIT(self->stream); 1694 return 0; 1695 } 1696 1697 static void 1698 mbstreamwriter_dealloc(MultibyteStreamWriterObject *self) 1699 { 1700 PyObject_GC_UnTrack(self); 1701 ERROR_DECREF(self->errors); 1702 Py_XDECREF(self->stream); 1703 Py_TYPE(self)->tp_free(self); 1704 } 1705 1706 static struct PyMethodDef mbstreamwriter_methods[] = { 1707 {"write", (PyCFunction)mbstreamwriter_write, 1708 METH_O, NULL}, 1709 {"writelines", (PyCFunction)mbstreamwriter_writelines, 1710 METH_O, NULL}, 1711 {"reset", (PyCFunction)mbstreamwriter_reset, 1712 METH_NOARGS, NULL}, 1713 {NULL, NULL}, 1714 }; 1715 1716 static PyMemberDef mbstreamwriter_members[] = { 1717 {"stream", T_OBJECT, 1718 offsetof(MultibyteStreamWriterObject, stream), 1719 READONLY, NULL}, 1720 {NULL,} 1721 }; 1722 1723 static PyTypeObject MultibyteStreamWriter_Type = { 1724 PyVarObject_HEAD_INIT(NULL, 0) 1725 "MultibyteStreamWriter", /* tp_name */ 1726 sizeof(MultibyteStreamWriterObject), /* tp_basicsize */ 1727 0, /* tp_itemsize */ 1728 /* methods */ 1729 (destructor)mbstreamwriter_dealloc, /* tp_dealloc */ 1730 0, /* tp_print */ 1731 0, /* tp_getattr */ 1732 0, /* tp_setattr */ 1733 0, /* tp_compare */ 1734 0, /* tp_repr */ 1735 0, /* tp_as_number */ 1736 0, /* tp_as_sequence */ 1737 0, /* tp_as_mapping */ 1738 0, /* tp_hash */ 1739 0, /* tp_call */ 1740 0, /* tp_str */ 1741 PyObject_GenericGetAttr, /* tp_getattro */ 1742 0, /* tp_setattro */ 1743 0, /* tp_as_buffer */ 1744 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC 1745 | Py_TPFLAGS_BASETYPE, /* tp_flags */ 1746 0, /* tp_doc */ 1747 (traverseproc)mbstreamwriter_traverse, /* tp_traverse */ 1748 0, /* tp_clear */ 1749 0, /* tp_richcompare */ 1750 0, /* tp_weaklistoffset */ 1751 0, /* tp_iter */ 1752 0, /* tp_iterext */ 1753 mbstreamwriter_methods, /* tp_methods */ 1754 mbstreamwriter_members, /* tp_members */ 1755 codecctx_getsets, /* tp_getset */ 1756 0, /* tp_base */ 1757 0, /* tp_dict */ 1758 0, /* tp_descr_get */ 1759 0, /* tp_descr_set */ 1760 0, /* tp_dictoffset */ 1761 mbstreamwriter_init, /* tp_init */ 1762 0, /* tp_alloc */ 1763 mbstreamwriter_new, /* tp_new */ 1764 }; 1765 1766 1767 /** 1768 * Exposed factory function 1769 */ 1770 1771 static PyObject * 1772 __create_codec(PyObject *ignore, PyObject *arg) 1773 { 1774 MultibyteCodecObject *self; 1775 MultibyteCodec *codec; 1776 1777 if (!PyCapsule_IsValid(arg, PyMultibyteCodec_CAPSULE_NAME)) { 1778 PyErr_SetString(PyExc_ValueError, "argument type invalid"); 1779 return NULL; 1780 } 1781 1782 codec = PyCapsule_GetPointer(arg, PyMultibyteCodec_CAPSULE_NAME); 1783 if (codec->codecinit != NULL && codec->codecinit(codec->config) != 0) 1784 return NULL; 1785 1786 self = PyObject_New(MultibyteCodecObject, &MultibyteCodec_Type); 1787 if (self == NULL) 1788 return NULL; 1789 self->codec = codec; 1790 1791 return (PyObject *)self; 1792 } 1793 1794 static struct PyMethodDef __methods[] = { 1795 {"__create_codec", (PyCFunction)__create_codec, METH_O}, 1796 {NULL, NULL}, 1797 }; 1798 1799 PyMODINIT_FUNC 1800 init_multibytecodec(void) 1801 { 1802 int i; 1803 PyObject *m; 1804 PyTypeObject *typelist[] = { 1805 &MultibyteIncrementalEncoder_Type, 1806 &MultibyteIncrementalDecoder_Type, 1807 &MultibyteStreamReader_Type, 1808 &MultibyteStreamWriter_Type, 1809 NULL 1810 }; 1811 1812 if (PyType_Ready(&MultibyteCodec_Type) < 0) 1813 return; 1814 1815 m = Py_InitModule("_multibytecodec", __methods); 1816 if (m == NULL) 1817 return; 1818 1819 for (i = 0; typelist[i] != NULL; i++) { 1820 if (PyType_Ready(typelist[i]) < 0) 1821 return; 1822 Py_INCREF(typelist[i]); 1823 PyModule_AddObject(m, typelist[i]->tp_name, 1824 (PyObject *)typelist[i]); 1825 } 1826 1827 if (PyErr_Occurred()) 1828 Py_FatalError("can't initialize the _multibytecodec module"); 1829 } 1830