1 /* ------------------------------------------------------------------------ 2 3 Python Codec Registry and support functions 4 5 Written by Marc-Andre Lemburg (mal (at) lemburg.com). 6 7 Copyright (c) Corporation for National Research Initiatives. 8 9 ------------------------------------------------------------------------ */ 10 11 #include "Python.h" 12 #include "internal/pystate.h" 13 #include "ucnhash.h" 14 #include <ctype.h> 15 16 const char *Py_hexdigits = "0123456789abcdef"; 17 18 /* --- Codec Registry ----------------------------------------------------- */ 19 20 /* Import the standard encodings package which will register the first 21 codec search function. 22 23 This is done in a lazy way so that the Unicode implementation does 24 not downgrade startup time of scripts not needing it. 25 26 ImportErrors are silently ignored by this function. Only one try is 27 made. 28 29 */ 30 31 static int _PyCodecRegistry_Init(void); /* Forward */ 32 33 int PyCodec_Register(PyObject *search_function) 34 { 35 PyInterpreterState *interp = PyThreadState_GET()->interp; 36 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) 37 goto onError; 38 if (search_function == NULL) { 39 PyErr_BadArgument(); 40 goto onError; 41 } 42 if (!PyCallable_Check(search_function)) { 43 PyErr_SetString(PyExc_TypeError, "argument must be callable"); 44 goto onError; 45 } 46 return PyList_Append(interp->codec_search_path, search_function); 47 48 onError: 49 return -1; 50 } 51 52 /* Convert a string to a normalized Python string: all characters are 53 converted to lower case, spaces are replaced with underscores. */ 54 55 static 56 PyObject *normalizestring(const char *string) 57 { 58 size_t i; 59 size_t len = strlen(string); 60 char *p; 61 PyObject *v; 62 63 if (len > PY_SSIZE_T_MAX) { 64 PyErr_SetString(PyExc_OverflowError, "string is too large"); 65 return NULL; 66 } 67 68 p = PyMem_Malloc(len + 1); 69 if (p == NULL) 70 return PyErr_NoMemory(); 71 for (i = 0; i < len; i++) { 72 char ch = string[i]; 73 if (ch == ' ') 74 ch = '-'; 75 else 76 ch = Py_TOLOWER(Py_CHARMASK(ch)); 77 p[i] = ch; 78 } 79 p[i] = '\0'; 80 v = PyUnicode_FromString(p); 81 PyMem_Free(p); 82 return v; 83 } 84 85 /* Lookup the given encoding and return a tuple providing the codec 86 facilities. 87 88 The encoding string is looked up converted to all lower-case 89 characters. This makes encodings looked up through this mechanism 90 effectively case-insensitive. 91 92 If no codec is found, a LookupError is set and NULL returned. 93 94 As side effect, this tries to load the encodings package, if not 95 yet done. This is part of the lazy load strategy for the encodings 96 package. 97 98 */ 99 100 PyObject *_PyCodec_Lookup(const char *encoding) 101 { 102 PyInterpreterState *interp; 103 PyObject *result, *args = NULL, *v; 104 Py_ssize_t i, len; 105 106 if (encoding == NULL) { 107 PyErr_BadArgument(); 108 goto onError; 109 } 110 111 interp = PyThreadState_GET()->interp; 112 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) 113 goto onError; 114 115 /* Convert the encoding to a normalized Python string: all 116 characters are converted to lower case, spaces and hyphens are 117 replaced with underscores. */ 118 v = normalizestring(encoding); 119 if (v == NULL) 120 goto onError; 121 PyUnicode_InternInPlace(&v); 122 123 /* First, try to lookup the name in the registry dictionary */ 124 result = PyDict_GetItem(interp->codec_search_cache, v); 125 if (result != NULL) { 126 Py_INCREF(result); 127 Py_DECREF(v); 128 return result; 129 } 130 131 /* Next, scan the search functions in order of registration */ 132 args = PyTuple_New(1); 133 if (args == NULL) { 134 Py_DECREF(v); 135 return NULL; 136 } 137 PyTuple_SET_ITEM(args,0,v); 138 139 len = PyList_Size(interp->codec_search_path); 140 if (len < 0) 141 goto onError; 142 if (len == 0) { 143 PyErr_SetString(PyExc_LookupError, 144 "no codec search functions registered: " 145 "can't find encoding"); 146 goto onError; 147 } 148 149 for (i = 0; i < len; i++) { 150 PyObject *func; 151 152 func = PyList_GetItem(interp->codec_search_path, i); 153 if (func == NULL) 154 goto onError; 155 result = PyEval_CallObject(func, args); 156 if (result == NULL) 157 goto onError; 158 if (result == Py_None) { 159 Py_DECREF(result); 160 continue; 161 } 162 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) { 163 PyErr_SetString(PyExc_TypeError, 164 "codec search functions must return 4-tuples"); 165 Py_DECREF(result); 166 goto onError; 167 } 168 break; 169 } 170 if (i == len) { 171 /* XXX Perhaps we should cache misses too ? */ 172 PyErr_Format(PyExc_LookupError, 173 "unknown encoding: %s", encoding); 174 goto onError; 175 } 176 177 /* Cache and return the result */ 178 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) { 179 Py_DECREF(result); 180 goto onError; 181 } 182 Py_DECREF(args); 183 return result; 184 185 onError: 186 Py_XDECREF(args); 187 return NULL; 188 } 189 190 int _PyCodec_Forget(const char *encoding) 191 { 192 PyInterpreterState *interp; 193 PyObject *v; 194 int result; 195 196 interp = PyThreadState_GET()->interp; 197 if (interp->codec_search_path == NULL) { 198 return -1; 199 } 200 201 /* Convert the encoding to a normalized Python string: all 202 characters are converted to lower case, spaces and hyphens are 203 replaced with underscores. */ 204 v = normalizestring(encoding); 205 if (v == NULL) { 206 return -1; 207 } 208 209 /* Drop the named codec from the internal cache */ 210 result = PyDict_DelItem(interp->codec_search_cache, v); 211 Py_DECREF(v); 212 213 return result; 214 } 215 216 /* Codec registry encoding check API. */ 217 218 int PyCodec_KnownEncoding(const char *encoding) 219 { 220 PyObject *codecs; 221 222 codecs = _PyCodec_Lookup(encoding); 223 if (!codecs) { 224 PyErr_Clear(); 225 return 0; 226 } 227 else { 228 Py_DECREF(codecs); 229 return 1; 230 } 231 } 232 233 static 234 PyObject *args_tuple(PyObject *object, 235 const char *errors) 236 { 237 PyObject *args; 238 239 args = PyTuple_New(1 + (errors != NULL)); 240 if (args == NULL) 241 return NULL; 242 Py_INCREF(object); 243 PyTuple_SET_ITEM(args,0,object); 244 if (errors) { 245 PyObject *v; 246 247 v = PyUnicode_FromString(errors); 248 if (v == NULL) { 249 Py_DECREF(args); 250 return NULL; 251 } 252 PyTuple_SET_ITEM(args, 1, v); 253 } 254 return args; 255 } 256 257 /* Helper function to get a codec item */ 258 259 static 260 PyObject *codec_getitem(const char *encoding, int index) 261 { 262 PyObject *codecs; 263 PyObject *v; 264 265 codecs = _PyCodec_Lookup(encoding); 266 if (codecs == NULL) 267 return NULL; 268 v = PyTuple_GET_ITEM(codecs, index); 269 Py_DECREF(codecs); 270 Py_INCREF(v); 271 return v; 272 } 273 274 /* Helper functions to create an incremental codec. */ 275 static 276 PyObject *codec_makeincrementalcodec(PyObject *codec_info, 277 const char *errors, 278 const char *attrname) 279 { 280 PyObject *ret, *inccodec; 281 282 inccodec = PyObject_GetAttrString(codec_info, attrname); 283 if (inccodec == NULL) 284 return NULL; 285 if (errors) 286 ret = PyObject_CallFunction(inccodec, "s", errors); 287 else 288 ret = _PyObject_CallNoArg(inccodec); 289 Py_DECREF(inccodec); 290 return ret; 291 } 292 293 static 294 PyObject *codec_getincrementalcodec(const char *encoding, 295 const char *errors, 296 const char *attrname) 297 { 298 PyObject *codec_info, *ret; 299 300 codec_info = _PyCodec_Lookup(encoding); 301 if (codec_info == NULL) 302 return NULL; 303 ret = codec_makeincrementalcodec(codec_info, errors, attrname); 304 Py_DECREF(codec_info); 305 return ret; 306 } 307 308 /* Helper function to create a stream codec. */ 309 310 static 311 PyObject *codec_getstreamcodec(const char *encoding, 312 PyObject *stream, 313 const char *errors, 314 const int index) 315 { 316 PyObject *codecs, *streamcodec, *codeccls; 317 318 codecs = _PyCodec_Lookup(encoding); 319 if (codecs == NULL) 320 return NULL; 321 322 codeccls = PyTuple_GET_ITEM(codecs, index); 323 if (errors != NULL) 324 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors); 325 else 326 streamcodec = PyObject_CallFunctionObjArgs(codeccls, stream, NULL); 327 Py_DECREF(codecs); 328 return streamcodec; 329 } 330 331 /* Helpers to work with the result of _PyCodec_Lookup 332 333 */ 334 PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info, 335 const char *errors) 336 { 337 return codec_makeincrementalcodec(codec_info, errors, 338 "incrementaldecoder"); 339 } 340 341 PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info, 342 const char *errors) 343 { 344 return codec_makeincrementalcodec(codec_info, errors, 345 "incrementalencoder"); 346 } 347 348 349 /* Convenience APIs to query the Codec registry. 350 351 All APIs return a codec object with incremented refcount. 352 353 */ 354 355 PyObject *PyCodec_Encoder(const char *encoding) 356 { 357 return codec_getitem(encoding, 0); 358 } 359 360 PyObject *PyCodec_Decoder(const char *encoding) 361 { 362 return codec_getitem(encoding, 1); 363 } 364 365 PyObject *PyCodec_IncrementalEncoder(const char *encoding, 366 const char *errors) 367 { 368 return codec_getincrementalcodec(encoding, errors, "incrementalencoder"); 369 } 370 371 PyObject *PyCodec_IncrementalDecoder(const char *encoding, 372 const char *errors) 373 { 374 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder"); 375 } 376 377 PyObject *PyCodec_StreamReader(const char *encoding, 378 PyObject *stream, 379 const char *errors) 380 { 381 return codec_getstreamcodec(encoding, stream, errors, 2); 382 } 383 384 PyObject *PyCodec_StreamWriter(const char *encoding, 385 PyObject *stream, 386 const char *errors) 387 { 388 return codec_getstreamcodec(encoding, stream, errors, 3); 389 } 390 391 /* Helper that tries to ensure the reported exception chain indicates the 392 * codec that was invoked to trigger the failure without changing the type 393 * of the exception raised. 394 */ 395 static void 396 wrap_codec_error(const char *operation, 397 const char *encoding) 398 { 399 /* TrySetFromCause will replace the active exception with a suitably 400 * updated clone if it can, otherwise it will leave the original 401 * exception alone. 402 */ 403 _PyErr_TrySetFromCause("%s with '%s' codec failed", 404 operation, encoding); 405 } 406 407 /* Encode an object (e.g. a Unicode object) using the given encoding 408 and return the resulting encoded object (usually a Python string). 409 410 errors is passed to the encoder factory as argument if non-NULL. */ 411 412 static PyObject * 413 _PyCodec_EncodeInternal(PyObject *object, 414 PyObject *encoder, 415 const char *encoding, 416 const char *errors) 417 { 418 PyObject *args = NULL, *result = NULL; 419 PyObject *v = NULL; 420 421 args = args_tuple(object, errors); 422 if (args == NULL) 423 goto onError; 424 425 result = PyEval_CallObject(encoder, args); 426 if (result == NULL) { 427 wrap_codec_error("encoding", encoding); 428 goto onError; 429 } 430 431 if (!PyTuple_Check(result) || 432 PyTuple_GET_SIZE(result) != 2) { 433 PyErr_SetString(PyExc_TypeError, 434 "encoder must return a tuple (object, integer)"); 435 goto onError; 436 } 437 v = PyTuple_GET_ITEM(result,0); 438 Py_INCREF(v); 439 /* We don't check or use the second (integer) entry. */ 440 441 Py_DECREF(args); 442 Py_DECREF(encoder); 443 Py_DECREF(result); 444 return v; 445 446 onError: 447 Py_XDECREF(result); 448 Py_XDECREF(args); 449 Py_XDECREF(encoder); 450 return NULL; 451 } 452 453 /* Decode an object (usually a Python string) using the given encoding 454 and return an equivalent object (e.g. a Unicode object). 455 456 errors is passed to the decoder factory as argument if non-NULL. */ 457 458 static PyObject * 459 _PyCodec_DecodeInternal(PyObject *object, 460 PyObject *decoder, 461 const char *encoding, 462 const char *errors) 463 { 464 PyObject *args = NULL, *result = NULL; 465 PyObject *v; 466 467 args = args_tuple(object, errors); 468 if (args == NULL) 469 goto onError; 470 471 result = PyEval_CallObject(decoder,args); 472 if (result == NULL) { 473 wrap_codec_error("decoding", encoding); 474 goto onError; 475 } 476 if (!PyTuple_Check(result) || 477 PyTuple_GET_SIZE(result) != 2) { 478 PyErr_SetString(PyExc_TypeError, 479 "decoder must return a tuple (object,integer)"); 480 goto onError; 481 } 482 v = PyTuple_GET_ITEM(result,0); 483 Py_INCREF(v); 484 /* We don't check or use the second (integer) entry. */ 485 486 Py_DECREF(args); 487 Py_DECREF(decoder); 488 Py_DECREF(result); 489 return v; 490 491 onError: 492 Py_XDECREF(args); 493 Py_XDECREF(decoder); 494 Py_XDECREF(result); 495 return NULL; 496 } 497 498 /* Generic encoding/decoding API */ 499 PyObject *PyCodec_Encode(PyObject *object, 500 const char *encoding, 501 const char *errors) 502 { 503 PyObject *encoder; 504 505 encoder = PyCodec_Encoder(encoding); 506 if (encoder == NULL) 507 return NULL; 508 509 return _PyCodec_EncodeInternal(object, encoder, encoding, errors); 510 } 511 512 PyObject *PyCodec_Decode(PyObject *object, 513 const char *encoding, 514 const char *errors) 515 { 516 PyObject *decoder; 517 518 decoder = PyCodec_Decoder(encoding); 519 if (decoder == NULL) 520 return NULL; 521 522 return _PyCodec_DecodeInternal(object, decoder, encoding, errors); 523 } 524 525 /* Text encoding/decoding API */ 526 PyObject * _PyCodec_LookupTextEncoding(const char *encoding, 527 const char *alternate_command) 528 { 529 _Py_IDENTIFIER(_is_text_encoding); 530 PyObject *codec; 531 PyObject *attr; 532 int is_text_codec; 533 534 codec = _PyCodec_Lookup(encoding); 535 if (codec == NULL) 536 return NULL; 537 538 /* Backwards compatibility: assume any raw tuple describes a text 539 * encoding, and the same for anything lacking the private 540 * attribute. 541 */ 542 if (!PyTuple_CheckExact(codec)) { 543 if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) { 544 Py_DECREF(codec); 545 return NULL; 546 } 547 if (attr != NULL) { 548 is_text_codec = PyObject_IsTrue(attr); 549 Py_DECREF(attr); 550 if (is_text_codec <= 0) { 551 Py_DECREF(codec); 552 if (!is_text_codec) 553 PyErr_Format(PyExc_LookupError, 554 "'%.400s' is not a text encoding; " 555 "use %s to handle arbitrary codecs", 556 encoding, alternate_command); 557 return NULL; 558 } 559 } 560 } 561 562 /* This appears to be a valid text encoding */ 563 return codec; 564 } 565 566 567 static 568 PyObject *codec_getitem_checked(const char *encoding, 569 const char *alternate_command, 570 int index) 571 { 572 PyObject *codec; 573 PyObject *v; 574 575 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command); 576 if (codec == NULL) 577 return NULL; 578 579 v = PyTuple_GET_ITEM(codec, index); 580 Py_INCREF(v); 581 Py_DECREF(codec); 582 return v; 583 } 584 585 static PyObject * _PyCodec_TextEncoder(const char *encoding) 586 { 587 return codec_getitem_checked(encoding, "codecs.encode()", 0); 588 } 589 590 static PyObject * _PyCodec_TextDecoder(const char *encoding) 591 { 592 return codec_getitem_checked(encoding, "codecs.decode()", 1); 593 } 594 595 PyObject *_PyCodec_EncodeText(PyObject *object, 596 const char *encoding, 597 const char *errors) 598 { 599 PyObject *encoder; 600 601 encoder = _PyCodec_TextEncoder(encoding); 602 if (encoder == NULL) 603 return NULL; 604 605 return _PyCodec_EncodeInternal(object, encoder, encoding, errors); 606 } 607 608 PyObject *_PyCodec_DecodeText(PyObject *object, 609 const char *encoding, 610 const char *errors) 611 { 612 PyObject *decoder; 613 614 decoder = _PyCodec_TextDecoder(encoding); 615 if (decoder == NULL) 616 return NULL; 617 618 return _PyCodec_DecodeInternal(object, decoder, encoding, errors); 619 } 620 621 /* Register the error handling callback function error under the name 622 name. This function will be called by the codec when it encounters 623 an unencodable characters/undecodable bytes and doesn't know the 624 callback name, when name is specified as the error parameter 625 in the call to the encode/decode function. 626 Return 0 on success, -1 on error */ 627 int PyCodec_RegisterError(const char *name, PyObject *error) 628 { 629 PyInterpreterState *interp = PyThreadState_GET()->interp; 630 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) 631 return -1; 632 if (!PyCallable_Check(error)) { 633 PyErr_SetString(PyExc_TypeError, "handler must be callable"); 634 return -1; 635 } 636 return PyDict_SetItemString(interp->codec_error_registry, 637 name, error); 638 } 639 640 /* Lookup the error handling callback function registered under the 641 name error. As a special case NULL can be passed, in which case 642 the error handling callback for strict encoding will be returned. */ 643 PyObject *PyCodec_LookupError(const char *name) 644 { 645 PyObject *handler = NULL; 646 647 PyInterpreterState *interp = PyThreadState_GET()->interp; 648 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) 649 return NULL; 650 651 if (name==NULL) 652 name = "strict"; 653 handler = PyDict_GetItemString(interp->codec_error_registry, name); 654 if (!handler) 655 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name); 656 else 657 Py_INCREF(handler); 658 return handler; 659 } 660 661 static void wrong_exception_type(PyObject *exc) 662 { 663 PyErr_Format(PyExc_TypeError, 664 "don't know how to handle %.200s in error callback", 665 exc->ob_type->tp_name); 666 } 667 668 PyObject *PyCodec_StrictErrors(PyObject *exc) 669 { 670 if (PyExceptionInstance_Check(exc)) 671 PyErr_SetObject(PyExceptionInstance_Class(exc), exc); 672 else 673 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance"); 674 return NULL; 675 } 676 677 678 PyObject *PyCodec_IgnoreErrors(PyObject *exc) 679 { 680 Py_ssize_t end; 681 682 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { 683 if (PyUnicodeEncodeError_GetEnd(exc, &end)) 684 return NULL; 685 } 686 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { 687 if (PyUnicodeDecodeError_GetEnd(exc, &end)) 688 return NULL; 689 } 690 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { 691 if (PyUnicodeTranslateError_GetEnd(exc, &end)) 692 return NULL; 693 } 694 else { 695 wrong_exception_type(exc); 696 return NULL; 697 } 698 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end); 699 } 700 701 702 PyObject *PyCodec_ReplaceErrors(PyObject *exc) 703 { 704 Py_ssize_t start, end, i, len; 705 706 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { 707 PyObject *res; 708 int kind; 709 void *data; 710 if (PyUnicodeEncodeError_GetStart(exc, &start)) 711 return NULL; 712 if (PyUnicodeEncodeError_GetEnd(exc, &end)) 713 return NULL; 714 len = end - start; 715 res = PyUnicode_New(len, '?'); 716 if (res == NULL) 717 return NULL; 718 kind = PyUnicode_KIND(res); 719 data = PyUnicode_DATA(res); 720 for (i = 0; i < len; ++i) 721 PyUnicode_WRITE(kind, data, i, '?'); 722 assert(_PyUnicode_CheckConsistency(res, 1)); 723 return Py_BuildValue("(Nn)", res, end); 724 } 725 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { 726 if (PyUnicodeDecodeError_GetEnd(exc, &end)) 727 return NULL; 728 return Py_BuildValue("(Cn)", 729 (int)Py_UNICODE_REPLACEMENT_CHARACTER, 730 end); 731 } 732 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { 733 PyObject *res; 734 int kind; 735 void *data; 736 if (PyUnicodeTranslateError_GetStart(exc, &start)) 737 return NULL; 738 if (PyUnicodeTranslateError_GetEnd(exc, &end)) 739 return NULL; 740 len = end - start; 741 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER); 742 if (res == NULL) 743 return NULL; 744 kind = PyUnicode_KIND(res); 745 data = PyUnicode_DATA(res); 746 for (i=0; i < len; i++) 747 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER); 748 assert(_PyUnicode_CheckConsistency(res, 1)); 749 return Py_BuildValue("(Nn)", res, end); 750 } 751 else { 752 wrong_exception_type(exc); 753 return NULL; 754 } 755 } 756 757 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) 758 { 759 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { 760 PyObject *restuple; 761 PyObject *object; 762 Py_ssize_t i; 763 Py_ssize_t start; 764 Py_ssize_t end; 765 PyObject *res; 766 unsigned char *outp; 767 Py_ssize_t ressize; 768 Py_UCS4 ch; 769 if (PyUnicodeEncodeError_GetStart(exc, &start)) 770 return NULL; 771 if (PyUnicodeEncodeError_GetEnd(exc, &end)) 772 return NULL; 773 if (!(object = PyUnicodeEncodeError_GetObject(exc))) 774 return NULL; 775 if (end - start > PY_SSIZE_T_MAX / (2+7+1)) 776 end = start + PY_SSIZE_T_MAX / (2+7+1); 777 for (i = start, ressize = 0; i < end; ++i) { 778 /* object is guaranteed to be "ready" */ 779 ch = PyUnicode_READ_CHAR(object, i); 780 if (ch<10) 781 ressize += 2+1+1; 782 else if (ch<100) 783 ressize += 2+2+1; 784 else if (ch<1000) 785 ressize += 2+3+1; 786 else if (ch<10000) 787 ressize += 2+4+1; 788 else if (ch<100000) 789 ressize += 2+5+1; 790 else if (ch<1000000) 791 ressize += 2+6+1; 792 else 793 ressize += 2+7+1; 794 } 795 /* allocate replacement */ 796 res = PyUnicode_New(ressize, 127); 797 if (res == NULL) { 798 Py_DECREF(object); 799 return NULL; 800 } 801 outp = PyUnicode_1BYTE_DATA(res); 802 /* generate replacement */ 803 for (i = start; i < end; ++i) { 804 int digits; 805 int base; 806 ch = PyUnicode_READ_CHAR(object, i); 807 *outp++ = '&'; 808 *outp++ = '#'; 809 if (ch<10) { 810 digits = 1; 811 base = 1; 812 } 813 else if (ch<100) { 814 digits = 2; 815 base = 10; 816 } 817 else if (ch<1000) { 818 digits = 3; 819 base = 100; 820 } 821 else if (ch<10000) { 822 digits = 4; 823 base = 1000; 824 } 825 else if (ch<100000) { 826 digits = 5; 827 base = 10000; 828 } 829 else if (ch<1000000) { 830 digits = 6; 831 base = 100000; 832 } 833 else { 834 digits = 7; 835 base = 1000000; 836 } 837 while (digits-->0) { 838 *outp++ = '0' + ch/base; 839 ch %= base; 840 base /= 10; 841 } 842 *outp++ = ';'; 843 } 844 assert(_PyUnicode_CheckConsistency(res, 1)); 845 restuple = Py_BuildValue("(Nn)", res, end); 846 Py_DECREF(object); 847 return restuple; 848 } 849 else { 850 wrong_exception_type(exc); 851 return NULL; 852 } 853 } 854 855 PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) 856 { 857 PyObject *object; 858 Py_ssize_t i; 859 Py_ssize_t start; 860 Py_ssize_t end; 861 PyObject *res; 862 unsigned char *outp; 863 int ressize; 864 Py_UCS4 c; 865 866 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { 867 const unsigned char *p; 868 if (PyUnicodeDecodeError_GetStart(exc, &start)) 869 return NULL; 870 if (PyUnicodeDecodeError_GetEnd(exc, &end)) 871 return NULL; 872 if (!(object = PyUnicodeDecodeError_GetObject(exc))) 873 return NULL; 874 p = (const unsigned char*)PyBytes_AS_STRING(object); 875 res = PyUnicode_New(4 * (end - start), 127); 876 if (res == NULL) { 877 Py_DECREF(object); 878 return NULL; 879 } 880 outp = PyUnicode_1BYTE_DATA(res); 881 for (i = start; i < end; i++, outp += 4) { 882 unsigned char c = p[i]; 883 outp[0] = '\\'; 884 outp[1] = 'x'; 885 outp[2] = Py_hexdigits[(c>>4)&0xf]; 886 outp[3] = Py_hexdigits[c&0xf]; 887 } 888 889 assert(_PyUnicode_CheckConsistency(res, 1)); 890 Py_DECREF(object); 891 return Py_BuildValue("(Nn)", res, end); 892 } 893 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { 894 if (PyUnicodeEncodeError_GetStart(exc, &start)) 895 return NULL; 896 if (PyUnicodeEncodeError_GetEnd(exc, &end)) 897 return NULL; 898 if (!(object = PyUnicodeEncodeError_GetObject(exc))) 899 return NULL; 900 } 901 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { 902 if (PyUnicodeTranslateError_GetStart(exc, &start)) 903 return NULL; 904 if (PyUnicodeTranslateError_GetEnd(exc, &end)) 905 return NULL; 906 if (!(object = PyUnicodeTranslateError_GetObject(exc))) 907 return NULL; 908 } 909 else { 910 wrong_exception_type(exc); 911 return NULL; 912 } 913 914 if (end - start > PY_SSIZE_T_MAX / (1+1+8)) 915 end = start + PY_SSIZE_T_MAX / (1+1+8); 916 for (i = start, ressize = 0; i < end; ++i) { 917 /* object is guaranteed to be "ready" */ 918 c = PyUnicode_READ_CHAR(object, i); 919 if (c >= 0x10000) { 920 ressize += 1+1+8; 921 } 922 else if (c >= 0x100) { 923 ressize += 1+1+4; 924 } 925 else 926 ressize += 1+1+2; 927 } 928 res = PyUnicode_New(ressize, 127); 929 if (res == NULL) { 930 Py_DECREF(object); 931 return NULL; 932 } 933 outp = PyUnicode_1BYTE_DATA(res); 934 for (i = start; i < end; ++i) { 935 c = PyUnicode_READ_CHAR(object, i); 936 *outp++ = '\\'; 937 if (c >= 0x00010000) { 938 *outp++ = 'U'; 939 *outp++ = Py_hexdigits[(c>>28)&0xf]; 940 *outp++ = Py_hexdigits[(c>>24)&0xf]; 941 *outp++ = Py_hexdigits[(c>>20)&0xf]; 942 *outp++ = Py_hexdigits[(c>>16)&0xf]; 943 *outp++ = Py_hexdigits[(c>>12)&0xf]; 944 *outp++ = Py_hexdigits[(c>>8)&0xf]; 945 } 946 else if (c >= 0x100) { 947 *outp++ = 'u'; 948 *outp++ = Py_hexdigits[(c>>12)&0xf]; 949 *outp++ = Py_hexdigits[(c>>8)&0xf]; 950 } 951 else 952 *outp++ = 'x'; 953 *outp++ = Py_hexdigits[(c>>4)&0xf]; 954 *outp++ = Py_hexdigits[c&0xf]; 955 } 956 957 assert(_PyUnicode_CheckConsistency(res, 1)); 958 Py_DECREF(object); 959 return Py_BuildValue("(Nn)", res, end); 960 } 961 962 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 963 964 PyObject *PyCodec_NameReplaceErrors(PyObject *exc) 965 { 966 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { 967 PyObject *restuple; 968 PyObject *object; 969 Py_ssize_t i; 970 Py_ssize_t start; 971 Py_ssize_t end; 972 PyObject *res; 973 unsigned char *outp; 974 Py_ssize_t ressize; 975 int replsize; 976 Py_UCS4 c; 977 char buffer[256]; /* NAME_MAXLEN */ 978 if (PyUnicodeEncodeError_GetStart(exc, &start)) 979 return NULL; 980 if (PyUnicodeEncodeError_GetEnd(exc, &end)) 981 return NULL; 982 if (!(object = PyUnicodeEncodeError_GetObject(exc))) 983 return NULL; 984 if (!ucnhash_CAPI) { 985 /* load the unicode data module */ 986 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 987 PyUnicodeData_CAPSULE_NAME, 1); 988 if (!ucnhash_CAPI) 989 return NULL; 990 } 991 for (i = start, ressize = 0; i < end; ++i) { 992 /* object is guaranteed to be "ready" */ 993 c = PyUnicode_READ_CHAR(object, i); 994 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { 995 replsize = 1+1+1+(int)strlen(buffer)+1; 996 } 997 else if (c >= 0x10000) { 998 replsize = 1+1+8; 999 } 1000 else if (c >= 0x100) { 1001 replsize = 1+1+4; 1002 } 1003 else 1004 replsize = 1+1+2; 1005 if (ressize > PY_SSIZE_T_MAX - replsize) 1006 break; 1007 ressize += replsize; 1008 } 1009 end = i; 1010 res = PyUnicode_New(ressize, 127); 1011 if (res==NULL) 1012 return NULL; 1013 for (i = start, outp = PyUnicode_1BYTE_DATA(res); 1014 i < end; ++i) { 1015 c = PyUnicode_READ_CHAR(object, i); 1016 *outp++ = '\\'; 1017 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { 1018 *outp++ = 'N'; 1019 *outp++ = '{'; 1020 strcpy((char *)outp, buffer); 1021 outp += strlen(buffer); 1022 *outp++ = '}'; 1023 continue; 1024 } 1025 if (c >= 0x00010000) { 1026 *outp++ = 'U'; 1027 *outp++ = Py_hexdigits[(c>>28)&0xf]; 1028 *outp++ = Py_hexdigits[(c>>24)&0xf]; 1029 *outp++ = Py_hexdigits[(c>>20)&0xf]; 1030 *outp++ = Py_hexdigits[(c>>16)&0xf]; 1031 *outp++ = Py_hexdigits[(c>>12)&0xf]; 1032 *outp++ = Py_hexdigits[(c>>8)&0xf]; 1033 } 1034 else if (c >= 0x100) { 1035 *outp++ = 'u'; 1036 *outp++ = Py_hexdigits[(c>>12)&0xf]; 1037 *outp++ = Py_hexdigits[(c>>8)&0xf]; 1038 } 1039 else 1040 *outp++ = 'x'; 1041 *outp++ = Py_hexdigits[(c>>4)&0xf]; 1042 *outp++ = Py_hexdigits[c&0xf]; 1043 } 1044 1045 assert(outp == PyUnicode_1BYTE_DATA(res) + ressize); 1046 assert(_PyUnicode_CheckConsistency(res, 1)); 1047 restuple = Py_BuildValue("(Nn)", res, end); 1048 Py_DECREF(object); 1049 return restuple; 1050 } 1051 else { 1052 wrong_exception_type(exc); 1053 return NULL; 1054 } 1055 } 1056 1057 #define ENC_UNKNOWN -1 1058 #define ENC_UTF8 0 1059 #define ENC_UTF16BE 1 1060 #define ENC_UTF16LE 2 1061 #define ENC_UTF32BE 3 1062 #define ENC_UTF32LE 4 1063 1064 static int 1065 get_standard_encoding(const char *encoding, int *bytelength) 1066 { 1067 if (Py_TOLOWER(encoding[0]) == 'u' && 1068 Py_TOLOWER(encoding[1]) == 't' && 1069 Py_TOLOWER(encoding[2]) == 'f') { 1070 encoding += 3; 1071 if (*encoding == '-' || *encoding == '_' ) 1072 encoding++; 1073 if (encoding[0] == '8' && encoding[1] == '\0') { 1074 *bytelength = 3; 1075 return ENC_UTF8; 1076 } 1077 else if (encoding[0] == '1' && encoding[1] == '6') { 1078 encoding += 2; 1079 *bytelength = 2; 1080 if (*encoding == '\0') { 1081 #ifdef WORDS_BIGENDIAN 1082 return ENC_UTF16BE; 1083 #else 1084 return ENC_UTF16LE; 1085 #endif 1086 } 1087 if (*encoding == '-' || *encoding == '_' ) 1088 encoding++; 1089 if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') { 1090 if (Py_TOLOWER(encoding[0]) == 'b') 1091 return ENC_UTF16BE; 1092 if (Py_TOLOWER(encoding[0]) == 'l') 1093 return ENC_UTF16LE; 1094 } 1095 } 1096 else if (encoding[0] == '3' && encoding[1] == '2') { 1097 encoding += 2; 1098 *bytelength = 4; 1099 if (*encoding == '\0') { 1100 #ifdef WORDS_BIGENDIAN 1101 return ENC_UTF32BE; 1102 #else 1103 return ENC_UTF32LE; 1104 #endif 1105 } 1106 if (*encoding == '-' || *encoding == '_' ) 1107 encoding++; 1108 if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') { 1109 if (Py_TOLOWER(encoding[0]) == 'b') 1110 return ENC_UTF32BE; 1111 if (Py_TOLOWER(encoding[0]) == 'l') 1112 return ENC_UTF32LE; 1113 } 1114 } 1115 } 1116 else if (strcmp(encoding, "CP_UTF8") == 0) { 1117 *bytelength = 3; 1118 return ENC_UTF8; 1119 } 1120 return ENC_UNKNOWN; 1121 } 1122 1123 /* This handler is declared static until someone demonstrates 1124 a need to call it directly. */ 1125 static PyObject * 1126 PyCodec_SurrogatePassErrors(PyObject *exc) 1127 { 1128 PyObject *restuple; 1129 PyObject *object; 1130 PyObject *encode; 1131 const char *encoding; 1132 int code; 1133 int bytelength; 1134 Py_ssize_t i; 1135 Py_ssize_t start; 1136 Py_ssize_t end; 1137 PyObject *res; 1138 1139 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { 1140 unsigned char *outp; 1141 if (PyUnicodeEncodeError_GetStart(exc, &start)) 1142 return NULL; 1143 if (PyUnicodeEncodeError_GetEnd(exc, &end)) 1144 return NULL; 1145 if (!(object = PyUnicodeEncodeError_GetObject(exc))) 1146 return NULL; 1147 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) { 1148 Py_DECREF(object); 1149 return NULL; 1150 } 1151 if (!(encoding = PyUnicode_AsUTF8(encode))) { 1152 Py_DECREF(object); 1153 Py_DECREF(encode); 1154 return NULL; 1155 } 1156 code = get_standard_encoding(encoding, &bytelength); 1157 Py_DECREF(encode); 1158 if (code == ENC_UNKNOWN) { 1159 /* Not supported, fail with original exception */ 1160 PyErr_SetObject(PyExceptionInstance_Class(exc), exc); 1161 Py_DECREF(object); 1162 return NULL; 1163 } 1164 1165 if (end - start > PY_SSIZE_T_MAX / bytelength) 1166 end = start + PY_SSIZE_T_MAX / bytelength; 1167 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start)); 1168 if (!res) { 1169 Py_DECREF(object); 1170 return NULL; 1171 } 1172 outp = (unsigned char*)PyBytes_AsString(res); 1173 for (i = start; i < end; i++) { 1174 /* object is guaranteed to be "ready" */ 1175 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); 1176 if (!Py_UNICODE_IS_SURROGATE(ch)) { 1177 /* Not a surrogate, fail with original exception */ 1178 PyErr_SetObject(PyExceptionInstance_Class(exc), exc); 1179 Py_DECREF(res); 1180 Py_DECREF(object); 1181 return NULL; 1182 } 1183 switch (code) { 1184 case ENC_UTF8: 1185 *outp++ = (unsigned char)(0xe0 | (ch >> 12)); 1186 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f)); 1187 *outp++ = (unsigned char)(0x80 | (ch & 0x3f)); 1188 break; 1189 case ENC_UTF16LE: 1190 *outp++ = (unsigned char) ch; 1191 *outp++ = (unsigned char)(ch >> 8); 1192 break; 1193 case ENC_UTF16BE: 1194 *outp++ = (unsigned char)(ch >> 8); 1195 *outp++ = (unsigned char) ch; 1196 break; 1197 case ENC_UTF32LE: 1198 *outp++ = (unsigned char) ch; 1199 *outp++ = (unsigned char)(ch >> 8); 1200 *outp++ = (unsigned char)(ch >> 16); 1201 *outp++ = (unsigned char)(ch >> 24); 1202 break; 1203 case ENC_UTF32BE: 1204 *outp++ = (unsigned char)(ch >> 24); 1205 *outp++ = (unsigned char)(ch >> 16); 1206 *outp++ = (unsigned char)(ch >> 8); 1207 *outp++ = (unsigned char) ch; 1208 break; 1209 } 1210 } 1211 restuple = Py_BuildValue("(On)", res, end); 1212 Py_DECREF(res); 1213 Py_DECREF(object); 1214 return restuple; 1215 } 1216 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { 1217 const unsigned char *p; 1218 Py_UCS4 ch = 0; 1219 if (PyUnicodeDecodeError_GetStart(exc, &start)) 1220 return NULL; 1221 if (PyUnicodeDecodeError_GetEnd(exc, &end)) 1222 return NULL; 1223 if (!(object = PyUnicodeDecodeError_GetObject(exc))) 1224 return NULL; 1225 p = (const unsigned char*)PyBytes_AS_STRING(object); 1226 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) { 1227 Py_DECREF(object); 1228 return NULL; 1229 } 1230 if (!(encoding = PyUnicode_AsUTF8(encode))) { 1231 Py_DECREF(object); 1232 Py_DECREF(encode); 1233 return NULL; 1234 } 1235 code = get_standard_encoding(encoding, &bytelength); 1236 Py_DECREF(encode); 1237 if (code == ENC_UNKNOWN) { 1238 /* Not supported, fail with original exception */ 1239 PyErr_SetObject(PyExceptionInstance_Class(exc), exc); 1240 Py_DECREF(object); 1241 return NULL; 1242 } 1243 1244 /* Try decoding a single surrogate character. If 1245 there are more, let the codec call us again. */ 1246 p += start; 1247 if (PyBytes_GET_SIZE(object) - start >= bytelength) { 1248 switch (code) { 1249 case ENC_UTF8: 1250 if ((p[0] & 0xf0) == 0xe0 && 1251 (p[1] & 0xc0) == 0x80 && 1252 (p[2] & 0xc0) == 0x80) { 1253 /* it's a three-byte code */ 1254 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); 1255 } 1256 break; 1257 case ENC_UTF16LE: 1258 ch = p[1] << 8 | p[0]; 1259 break; 1260 case ENC_UTF16BE: 1261 ch = p[0] << 8 | p[1]; 1262 break; 1263 case ENC_UTF32LE: 1264 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]; 1265 break; 1266 case ENC_UTF32BE: 1267 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; 1268 break; 1269 } 1270 } 1271 1272 Py_DECREF(object); 1273 if (!Py_UNICODE_IS_SURROGATE(ch)) { 1274 /* it's not a surrogate - fail */ 1275 PyErr_SetObject(PyExceptionInstance_Class(exc), exc); 1276 return NULL; 1277 } 1278 res = PyUnicode_FromOrdinal(ch); 1279 if (res == NULL) 1280 return NULL; 1281 return Py_BuildValue("(Nn)", res, start + bytelength); 1282 } 1283 else { 1284 wrong_exception_type(exc); 1285 return NULL; 1286 } 1287 } 1288 1289 static PyObject * 1290 PyCodec_SurrogateEscapeErrors(PyObject *exc) 1291 { 1292 PyObject *restuple; 1293 PyObject *object; 1294 Py_ssize_t i; 1295 Py_ssize_t start; 1296 Py_ssize_t end; 1297 PyObject *res; 1298 1299 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { 1300 char *outp; 1301 if (PyUnicodeEncodeError_GetStart(exc, &start)) 1302 return NULL; 1303 if (PyUnicodeEncodeError_GetEnd(exc, &end)) 1304 return NULL; 1305 if (!(object = PyUnicodeEncodeError_GetObject(exc))) 1306 return NULL; 1307 res = PyBytes_FromStringAndSize(NULL, end-start); 1308 if (!res) { 1309 Py_DECREF(object); 1310 return NULL; 1311 } 1312 outp = PyBytes_AsString(res); 1313 for (i = start; i < end; i++) { 1314 /* object is guaranteed to be "ready" */ 1315 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); 1316 if (ch < 0xdc80 || ch > 0xdcff) { 1317 /* Not a UTF-8b surrogate, fail with original exception */ 1318 PyErr_SetObject(PyExceptionInstance_Class(exc), exc); 1319 Py_DECREF(res); 1320 Py_DECREF(object); 1321 return NULL; 1322 } 1323 *outp++ = ch - 0xdc00; 1324 } 1325 restuple = Py_BuildValue("(On)", res, end); 1326 Py_DECREF(res); 1327 Py_DECREF(object); 1328 return restuple; 1329 } 1330 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { 1331 PyObject *str; 1332 const unsigned char *p; 1333 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */ 1334 int consumed = 0; 1335 if (PyUnicodeDecodeError_GetStart(exc, &start)) 1336 return NULL; 1337 if (PyUnicodeDecodeError_GetEnd(exc, &end)) 1338 return NULL; 1339 if (!(object = PyUnicodeDecodeError_GetObject(exc))) 1340 return NULL; 1341 p = (const unsigned char*)PyBytes_AS_STRING(object); 1342 while (consumed < 4 && consumed < end-start) { 1343 /* Refuse to escape ASCII bytes. */ 1344 if (p[start+consumed] < 128) 1345 break; 1346 ch[consumed] = 0xdc00 + p[start+consumed]; 1347 consumed++; 1348 } 1349 Py_DECREF(object); 1350 if (!consumed) { 1351 /* codec complained about ASCII byte. */ 1352 PyErr_SetObject(PyExceptionInstance_Class(exc), exc); 1353 return NULL; 1354 } 1355 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed); 1356 if (str == NULL) 1357 return NULL; 1358 return Py_BuildValue("(Nn)", str, start+consumed); 1359 } 1360 else { 1361 wrong_exception_type(exc); 1362 return NULL; 1363 } 1364 } 1365 1366 1367 static PyObject *strict_errors(PyObject *self, PyObject *exc) 1368 { 1369 return PyCodec_StrictErrors(exc); 1370 } 1371 1372 1373 static PyObject *ignore_errors(PyObject *self, PyObject *exc) 1374 { 1375 return PyCodec_IgnoreErrors(exc); 1376 } 1377 1378 1379 static PyObject *replace_errors(PyObject *self, PyObject *exc) 1380 { 1381 return PyCodec_ReplaceErrors(exc); 1382 } 1383 1384 1385 static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc) 1386 { 1387 return PyCodec_XMLCharRefReplaceErrors(exc); 1388 } 1389 1390 1391 static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc) 1392 { 1393 return PyCodec_BackslashReplaceErrors(exc); 1394 } 1395 1396 static PyObject *namereplace_errors(PyObject *self, PyObject *exc) 1397 { 1398 return PyCodec_NameReplaceErrors(exc); 1399 } 1400 1401 static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc) 1402 { 1403 return PyCodec_SurrogatePassErrors(exc); 1404 } 1405 1406 static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc) 1407 { 1408 return PyCodec_SurrogateEscapeErrors(exc); 1409 } 1410 1411 static int _PyCodecRegistry_Init(void) 1412 { 1413 static struct { 1414 char *name; 1415 PyMethodDef def; 1416 } methods[] = 1417 { 1418 { 1419 "strict", 1420 { 1421 "strict_errors", 1422 strict_errors, 1423 METH_O, 1424 PyDoc_STR("Implements the 'strict' error handling, which " 1425 "raises a UnicodeError on coding errors.") 1426 } 1427 }, 1428 { 1429 "ignore", 1430 { 1431 "ignore_errors", 1432 ignore_errors, 1433 METH_O, 1434 PyDoc_STR("Implements the 'ignore' error handling, which " 1435 "ignores malformed data and continues.") 1436 } 1437 }, 1438 { 1439 "replace", 1440 { 1441 "replace_errors", 1442 replace_errors, 1443 METH_O, 1444 PyDoc_STR("Implements the 'replace' error handling, which " 1445 "replaces malformed data with a replacement marker.") 1446 } 1447 }, 1448 { 1449 "xmlcharrefreplace", 1450 { 1451 "xmlcharrefreplace_errors", 1452 xmlcharrefreplace_errors, 1453 METH_O, 1454 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, " 1455 "which replaces an unencodable character with the " 1456 "appropriate XML character reference.") 1457 } 1458 }, 1459 { 1460 "backslashreplace", 1461 { 1462 "backslashreplace_errors", 1463 backslashreplace_errors, 1464 METH_O, 1465 PyDoc_STR("Implements the 'backslashreplace' error handling, " 1466 "which replaces malformed data with a backslashed " 1467 "escape sequence.") 1468 } 1469 }, 1470 { 1471 "namereplace", 1472 { 1473 "namereplace_errors", 1474 namereplace_errors, 1475 METH_O, 1476 PyDoc_STR("Implements the 'namereplace' error handling, " 1477 "which replaces an unencodable character with a " 1478 "\\N{...} escape sequence.") 1479 } 1480 }, 1481 { 1482 "surrogatepass", 1483 { 1484 "surrogatepass", 1485 surrogatepass_errors, 1486 METH_O 1487 } 1488 }, 1489 { 1490 "surrogateescape", 1491 { 1492 "surrogateescape", 1493 surrogateescape_errors, 1494 METH_O 1495 } 1496 } 1497 }; 1498 1499 PyInterpreterState *interp = PyThreadState_GET()->interp; 1500 PyObject *mod; 1501 unsigned i; 1502 1503 if (interp->codec_search_path != NULL) 1504 return 0; 1505 1506 interp->codec_search_path = PyList_New(0); 1507 interp->codec_search_cache = PyDict_New(); 1508 interp->codec_error_registry = PyDict_New(); 1509 1510 if (interp->codec_error_registry) { 1511 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) { 1512 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL); 1513 int res; 1514 if (!func) 1515 Py_FatalError("can't initialize codec error registry"); 1516 res = PyCodec_RegisterError(methods[i].name, func); 1517 Py_DECREF(func); 1518 if (res) 1519 Py_FatalError("can't initialize codec error registry"); 1520 } 1521 } 1522 1523 if (interp->codec_search_path == NULL || 1524 interp->codec_search_cache == NULL || 1525 interp->codec_error_registry == NULL) 1526 Py_FatalError("can't initialize codec registry"); 1527 1528 mod = PyImport_ImportModuleNoBlock("encodings"); 1529 if (mod == NULL) { 1530 return -1; 1531 } 1532 Py_DECREF(mod); 1533 interp->codecs_initialized = 1; 1534 return 0; 1535 } 1536