Home | History | Annotate | Download | only in Python
      1 /* ------------------------------------------------------------------------
      2 
      3    Python Codec Registry and support functions
      4 
      5 Written by Marc-Andre Lemburg (mal (at) lemburg.com).
      6 
      7 Copyright (c) Corporation for National Research Initiatives.
      8 
      9    ------------------------------------------------------------------------ */
     10 
     11 #include "Python.h"
     12 #include "internal/pystate.h"
     13 #include "ucnhash.h"
     14 #include <ctype.h>
     15 
     16 const char *Py_hexdigits = "0123456789abcdef";
     17 
     18 /* --- Codec Registry ----------------------------------------------------- */
     19 
     20 /* Import the standard encodings package which will register the first
     21    codec search function.
     22 
     23    This is done in a lazy way so that the Unicode implementation does
     24    not downgrade startup time of scripts not needing it.
     25 
     26    ImportErrors are silently ignored by this function. Only one try is
     27    made.
     28 
     29 */
     30 
     31 static int _PyCodecRegistry_Init(void); /* Forward */
     32 
     33 int PyCodec_Register(PyObject *search_function)
     34 {
     35     PyInterpreterState *interp = PyThreadState_GET()->interp;
     36     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
     37         goto onError;
     38     if (search_function == NULL) {
     39         PyErr_BadArgument();
     40         goto onError;
     41     }
     42     if (!PyCallable_Check(search_function)) {
     43         PyErr_SetString(PyExc_TypeError, "argument must be callable");
     44         goto onError;
     45     }
     46     return PyList_Append(interp->codec_search_path, search_function);
     47 
     48  onError:
     49     return -1;
     50 }
     51 
     52 /* Convert a string to a normalized Python string: all characters are
     53    converted to lower case, spaces are replaced with underscores. */
     54 
     55 static
     56 PyObject *normalizestring(const char *string)
     57 {
     58     size_t i;
     59     size_t len = strlen(string);
     60     char *p;
     61     PyObject *v;
     62 
     63     if (len > PY_SSIZE_T_MAX) {
     64         PyErr_SetString(PyExc_OverflowError, "string is too large");
     65         return NULL;
     66     }
     67 
     68     p = PyMem_Malloc(len + 1);
     69     if (p == NULL)
     70         return PyErr_NoMemory();
     71     for (i = 0; i < len; i++) {
     72         char ch = string[i];
     73         if (ch == ' ')
     74             ch = '-';
     75         else
     76             ch = Py_TOLOWER(Py_CHARMASK(ch));
     77         p[i] = ch;
     78     }
     79     p[i] = '\0';
     80     v = PyUnicode_FromString(p);
     81     PyMem_Free(p);
     82     return v;
     83 }
     84 
     85 /* Lookup the given encoding and return a tuple providing the codec
     86    facilities.
     87 
     88    The encoding string is looked up converted to all lower-case
     89    characters. This makes encodings looked up through this mechanism
     90    effectively case-insensitive.
     91 
     92    If no codec is found, a LookupError is set and NULL returned.
     93 
     94    As side effect, this tries to load the encodings package, if not
     95    yet done. This is part of the lazy load strategy for the encodings
     96    package.
     97 
     98 */
     99 
    100 PyObject *_PyCodec_Lookup(const char *encoding)
    101 {
    102     PyInterpreterState *interp;
    103     PyObject *result, *args = NULL, *v;
    104     Py_ssize_t i, len;
    105 
    106     if (encoding == NULL) {
    107         PyErr_BadArgument();
    108         goto onError;
    109     }
    110 
    111     interp = PyThreadState_GET()->interp;
    112     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
    113         goto onError;
    114 
    115     /* Convert the encoding to a normalized Python string: all
    116        characters are converted to lower case, spaces and hyphens are
    117        replaced with underscores. */
    118     v = normalizestring(encoding);
    119     if (v == NULL)
    120         goto onError;
    121     PyUnicode_InternInPlace(&v);
    122 
    123     /* First, try to lookup the name in the registry dictionary */
    124     result = PyDict_GetItem(interp->codec_search_cache, v);
    125     if (result != NULL) {
    126         Py_INCREF(result);
    127         Py_DECREF(v);
    128         return result;
    129     }
    130 
    131     /* Next, scan the search functions in order of registration */
    132     args = PyTuple_New(1);
    133     if (args == NULL) {
    134         Py_DECREF(v);
    135         return NULL;
    136     }
    137     PyTuple_SET_ITEM(args,0,v);
    138 
    139     len = PyList_Size(interp->codec_search_path);
    140     if (len < 0)
    141         goto onError;
    142     if (len == 0) {
    143         PyErr_SetString(PyExc_LookupError,
    144                         "no codec search functions registered: "
    145                         "can't find encoding");
    146         goto onError;
    147     }
    148 
    149     for (i = 0; i < len; i++) {
    150         PyObject *func;
    151 
    152         func = PyList_GetItem(interp->codec_search_path, i);
    153         if (func == NULL)
    154             goto onError;
    155         result = PyEval_CallObject(func, args);
    156         if (result == NULL)
    157             goto onError;
    158         if (result == Py_None) {
    159             Py_DECREF(result);
    160             continue;
    161         }
    162         if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
    163             PyErr_SetString(PyExc_TypeError,
    164                             "codec search functions must return 4-tuples");
    165             Py_DECREF(result);
    166             goto onError;
    167         }
    168         break;
    169     }
    170     if (i == len) {
    171         /* XXX Perhaps we should cache misses too ? */
    172         PyErr_Format(PyExc_LookupError,
    173                      "unknown encoding: %s", encoding);
    174         goto onError;
    175     }
    176 
    177     /* Cache and return the result */
    178     if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
    179         Py_DECREF(result);
    180         goto onError;
    181     }
    182     Py_DECREF(args);
    183     return result;
    184 
    185  onError:
    186     Py_XDECREF(args);
    187     return NULL;
    188 }
    189 
    190 int _PyCodec_Forget(const char *encoding)
    191 {
    192     PyInterpreterState *interp;
    193     PyObject *v;
    194     int result;
    195 
    196     interp = PyThreadState_GET()->interp;
    197     if (interp->codec_search_path == NULL) {
    198         return -1;
    199     }
    200 
    201     /* Convert the encoding to a normalized Python string: all
    202        characters are converted to lower case, spaces and hyphens are
    203        replaced with underscores. */
    204     v = normalizestring(encoding);
    205     if (v == NULL) {
    206         return -1;
    207     }
    208 
    209     /* Drop the named codec from the internal cache */
    210     result = PyDict_DelItem(interp->codec_search_cache, v);
    211     Py_DECREF(v);
    212 
    213     return result;
    214 }
    215 
    216 /* Codec registry encoding check API. */
    217 
    218 int PyCodec_KnownEncoding(const char *encoding)
    219 {
    220     PyObject *codecs;
    221 
    222     codecs = _PyCodec_Lookup(encoding);
    223     if (!codecs) {
    224         PyErr_Clear();
    225         return 0;
    226     }
    227     else {
    228         Py_DECREF(codecs);
    229         return 1;
    230     }
    231 }
    232 
    233 static
    234 PyObject *args_tuple(PyObject *object,
    235                      const char *errors)
    236 {
    237     PyObject *args;
    238 
    239     args = PyTuple_New(1 + (errors != NULL));
    240     if (args == NULL)
    241         return NULL;
    242     Py_INCREF(object);
    243     PyTuple_SET_ITEM(args,0,object);
    244     if (errors) {
    245         PyObject *v;
    246 
    247         v = PyUnicode_FromString(errors);
    248         if (v == NULL) {
    249             Py_DECREF(args);
    250             return NULL;
    251         }
    252         PyTuple_SET_ITEM(args, 1, v);
    253     }
    254     return args;
    255 }
    256 
    257 /* Helper function to get a codec item */
    258 
    259 static
    260 PyObject *codec_getitem(const char *encoding, int index)
    261 {
    262     PyObject *codecs;
    263     PyObject *v;
    264 
    265     codecs = _PyCodec_Lookup(encoding);
    266     if (codecs == NULL)
    267         return NULL;
    268     v = PyTuple_GET_ITEM(codecs, index);
    269     Py_DECREF(codecs);
    270     Py_INCREF(v);
    271     return v;
    272 }
    273 
    274 /* Helper functions to create an incremental codec. */
    275 static
    276 PyObject *codec_makeincrementalcodec(PyObject *codec_info,
    277                                      const char *errors,
    278                                      const char *attrname)
    279 {
    280     PyObject *ret, *inccodec;
    281 
    282     inccodec = PyObject_GetAttrString(codec_info, attrname);
    283     if (inccodec == NULL)
    284         return NULL;
    285     if (errors)
    286         ret = PyObject_CallFunction(inccodec, "s", errors);
    287     else
    288         ret = _PyObject_CallNoArg(inccodec);
    289     Py_DECREF(inccodec);
    290     return ret;
    291 }
    292 
    293 static
    294 PyObject *codec_getincrementalcodec(const char *encoding,
    295                                     const char *errors,
    296                                     const char *attrname)
    297 {
    298     PyObject *codec_info, *ret;
    299 
    300     codec_info = _PyCodec_Lookup(encoding);
    301     if (codec_info == NULL)
    302         return NULL;
    303     ret = codec_makeincrementalcodec(codec_info, errors, attrname);
    304     Py_DECREF(codec_info);
    305     return ret;
    306 }
    307 
    308 /* Helper function to create a stream codec. */
    309 
    310 static
    311 PyObject *codec_getstreamcodec(const char *encoding,
    312                                PyObject *stream,
    313                                const char *errors,
    314                                const int index)
    315 {
    316     PyObject *codecs, *streamcodec, *codeccls;
    317 
    318     codecs = _PyCodec_Lookup(encoding);
    319     if (codecs == NULL)
    320         return NULL;
    321 
    322     codeccls = PyTuple_GET_ITEM(codecs, index);
    323     if (errors != NULL)
    324         streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
    325     else
    326         streamcodec = PyObject_CallFunctionObjArgs(codeccls, stream, NULL);
    327     Py_DECREF(codecs);
    328     return streamcodec;
    329 }
    330 
    331 /* Helpers to work with the result of _PyCodec_Lookup
    332 
    333  */
    334 PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
    335                                              const char *errors)
    336 {
    337     return codec_makeincrementalcodec(codec_info, errors,
    338                                       "incrementaldecoder");
    339 }
    340 
    341 PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
    342                                              const char *errors)
    343 {
    344     return codec_makeincrementalcodec(codec_info, errors,
    345                                       "incrementalencoder");
    346 }
    347 
    348 
    349 /* Convenience APIs to query the Codec registry.
    350 
    351    All APIs return a codec object with incremented refcount.
    352 
    353  */
    354 
    355 PyObject *PyCodec_Encoder(const char *encoding)
    356 {
    357     return codec_getitem(encoding, 0);
    358 }
    359 
    360 PyObject *PyCodec_Decoder(const char *encoding)
    361 {
    362     return codec_getitem(encoding, 1);
    363 }
    364 
    365 PyObject *PyCodec_IncrementalEncoder(const char *encoding,
    366                                      const char *errors)
    367 {
    368     return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
    369 }
    370 
    371 PyObject *PyCodec_IncrementalDecoder(const char *encoding,
    372                                      const char *errors)
    373 {
    374     return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
    375 }
    376 
    377 PyObject *PyCodec_StreamReader(const char *encoding,
    378                                PyObject *stream,
    379                                const char *errors)
    380 {
    381     return codec_getstreamcodec(encoding, stream, errors, 2);
    382 }
    383 
    384 PyObject *PyCodec_StreamWriter(const char *encoding,
    385                                PyObject *stream,
    386                                const char *errors)
    387 {
    388     return codec_getstreamcodec(encoding, stream, errors, 3);
    389 }
    390 
    /* Try to make the reported exception chain show which codec operation
     * failed, without changing the type of the exception that is raised.
     *
     * `operation` and `encoding` are substituted into the message template
     * passed to _PyErr_TrySetFromCause below.
     */
    static void
    wrap_codec_error(const char *operation,
                     const char *encoding)
    {
        /* TrySetFromCause swaps in a suitably updated clone of the active
         * exception when it can; otherwise the original exception is left
         * untouched.  Its return value is not needed here.
         */
        (void)_PyErr_TrySetFromCause("%s with '%s' codec failed",
                                     operation, encoding);
    }
    406 
    407 /* Encode an object (e.g. a Unicode object) using the given encoding
    408    and return the resulting encoded object (usually a Python string).
    409 
    410    errors is passed to the encoder factory as argument if non-NULL. */
    411 
    412 static PyObject *
    413 _PyCodec_EncodeInternal(PyObject *object,
    414                         PyObject *encoder,
    415                         const char *encoding,
    416                         const char *errors)
    417 {
    418     PyObject *args = NULL, *result = NULL;
    419     PyObject *v = NULL;
    420 
    421     args = args_tuple(object, errors);
    422     if (args == NULL)
    423         goto onError;
    424 
    425     result = PyEval_CallObject(encoder, args);
    426     if (result == NULL) {
    427         wrap_codec_error("encoding", encoding);
    428         goto onError;
    429     }
    430 
    431     if (!PyTuple_Check(result) ||
    432         PyTuple_GET_SIZE(result) != 2) {
    433         PyErr_SetString(PyExc_TypeError,
    434                         "encoder must return a tuple (object, integer)");
    435         goto onError;
    436     }
    437     v = PyTuple_GET_ITEM(result,0);
    438     Py_INCREF(v);
    439     /* We don't check or use the second (integer) entry. */
    440 
    441     Py_DECREF(args);
    442     Py_DECREF(encoder);
    443     Py_DECREF(result);
    444     return v;
    445 
    446  onError:
    447     Py_XDECREF(result);
    448     Py_XDECREF(args);
    449     Py_XDECREF(encoder);
    450     return NULL;
    451 }
    452 
    453 /* Decode an object (usually a Python string) using the given encoding
    454    and return an equivalent object (e.g. a Unicode object).
    455 
    456    errors is passed to the decoder factory as argument if non-NULL. */
    457 
    458 static PyObject *
    459 _PyCodec_DecodeInternal(PyObject *object,
    460                         PyObject *decoder,
    461                         const char *encoding,
    462                         const char *errors)
    463 {
    464     PyObject *args = NULL, *result = NULL;
    465     PyObject *v;
    466 
    467     args = args_tuple(object, errors);
    468     if (args == NULL)
    469         goto onError;
    470 
    471     result = PyEval_CallObject(decoder,args);
    472     if (result == NULL) {
    473         wrap_codec_error("decoding", encoding);
    474         goto onError;
    475     }
    476     if (!PyTuple_Check(result) ||
    477         PyTuple_GET_SIZE(result) != 2) {
    478         PyErr_SetString(PyExc_TypeError,
    479                         "decoder must return a tuple (object,integer)");
    480         goto onError;
    481     }
    482     v = PyTuple_GET_ITEM(result,0);
    483     Py_INCREF(v);
    484     /* We don't check or use the second (integer) entry. */
    485 
    486     Py_DECREF(args);
    487     Py_DECREF(decoder);
    488     Py_DECREF(result);
    489     return v;
    490 
    491  onError:
    492     Py_XDECREF(args);
    493     Py_XDECREF(decoder);
    494     Py_XDECREF(result);
    495     return NULL;
    496 }
    497 
    498 /* Generic encoding/decoding API */
    499 PyObject *PyCodec_Encode(PyObject *object,
    500                          const char *encoding,
    501                          const char *errors)
    502 {
    503     PyObject *encoder;
    504 
    505     encoder = PyCodec_Encoder(encoding);
    506     if (encoder == NULL)
    507         return NULL;
    508 
    509     return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    510 }
    511 
    512 PyObject *PyCodec_Decode(PyObject *object,
    513                          const char *encoding,
    514                          const char *errors)
    515 {
    516     PyObject *decoder;
    517 
    518     decoder = PyCodec_Decoder(encoding);
    519     if (decoder == NULL)
    520         return NULL;
    521 
    522     return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    523 }
    524 
    525 /* Text encoding/decoding API */
    526 PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
    527                                        const char *alternate_command)
    528 {
    529     _Py_IDENTIFIER(_is_text_encoding);
    530     PyObject *codec;
    531     PyObject *attr;
    532     int is_text_codec;
    533 
    534     codec = _PyCodec_Lookup(encoding);
    535     if (codec == NULL)
    536         return NULL;
    537 
    538     /* Backwards compatibility: assume any raw tuple describes a text
    539      * encoding, and the same for anything lacking the private
    540      * attribute.
    541      */
    542     if (!PyTuple_CheckExact(codec)) {
    543         if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) {
    544             Py_DECREF(codec);
    545             return NULL;
    546         }
    547         if (attr != NULL) {
    548             is_text_codec = PyObject_IsTrue(attr);
    549             Py_DECREF(attr);
    550             if (is_text_codec <= 0) {
    551                 Py_DECREF(codec);
    552                 if (!is_text_codec)
    553                     PyErr_Format(PyExc_LookupError,
    554                                  "'%.400s' is not a text encoding; "
    555                                  "use %s to handle arbitrary codecs",
    556                                  encoding, alternate_command);
    557                 return NULL;
    558             }
    559         }
    560     }
    561 
    562     /* This appears to be a valid text encoding */
    563     return codec;
    564 }
    565 
    566 
    567 static
    568 PyObject *codec_getitem_checked(const char *encoding,
    569                                 const char *alternate_command,
    570                                 int index)
    571 {
    572     PyObject *codec;
    573     PyObject *v;
    574 
    575     codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
    576     if (codec == NULL)
    577         return NULL;
    578 
    579     v = PyTuple_GET_ITEM(codec, index);
    580     Py_INCREF(v);
    581     Py_DECREF(codec);
    582     return v;
    583 }
    584 
    585 static PyObject * _PyCodec_TextEncoder(const char *encoding)
    586 {
    587     return codec_getitem_checked(encoding, "codecs.encode()", 0);
    588 }
    589 
    590 static PyObject * _PyCodec_TextDecoder(const char *encoding)
    591 {
    592     return codec_getitem_checked(encoding, "codecs.decode()", 1);
    593 }
    594 
    595 PyObject *_PyCodec_EncodeText(PyObject *object,
    596                               const char *encoding,
    597                               const char *errors)
    598 {
    599     PyObject *encoder;
    600 
    601     encoder = _PyCodec_TextEncoder(encoding);
    602     if (encoder == NULL)
    603         return NULL;
    604 
    605     return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    606 }
    607 
    608 PyObject *_PyCodec_DecodeText(PyObject *object,
    609                               const char *encoding,
    610                               const char *errors)
    611 {
    612     PyObject *decoder;
    613 
    614     decoder = _PyCodec_TextDecoder(encoding);
    615     if (decoder == NULL)
    616         return NULL;
    617 
    618     return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    619 }
    620 
    621 /* Register the error handling callback function error under the name
    622    name. This function will be called by the codec when it encounters
    623    an unencodable characters/undecodable bytes and doesn't know the
    624    callback name, when name is specified as the error parameter
    625    in the call to the encode/decode function.
    626    Return 0 on success, -1 on error */
    627 int PyCodec_RegisterError(const char *name, PyObject *error)
    628 {
    629     PyInterpreterState *interp = PyThreadState_GET()->interp;
    630     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
    631         return -1;
    632     if (!PyCallable_Check(error)) {
    633         PyErr_SetString(PyExc_TypeError, "handler must be callable");
    634         return -1;
    635     }
    636     return PyDict_SetItemString(interp->codec_error_registry,
    637                                 name, error);
    638 }
    639 
    640 /* Lookup the error handling callback function registered under the
    641    name error. As a special case NULL can be passed, in which case
    642    the error handling callback for strict encoding will be returned. */
    643 PyObject *PyCodec_LookupError(const char *name)
    644 {
    645     PyObject *handler = NULL;
    646 
    647     PyInterpreterState *interp = PyThreadState_GET()->interp;
    648     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
    649         return NULL;
    650 
    651     if (name==NULL)
    652         name = "strict";
    653     handler = PyDict_GetItemString(interp->codec_error_registry, name);
    654     if (!handler)
    655         PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
    656     else
    657         Py_INCREF(handler);
    658     return handler;
    659 }
    660 
    661 static void wrong_exception_type(PyObject *exc)
    662 {
    663     PyErr_Format(PyExc_TypeError,
    664                  "don't know how to handle %.200s in error callback",
    665                  exc->ob_type->tp_name);
    666 }
    667 
    668 PyObject *PyCodec_StrictErrors(PyObject *exc)
    669 {
    670     if (PyExceptionInstance_Check(exc))
    671         PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    672     else
    673         PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
    674     return NULL;
    675 }
    676 
    677 
    678 PyObject *PyCodec_IgnoreErrors(PyObject *exc)
    679 {
    680     Py_ssize_t end;
    681 
    682     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
    683         if (PyUnicodeEncodeError_GetEnd(exc, &end))
    684             return NULL;
    685     }
    686     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
    687         if (PyUnicodeDecodeError_GetEnd(exc, &end))
    688             return NULL;
    689     }
    690     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
    691         if (PyUnicodeTranslateError_GetEnd(exc, &end))
    692             return NULL;
    693     }
    694     else {
    695         wrong_exception_type(exc);
    696         return NULL;
    697     }
    698     return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
    699 }
    700 
    701 
    702 PyObject *PyCodec_ReplaceErrors(PyObject *exc)
    703 {
    704     Py_ssize_t start, end, i, len;
    705 
    706     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
    707         PyObject *res;
    708         int kind;
    709         void *data;
    710         if (PyUnicodeEncodeError_GetStart(exc, &start))
    711             return NULL;
    712         if (PyUnicodeEncodeError_GetEnd(exc, &end))
    713             return NULL;
    714         len = end - start;
    715         res = PyUnicode_New(len, '?');
    716         if (res == NULL)
    717             return NULL;
    718         kind = PyUnicode_KIND(res);
    719         data = PyUnicode_DATA(res);
    720         for (i = 0; i < len; ++i)
    721             PyUnicode_WRITE(kind, data, i, '?');
    722         assert(_PyUnicode_CheckConsistency(res, 1));
    723         return Py_BuildValue("(Nn)", res, end);
    724     }
    725     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
    726         if (PyUnicodeDecodeError_GetEnd(exc, &end))
    727             return NULL;
    728         return Py_BuildValue("(Cn)",
    729                              (int)Py_UNICODE_REPLACEMENT_CHARACTER,
    730                              end);
    731     }
    732     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
    733         PyObject *res;
    734         int kind;
    735         void *data;
    736         if (PyUnicodeTranslateError_GetStart(exc, &start))
    737             return NULL;
    738         if (PyUnicodeTranslateError_GetEnd(exc, &end))
    739             return NULL;
    740         len = end - start;
    741         res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
    742         if (res == NULL)
    743             return NULL;
    744         kind = PyUnicode_KIND(res);
    745         data = PyUnicode_DATA(res);
    746         for (i=0; i < len; i++)
    747             PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
    748         assert(_PyUnicode_CheckConsistency(res, 1));
    749         return Py_BuildValue("(Nn)", res, end);
    750     }
    751     else {
    752         wrong_exception_type(exc);
    753         return NULL;
    754     }
    755 }
    756 
    757 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
    758 {
    759     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
    760         PyObject *restuple;
    761         PyObject *object;
    762         Py_ssize_t i;
    763         Py_ssize_t start;
    764         Py_ssize_t end;
    765         PyObject *res;
    766         unsigned char *outp;
    767         Py_ssize_t ressize;
    768         Py_UCS4 ch;
    769         if (PyUnicodeEncodeError_GetStart(exc, &start))
    770             return NULL;
    771         if (PyUnicodeEncodeError_GetEnd(exc, &end))
    772             return NULL;
    773         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
    774             return NULL;
    775         if (end - start > PY_SSIZE_T_MAX / (2+7+1))
    776             end = start + PY_SSIZE_T_MAX / (2+7+1);
    777         for (i = start, ressize = 0; i < end; ++i) {
    778             /* object is guaranteed to be "ready" */
    779             ch = PyUnicode_READ_CHAR(object, i);
    780             if (ch<10)
    781                 ressize += 2+1+1;
    782             else if (ch<100)
    783                 ressize += 2+2+1;
    784             else if (ch<1000)
    785                 ressize += 2+3+1;
    786             else if (ch<10000)
    787                 ressize += 2+4+1;
    788             else if (ch<100000)
    789                 ressize += 2+5+1;
    790             else if (ch<1000000)
    791                 ressize += 2+6+1;
    792             else
    793                 ressize += 2+7+1;
    794         }
    795         /* allocate replacement */
    796         res = PyUnicode_New(ressize, 127);
    797         if (res == NULL) {
    798             Py_DECREF(object);
    799             return NULL;
    800         }
    801         outp = PyUnicode_1BYTE_DATA(res);
    802         /* generate replacement */
    803         for (i = start; i < end; ++i) {
    804             int digits;
    805             int base;
    806             ch = PyUnicode_READ_CHAR(object, i);
    807             *outp++ = '&';
    808             *outp++ = '#';
    809             if (ch<10) {
    810                 digits = 1;
    811                 base = 1;
    812             }
    813             else if (ch<100) {
    814                 digits = 2;
    815                 base = 10;
    816             }
    817             else if (ch<1000) {
    818                 digits = 3;
    819                 base = 100;
    820             }
    821             else if (ch<10000) {
    822                 digits = 4;
    823                 base = 1000;
    824             }
    825             else if (ch<100000) {
    826                 digits = 5;
    827                 base = 10000;
    828             }
    829             else if (ch<1000000) {
    830                 digits = 6;
    831                 base = 100000;
    832             }
    833             else {
    834                 digits = 7;
    835                 base = 1000000;
    836             }
    837             while (digits-->0) {
    838                 *outp++ = '0' + ch/base;
    839                 ch %= base;
    840                 base /= 10;
    841             }
    842             *outp++ = ';';
    843         }
    844         assert(_PyUnicode_CheckConsistency(res, 1));
    845         restuple = Py_BuildValue("(Nn)", res, end);
    846         Py_DECREF(object);
    847         return restuple;
    848     }
    849     else {
    850         wrong_exception_type(exc);
    851         return NULL;
    852     }
    853 }
    854 
    855 PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
    856 {
    857     PyObject *object;
    858     Py_ssize_t i;
    859     Py_ssize_t start;
    860     Py_ssize_t end;
    861     PyObject *res;
    862     unsigned char *outp;
    863     int ressize;
    864     Py_UCS4 c;
    865 
    866     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
    867         const unsigned char *p;
    868         if (PyUnicodeDecodeError_GetStart(exc, &start))
    869             return NULL;
    870         if (PyUnicodeDecodeError_GetEnd(exc, &end))
    871             return NULL;
    872         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
    873             return NULL;
    874         p = (const unsigned char*)PyBytes_AS_STRING(object);
    875         res = PyUnicode_New(4 * (end - start), 127);
    876         if (res == NULL) {
    877             Py_DECREF(object);
    878             return NULL;
    879         }
    880         outp = PyUnicode_1BYTE_DATA(res);
    881         for (i = start; i < end; i++, outp += 4) {
    882             unsigned char c = p[i];
    883             outp[0] = '\\';
    884             outp[1] = 'x';
    885             outp[2] = Py_hexdigits[(c>>4)&0xf];
    886             outp[3] = Py_hexdigits[c&0xf];
    887         }
    888 
    889         assert(_PyUnicode_CheckConsistency(res, 1));
    890         Py_DECREF(object);
    891         return Py_BuildValue("(Nn)", res, end);
    892     }
    893     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
    894         if (PyUnicodeEncodeError_GetStart(exc, &start))
    895             return NULL;
    896         if (PyUnicodeEncodeError_GetEnd(exc, &end))
    897             return NULL;
    898         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
    899             return NULL;
    900     }
    901     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
    902         if (PyUnicodeTranslateError_GetStart(exc, &start))
    903             return NULL;
    904         if (PyUnicodeTranslateError_GetEnd(exc, &end))
    905             return NULL;
    906         if (!(object = PyUnicodeTranslateError_GetObject(exc)))
    907             return NULL;
    908     }
    909     else {
    910         wrong_exception_type(exc);
    911         return NULL;
    912     }
    913 
    914     if (end - start > PY_SSIZE_T_MAX / (1+1+8))
    915         end = start + PY_SSIZE_T_MAX / (1+1+8);
    916     for (i = start, ressize = 0; i < end; ++i) {
    917         /* object is guaranteed to be "ready" */
    918         c = PyUnicode_READ_CHAR(object, i);
    919         if (c >= 0x10000) {
    920             ressize += 1+1+8;
    921         }
    922         else if (c >= 0x100) {
    923             ressize += 1+1+4;
    924         }
    925         else
    926             ressize += 1+1+2;
    927     }
    928     res = PyUnicode_New(ressize, 127);
    929     if (res == NULL) {
    930         Py_DECREF(object);
    931         return NULL;
    932     }
    933     outp = PyUnicode_1BYTE_DATA(res);
    934     for (i = start; i < end; ++i) {
    935         c = PyUnicode_READ_CHAR(object, i);
    936         *outp++ = '\\';
    937         if (c >= 0x00010000) {
    938             *outp++ = 'U';
    939             *outp++ = Py_hexdigits[(c>>28)&0xf];
    940             *outp++ = Py_hexdigits[(c>>24)&0xf];
    941             *outp++ = Py_hexdigits[(c>>20)&0xf];
    942             *outp++ = Py_hexdigits[(c>>16)&0xf];
    943             *outp++ = Py_hexdigits[(c>>12)&0xf];
    944             *outp++ = Py_hexdigits[(c>>8)&0xf];
    945         }
    946         else if (c >= 0x100) {
    947             *outp++ = 'u';
    948             *outp++ = Py_hexdigits[(c>>12)&0xf];
    949             *outp++ = Py_hexdigits[(c>>8)&0xf];
    950         }
    951         else
    952             *outp++ = 'x';
    953         *outp++ = Py_hexdigits[(c>>4)&0xf];
    954         *outp++ = Py_hexdigits[c&0xf];
    955     }
    956 
    957     assert(_PyUnicode_CheckConsistency(res, 1));
    958     Py_DECREF(object);
    959     return Py_BuildValue("(Nn)", res, end);
    960 }
    961 
    962 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
    963 
    964 PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
    965 {
    966     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
    967         PyObject *restuple;
    968         PyObject *object;
    969         Py_ssize_t i;
    970         Py_ssize_t start;
    971         Py_ssize_t end;
    972         PyObject *res;
    973         unsigned char *outp;
    974         Py_ssize_t ressize;
    975         int replsize;
    976         Py_UCS4 c;
    977         char buffer[256]; /* NAME_MAXLEN */
    978         if (PyUnicodeEncodeError_GetStart(exc, &start))
    979             return NULL;
    980         if (PyUnicodeEncodeError_GetEnd(exc, &end))
    981             return NULL;
    982         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
    983             return NULL;
    984         if (!ucnhash_CAPI) {
    985             /* load the unicode data module */
    986             ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
    987                                             PyUnicodeData_CAPSULE_NAME, 1);
    988             if (!ucnhash_CAPI)
    989                 return NULL;
    990         }
    991         for (i = start, ressize = 0; i < end; ++i) {
    992             /* object is guaranteed to be "ready" */
    993             c = PyUnicode_READ_CHAR(object, i);
    994             if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
    995                 replsize = 1+1+1+(int)strlen(buffer)+1;
    996             }
    997             else if (c >= 0x10000) {
    998                 replsize = 1+1+8;
    999             }
   1000             else if (c >= 0x100) {
   1001                 replsize = 1+1+4;
   1002             }
   1003             else
   1004                 replsize = 1+1+2;
   1005             if (ressize > PY_SSIZE_T_MAX - replsize)
   1006                 break;
   1007             ressize += replsize;
   1008         }
   1009         end = i;
   1010         res = PyUnicode_New(ressize, 127);
   1011         if (res==NULL)
   1012             return NULL;
   1013         for (i = start, outp = PyUnicode_1BYTE_DATA(res);
   1014             i < end; ++i) {
   1015             c = PyUnicode_READ_CHAR(object, i);
   1016             *outp++ = '\\';
   1017             if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
   1018                 *outp++ = 'N';
   1019                 *outp++ = '{';
   1020                 strcpy((char *)outp, buffer);
   1021                 outp += strlen(buffer);
   1022                 *outp++ = '}';
   1023                 continue;
   1024             }
   1025             if (c >= 0x00010000) {
   1026                 *outp++ = 'U';
   1027                 *outp++ = Py_hexdigits[(c>>28)&0xf];
   1028                 *outp++ = Py_hexdigits[(c>>24)&0xf];
   1029                 *outp++ = Py_hexdigits[(c>>20)&0xf];
   1030                 *outp++ = Py_hexdigits[(c>>16)&0xf];
   1031                 *outp++ = Py_hexdigits[(c>>12)&0xf];
   1032                 *outp++ = Py_hexdigits[(c>>8)&0xf];
   1033             }
   1034             else if (c >= 0x100) {
   1035                 *outp++ = 'u';
   1036                 *outp++ = Py_hexdigits[(c>>12)&0xf];
   1037                 *outp++ = Py_hexdigits[(c>>8)&0xf];
   1038             }
   1039             else
   1040                 *outp++ = 'x';
   1041             *outp++ = Py_hexdigits[(c>>4)&0xf];
   1042             *outp++ = Py_hexdigits[c&0xf];
   1043         }
   1044 
   1045         assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
   1046         assert(_PyUnicode_CheckConsistency(res, 1));
   1047         restuple = Py_BuildValue("(Nn)", res, end);
   1048         Py_DECREF(object);
   1049         return restuple;
   1050     }
   1051     else {
   1052         wrong_exception_type(exc);
   1053         return NULL;
   1054     }
   1055 }
   1056 
   1057 #define ENC_UNKNOWN     -1
   1058 #define ENC_UTF8        0
   1059 #define ENC_UTF16BE     1
   1060 #define ENC_UTF16LE     2
   1061 #define ENC_UTF32BE     3
   1062 #define ENC_UTF32LE     4
   1063 
   1064 static int
   1065 get_standard_encoding(const char *encoding, int *bytelength)
   1066 {
   1067     if (Py_TOLOWER(encoding[0]) == 'u' &&
   1068         Py_TOLOWER(encoding[1]) == 't' &&
   1069         Py_TOLOWER(encoding[2]) == 'f') {
   1070         encoding += 3;
   1071         if (*encoding == '-' || *encoding == '_' )
   1072             encoding++;
   1073         if (encoding[0] == '8' && encoding[1] == '\0') {
   1074             *bytelength = 3;
   1075             return ENC_UTF8;
   1076         }
   1077         else if (encoding[0] == '1' && encoding[1] == '6') {
   1078             encoding += 2;
   1079             *bytelength = 2;
   1080             if (*encoding == '\0') {
   1081 #ifdef WORDS_BIGENDIAN
   1082                 return ENC_UTF16BE;
   1083 #else
   1084                 return ENC_UTF16LE;
   1085 #endif
   1086             }
   1087             if (*encoding == '-' || *encoding == '_' )
   1088                 encoding++;
   1089             if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
   1090                 if (Py_TOLOWER(encoding[0]) == 'b')
   1091                     return ENC_UTF16BE;
   1092                 if (Py_TOLOWER(encoding[0]) == 'l')
   1093                     return ENC_UTF16LE;
   1094             }
   1095         }
   1096         else if (encoding[0] == '3' && encoding[1] == '2') {
   1097             encoding += 2;
   1098             *bytelength = 4;
   1099             if (*encoding == '\0') {
   1100 #ifdef WORDS_BIGENDIAN
   1101                 return ENC_UTF32BE;
   1102 #else
   1103                 return ENC_UTF32LE;
   1104 #endif
   1105             }
   1106             if (*encoding == '-' || *encoding == '_' )
   1107                 encoding++;
   1108             if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
   1109                 if (Py_TOLOWER(encoding[0]) == 'b')
   1110                     return ENC_UTF32BE;
   1111                 if (Py_TOLOWER(encoding[0]) == 'l')
   1112                     return ENC_UTF32LE;
   1113             }
   1114         }
   1115     }
   1116     else if (strcmp(encoding, "CP_UTF8") == 0) {
   1117         *bytelength = 3;
   1118         return ENC_UTF8;
   1119     }
   1120     return ENC_UNKNOWN;
   1121 }
   1122 
   1123 /* This handler is declared static until someone demonstrates
   1124    a need to call it directly. */
   1125 static PyObject *
   1126 PyCodec_SurrogatePassErrors(PyObject *exc)
   1127 {
   1128     PyObject *restuple;
   1129     PyObject *object;
   1130     PyObject *encode;
   1131     const char *encoding;
   1132     int code;
   1133     int bytelength;
   1134     Py_ssize_t i;
   1135     Py_ssize_t start;
   1136     Py_ssize_t end;
   1137     PyObject *res;
   1138 
   1139     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
   1140         unsigned char *outp;
   1141         if (PyUnicodeEncodeError_GetStart(exc, &start))
   1142             return NULL;
   1143         if (PyUnicodeEncodeError_GetEnd(exc, &end))
   1144             return NULL;
   1145         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
   1146             return NULL;
   1147         if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
   1148             Py_DECREF(object);
   1149             return NULL;
   1150         }
   1151         if (!(encoding = PyUnicode_AsUTF8(encode))) {
   1152             Py_DECREF(object);
   1153             Py_DECREF(encode);
   1154             return NULL;
   1155         }
   1156         code = get_standard_encoding(encoding, &bytelength);
   1157         Py_DECREF(encode);
   1158         if (code == ENC_UNKNOWN) {
   1159             /* Not supported, fail with original exception */
   1160             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
   1161             Py_DECREF(object);
   1162             return NULL;
   1163         }
   1164 
   1165         if (end - start > PY_SSIZE_T_MAX / bytelength)
   1166             end = start + PY_SSIZE_T_MAX / bytelength;
   1167         res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
   1168         if (!res) {
   1169             Py_DECREF(object);
   1170             return NULL;
   1171         }
   1172         outp = (unsigned char*)PyBytes_AsString(res);
   1173         for (i = start; i < end; i++) {
   1174             /* object is guaranteed to be "ready" */
   1175             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
   1176             if (!Py_UNICODE_IS_SURROGATE(ch)) {
   1177                 /* Not a surrogate, fail with original exception */
   1178                 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
   1179                 Py_DECREF(res);
   1180                 Py_DECREF(object);
   1181                 return NULL;
   1182             }
   1183             switch (code) {
   1184             case ENC_UTF8:
   1185                 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
   1186                 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
   1187                 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
   1188                 break;
   1189             case ENC_UTF16LE:
   1190                 *outp++ = (unsigned char) ch;
   1191                 *outp++ = (unsigned char)(ch >> 8);
   1192                 break;
   1193             case ENC_UTF16BE:
   1194                 *outp++ = (unsigned char)(ch >> 8);
   1195                 *outp++ = (unsigned char) ch;
   1196                 break;
   1197             case ENC_UTF32LE:
   1198                 *outp++ = (unsigned char) ch;
   1199                 *outp++ = (unsigned char)(ch >> 8);
   1200                 *outp++ = (unsigned char)(ch >> 16);
   1201                 *outp++ = (unsigned char)(ch >> 24);
   1202                 break;
   1203             case ENC_UTF32BE:
   1204                 *outp++ = (unsigned char)(ch >> 24);
   1205                 *outp++ = (unsigned char)(ch >> 16);
   1206                 *outp++ = (unsigned char)(ch >> 8);
   1207                 *outp++ = (unsigned char) ch;
   1208                 break;
   1209             }
   1210         }
   1211         restuple = Py_BuildValue("(On)", res, end);
   1212         Py_DECREF(res);
   1213         Py_DECREF(object);
   1214         return restuple;
   1215     }
   1216     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
   1217         const unsigned char *p;
   1218         Py_UCS4 ch = 0;
   1219         if (PyUnicodeDecodeError_GetStart(exc, &start))
   1220             return NULL;
   1221         if (PyUnicodeDecodeError_GetEnd(exc, &end))
   1222             return NULL;
   1223         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
   1224             return NULL;
   1225         p = (const unsigned char*)PyBytes_AS_STRING(object);
   1226         if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
   1227             Py_DECREF(object);
   1228             return NULL;
   1229         }
   1230         if (!(encoding = PyUnicode_AsUTF8(encode))) {
   1231             Py_DECREF(object);
   1232             Py_DECREF(encode);
   1233             return NULL;
   1234         }
   1235         code = get_standard_encoding(encoding, &bytelength);
   1236         Py_DECREF(encode);
   1237         if (code == ENC_UNKNOWN) {
   1238             /* Not supported, fail with original exception */
   1239             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
   1240             Py_DECREF(object);
   1241             return NULL;
   1242         }
   1243 
   1244         /* Try decoding a single surrogate character. If
   1245            there are more, let the codec call us again. */
   1246         p += start;
   1247         if (PyBytes_GET_SIZE(object) - start >= bytelength) {
   1248             switch (code) {
   1249             case ENC_UTF8:
   1250                 if ((p[0] & 0xf0) == 0xe0 &&
   1251                     (p[1] & 0xc0) == 0x80 &&
   1252                     (p[2] & 0xc0) == 0x80) {
   1253                     /* it's a three-byte code */
   1254                     ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
   1255                 }
   1256                 break;
   1257             case ENC_UTF16LE:
   1258                 ch = p[1] << 8 | p[0];
   1259                 break;
   1260             case ENC_UTF16BE:
   1261                 ch = p[0] << 8 | p[1];
   1262                 break;
   1263             case ENC_UTF32LE:
   1264                 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
   1265                 break;
   1266             case ENC_UTF32BE:
   1267                 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
   1268                 break;
   1269             }
   1270         }
   1271 
   1272         Py_DECREF(object);
   1273         if (!Py_UNICODE_IS_SURROGATE(ch)) {
   1274             /* it's not a surrogate - fail */
   1275             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
   1276             return NULL;
   1277         }
   1278         res = PyUnicode_FromOrdinal(ch);
   1279         if (res == NULL)
   1280             return NULL;
   1281         return Py_BuildValue("(Nn)", res, start + bytelength);
   1282     }
   1283     else {
   1284         wrong_exception_type(exc);
   1285         return NULL;
   1286     }
   1287 }
   1288 
   1289 static PyObject *
   1290 PyCodec_SurrogateEscapeErrors(PyObject *exc)
   1291 {
   1292     PyObject *restuple;
   1293     PyObject *object;
   1294     Py_ssize_t i;
   1295     Py_ssize_t start;
   1296     Py_ssize_t end;
   1297     PyObject *res;
   1298 
   1299     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
   1300         char *outp;
   1301         if (PyUnicodeEncodeError_GetStart(exc, &start))
   1302             return NULL;
   1303         if (PyUnicodeEncodeError_GetEnd(exc, &end))
   1304             return NULL;
   1305         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
   1306             return NULL;
   1307         res = PyBytes_FromStringAndSize(NULL, end-start);
   1308         if (!res) {
   1309             Py_DECREF(object);
   1310             return NULL;
   1311         }
   1312         outp = PyBytes_AsString(res);
   1313         for (i = start; i < end; i++) {
   1314             /* object is guaranteed to be "ready" */
   1315             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
   1316             if (ch < 0xdc80 || ch > 0xdcff) {
   1317                 /* Not a UTF-8b surrogate, fail with original exception */
   1318                 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
   1319                 Py_DECREF(res);
   1320                 Py_DECREF(object);
   1321                 return NULL;
   1322             }
   1323             *outp++ = ch - 0xdc00;
   1324         }
   1325         restuple = Py_BuildValue("(On)", res, end);
   1326         Py_DECREF(res);
   1327         Py_DECREF(object);
   1328         return restuple;
   1329     }
   1330     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
   1331         PyObject *str;
   1332         const unsigned char *p;
   1333         Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
   1334         int consumed = 0;
   1335         if (PyUnicodeDecodeError_GetStart(exc, &start))
   1336             return NULL;
   1337         if (PyUnicodeDecodeError_GetEnd(exc, &end))
   1338             return NULL;
   1339         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
   1340             return NULL;
   1341         p = (const unsigned char*)PyBytes_AS_STRING(object);
   1342         while (consumed < 4 && consumed < end-start) {
   1343             /* Refuse to escape ASCII bytes. */
   1344             if (p[start+consumed] < 128)
   1345                 break;
   1346             ch[consumed] = 0xdc00 + p[start+consumed];
   1347             consumed++;
   1348         }
   1349         Py_DECREF(object);
   1350         if (!consumed) {
   1351             /* codec complained about ASCII byte. */
   1352             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
   1353             return NULL;
   1354         }
   1355         str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
   1356         if (str == NULL)
   1357             return NULL;
   1358         return Py_BuildValue("(Nn)", str, start+consumed);
   1359     }
   1360     else {
   1361         wrong_exception_type(exc);
   1362         return NULL;
   1363     }
   1364 }
   1365 
   1366 
   1367 static PyObject *strict_errors(PyObject *self, PyObject *exc)
   1368 {
   1369     return PyCodec_StrictErrors(exc);
   1370 }
   1371 
   1372 
   1373 static PyObject *ignore_errors(PyObject *self, PyObject *exc)
   1374 {
   1375     return PyCodec_IgnoreErrors(exc);
   1376 }
   1377 
   1378 
   1379 static PyObject *replace_errors(PyObject *self, PyObject *exc)
   1380 {
   1381     return PyCodec_ReplaceErrors(exc);
   1382 }
   1383 
   1384 
   1385 static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
   1386 {
   1387     return PyCodec_XMLCharRefReplaceErrors(exc);
   1388 }
   1389 
   1390 
   1391 static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
   1392 {
   1393     return PyCodec_BackslashReplaceErrors(exc);
   1394 }
   1395 
   1396 static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
   1397 {
   1398     return PyCodec_NameReplaceErrors(exc);
   1399 }
   1400 
   1401 static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
   1402 {
   1403     return PyCodec_SurrogatePassErrors(exc);
   1404 }
   1405 
   1406 static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
   1407 {
   1408     return PyCodec_SurrogateEscapeErrors(exc);
   1409 }
   1410 
   1411 static int _PyCodecRegistry_Init(void)
   1412 {
   1413     static struct {
   1414         char *name;
   1415         PyMethodDef def;
   1416     } methods[] =
   1417     {
   1418         {
   1419             "strict",
   1420             {
   1421                 "strict_errors",
   1422                 strict_errors,
   1423                 METH_O,
   1424                 PyDoc_STR("Implements the 'strict' error handling, which "
   1425                           "raises a UnicodeError on coding errors.")
   1426             }
   1427         },
   1428         {
   1429             "ignore",
   1430             {
   1431                 "ignore_errors",
   1432                 ignore_errors,
   1433                 METH_O,
   1434                 PyDoc_STR("Implements the 'ignore' error handling, which "
   1435                           "ignores malformed data and continues.")
   1436             }
   1437         },
   1438         {
   1439             "replace",
   1440             {
   1441                 "replace_errors",
   1442                 replace_errors,
   1443                 METH_O,
   1444                 PyDoc_STR("Implements the 'replace' error handling, which "
   1445                           "replaces malformed data with a replacement marker.")
   1446             }
   1447         },
   1448         {
   1449             "xmlcharrefreplace",
   1450             {
   1451                 "xmlcharrefreplace_errors",
   1452                 xmlcharrefreplace_errors,
   1453                 METH_O,
   1454                 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
   1455                           "which replaces an unencodable character with the "
   1456                           "appropriate XML character reference.")
   1457             }
   1458         },
   1459         {
   1460             "backslashreplace",
   1461             {
   1462                 "backslashreplace_errors",
   1463                 backslashreplace_errors,
   1464                 METH_O,
   1465                 PyDoc_STR("Implements the 'backslashreplace' error handling, "
   1466                           "which replaces malformed data with a backslashed "
   1467                           "escape sequence.")
   1468             }
   1469         },
   1470         {
   1471             "namereplace",
   1472             {
   1473                 "namereplace_errors",
   1474                 namereplace_errors,
   1475                 METH_O,
   1476                 PyDoc_STR("Implements the 'namereplace' error handling, "
   1477                           "which replaces an unencodable character with a "
   1478                           "\\N{...} escape sequence.")
   1479             }
   1480         },
   1481         {
   1482             "surrogatepass",
   1483             {
   1484                 "surrogatepass",
   1485                 surrogatepass_errors,
   1486                 METH_O
   1487             }
   1488         },
   1489         {
   1490             "surrogateescape",
   1491             {
   1492                 "surrogateescape",
   1493                 surrogateescape_errors,
   1494                 METH_O
   1495             }
   1496         }
   1497     };
   1498 
   1499     PyInterpreterState *interp = PyThreadState_GET()->interp;
   1500     PyObject *mod;
   1501     unsigned i;
   1502 
   1503     if (interp->codec_search_path != NULL)
   1504         return 0;
   1505 
   1506     interp->codec_search_path = PyList_New(0);
   1507     interp->codec_search_cache = PyDict_New();
   1508     interp->codec_error_registry = PyDict_New();
   1509 
   1510     if (interp->codec_error_registry) {
   1511         for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
   1512             PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
   1513             int res;
   1514             if (!func)
   1515                 Py_FatalError("can't initialize codec error registry");
   1516             res = PyCodec_RegisterError(methods[i].name, func);
   1517             Py_DECREF(func);
   1518             if (res)
   1519                 Py_FatalError("can't initialize codec error registry");
   1520         }
   1521     }
   1522 
   1523     if (interp->codec_search_path == NULL ||
   1524         interp->codec_search_cache == NULL ||
   1525         interp->codec_error_registry == NULL)
   1526         Py_FatalError("can't initialize codec registry");
   1527 
   1528     mod = PyImport_ImportModuleNoBlock("encodings");
   1529     if (mod == NULL) {
   1530         return -1;
   1531     }
   1532     Py_DECREF(mod);
   1533     interp->codecs_initialized = 1;
   1534     return 0;
   1535 }
   1536