Home | History | Annotate | Download | only in Python
      1 /* ------------------------------------------------------------------------
      2 
      3    Python Codec Registry and support functions
      4 
      5 Written by Marc-Andre Lemburg (mal (at) lemburg.com).
      6 
      7 Copyright (c) Corporation for National Research Initiatives.
      8 
      9    ------------------------------------------------------------------------ */
     10 
     11 #include "Python.h"
     12 #include <ctype.h>
     13 
     14 /* --- Codec Registry ----------------------------------------------------- */
     15 
     16 /* Import the standard encodings package which will register the first
     17    codec search function.
     18 
   This is done lazily so that the Unicode implementation does not
   degrade the startup time of scripts that do not need it.
     21 
     22    ImportErrors are silently ignored by this function. Only one try is
     23    made.
     24 
     25 */
     26 
     27 static int _PyCodecRegistry_Init(void); /* Forward */
     28 
     29 int PyCodec_Register(PyObject *search_function)
     30 {
     31     PyInterpreterState *interp = PyThreadState_GET()->interp;
     32     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
     33         goto onError;
     34     if (search_function == NULL) {
     35         PyErr_BadArgument();
     36         goto onError;
     37     }
     38     if (!PyCallable_Check(search_function)) {
     39         PyErr_SetString(PyExc_TypeError, "argument must be callable");
     40         goto onError;
     41     }
     42     return PyList_Append(interp->codec_search_path, search_function);
     43 
     44  onError:
     45     return -1;
     46 }
     47 
     48 /* Convert a string to a normalized Python string: all characters are
     49    converted to lower case, spaces are replaced with underscores. */
     50 
     51 static
     52 PyObject *normalizestring(const char *string)
     53 {
     54     register size_t i;
     55     size_t len = strlen(string);
     56     char *p;
     57     PyObject *v;
     58 
     59     if (len > PY_SSIZE_T_MAX) {
     60         PyErr_SetString(PyExc_OverflowError, "string is too large");
     61         return NULL;
     62     }
     63 
     64     v = PyString_FromStringAndSize(NULL, len);
     65     if (v == NULL)
     66         return NULL;
     67     p = PyString_AS_STRING(v);
     68     for (i = 0; i < len; i++) {
     69         register char ch = string[i];
     70         if (ch == ' ')
     71             ch = '-';
     72         else
     73             ch = Py_TOLOWER(Py_CHARMASK(ch));
     74         p[i] = ch;
     75     }
     76     return v;
     77 }
     78 
     79 /* Lookup the given encoding and return a tuple providing the codec
     80    facilities.
     81 
     82    The encoding string is looked up converted to all lower-case
     83    characters. This makes encodings looked up through this mechanism
     84    effectively case-insensitive.
     85 
     86    If no codec is found, a LookupError is set and NULL returned.
     87 
     88    As side effect, this tries to load the encodings package, if not
     89    yet done. This is part of the lazy load strategy for the encodings
     90    package.
     91 
     92 */
     93 
     94 PyObject *_PyCodec_Lookup(const char *encoding)
     95 {
     96     PyInterpreterState *interp;
     97     PyObject *result, *args = NULL, *v;
     98     Py_ssize_t i, len;
     99 
    100     if (encoding == NULL) {
    101         PyErr_BadArgument();
    102         goto onError;
    103     }
    104 
    105     interp = PyThreadState_GET()->interp;
    106     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
    107         goto onError;
    108 
    109     /* Convert the encoding to a normalized Python string: all
    110        characters are converted to lower case, spaces and hyphens are
    111        replaced with underscores. */
    112     v = normalizestring(encoding);
    113     if (v == NULL)
    114         goto onError;
    115     PyString_InternInPlace(&v);
    116 
    117     /* First, try to lookup the name in the registry dictionary */
    118     result = PyDict_GetItem(interp->codec_search_cache, v);
    119     if (result != NULL) {
    120         Py_INCREF(result);
    121         Py_DECREF(v);
    122         return result;
    123     }
    124 
    125     /* Next, scan the search functions in order of registration */
    126     args = PyTuple_New(1);
    127     if (args == NULL)
    128         goto onError;
    129     PyTuple_SET_ITEM(args,0,v);
    130 
    131     len = PyList_Size(interp->codec_search_path);
    132     if (len < 0)
    133         goto onError;
    134     if (len == 0) {
    135         PyErr_SetString(PyExc_LookupError,
    136                         "no codec search functions registered: "
    137                         "can't find encoding");
    138         goto onError;
    139     }
    140 
    141     for (i = 0; i < len; i++) {
    142         PyObject *func;
    143 
    144         func = PyList_GetItem(interp->codec_search_path, i);
    145         if (func == NULL)
    146             goto onError;
    147         result = PyEval_CallObject(func, args);
    148         if (result == NULL)
    149             goto onError;
    150         if (result == Py_None) {
    151             Py_DECREF(result);
    152             continue;
    153         }
    154         if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
    155             PyErr_SetString(PyExc_TypeError,
    156                             "codec search functions must return 4-tuples");
    157             Py_DECREF(result);
    158             goto onError;
    159         }
    160         break;
    161     }
    162     if (i == len) {
    163         /* XXX Perhaps we should cache misses too ? */
    164         PyErr_Format(PyExc_LookupError,
    165                      "unknown encoding: %s", encoding);
    166         goto onError;
    167     }
    168 
    169     /* Cache and return the result */
    170     PyDict_SetItem(interp->codec_search_cache, v, result);
    171     Py_DECREF(args);
    172     return result;
    173 
    174  onError:
    175     Py_XDECREF(args);
    176     return NULL;
    177 }
    178 
    179 static
    180 PyObject *args_tuple(PyObject *object,
    181                      const char *errors)
    182 {
    183     PyObject *args;
    184 
    185     args = PyTuple_New(1 + (errors != NULL));
    186     if (args == NULL)
    187         return NULL;
    188     Py_INCREF(object);
    189     PyTuple_SET_ITEM(args,0,object);
    190     if (errors) {
    191         PyObject *v;
    192 
    193         v = PyString_FromString(errors);
    194         if (v == NULL) {
    195             Py_DECREF(args);
    196             return NULL;
    197         }
    198         PyTuple_SET_ITEM(args, 1, v);
    199     }
    200     return args;
    201 }
    202 
    203 /* Helper function to get a codec item */
    204 
    205 static
    206 PyObject *codec_getitem(const char *encoding, int index)
    207 {
    208     PyObject *codecs;
    209     PyObject *v;
    210 
    211     codecs = _PyCodec_Lookup(encoding);
    212     if (codecs == NULL)
    213         return NULL;
    214     v = PyTuple_GET_ITEM(codecs, index);
    215     Py_DECREF(codecs);
    216     Py_INCREF(v);
    217     return v;
    218 }
    219 
    220 /* Helper functions to create an incremental codec. */
    221 static
    222 PyObject *codec_makeincrementalcodec(PyObject *codec_info,
    223                                      const char *errors,
    224                                      const char *attrname)
    225 {
    226     PyObject *ret, *inccodec;
    227 
    228     inccodec = PyObject_GetAttrString(codec_info, attrname);
    229     if (inccodec == NULL)
    230         return NULL;
    231     if (errors)
    232         ret = PyObject_CallFunction(inccodec, "s", errors);
    233     else
    234         ret = PyObject_CallFunction(inccodec, NULL);
    235     Py_DECREF(inccodec);
    236     return ret;
    237 }
    238 
    239 static
    240 PyObject *codec_getincrementalcodec(const char *encoding,
    241                                     const char *errors,
    242                                     const char *attrname)
    243 {
    244     PyObject *codec_info, *ret;
    245 
    246     codec_info = _PyCodec_Lookup(encoding);
    247     if (codec_info == NULL)
    248         return NULL;
    249     ret = codec_makeincrementalcodec(codec_info, errors, attrname);
    250     Py_DECREF(codec_info);
    251     return ret;
    252 }
    253 
    254 /* Helper function to create a stream codec. */
    255 
    256 static
    257 PyObject *codec_getstreamcodec(const char *encoding,
    258                                PyObject *stream,
    259                                const char *errors,
    260                                const int index)
    261 {
    262     PyObject *codecs, *streamcodec, *codeccls;
    263 
    264     codecs = _PyCodec_Lookup(encoding);
    265     if (codecs == NULL)
    266         return NULL;
    267 
    268     codeccls = PyTuple_GET_ITEM(codecs, index);
    269     if (errors != NULL)
    270         streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
    271     else
    272         streamcodec = PyObject_CallFunction(codeccls, "O", stream);
    273     Py_DECREF(codecs);
    274     return streamcodec;
    275 }
    276 
    277 /* Helpers to work with the result of _PyCodec_Lookup
    278 
    279  */
    280 PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
    281                                              const char *errors)
    282 {
    283     return codec_makeincrementalcodec(codec_info, errors,
    284                                       "incrementaldecoder");
    285 }
    286 
    287 PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
    288                                              const char *errors)
    289 {
    290     return codec_makeincrementalcodec(codec_info, errors,
    291                                       "incrementalencoder");
    292 }
    293 
    294 
    295 /* Convenience APIs to query the Codec registry.
    296 
    297    All APIs return a codec object with incremented refcount.
    298 
    299  */
    300 
    301 PyObject *PyCodec_Encoder(const char *encoding)
    302 {
    303     return codec_getitem(encoding, 0);
    304 }
    305 
    306 PyObject *PyCodec_Decoder(const char *encoding)
    307 {
    308     return codec_getitem(encoding, 1);
    309 }
    310 
    311 PyObject *PyCodec_IncrementalEncoder(const char *encoding,
    312                                      const char *errors)
    313 {
    314     return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
    315 }
    316 
    317 PyObject *PyCodec_IncrementalDecoder(const char *encoding,
    318                                      const char *errors)
    319 {
    320     return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
    321 }
    322 
    323 PyObject *PyCodec_StreamReader(const char *encoding,
    324                                PyObject *stream,
    325                                const char *errors)
    326 {
    327     return codec_getstreamcodec(encoding, stream, errors, 2);
    328 }
    329 
    330 PyObject *PyCodec_StreamWriter(const char *encoding,
    331                                PyObject *stream,
    332                                const char *errors)
    333 {
    334     return codec_getstreamcodec(encoding, stream, errors, 3);
    335 }
    336 
    337 /* Encode an object (e.g. a Unicode object) using the given encoding
    338    and return the resulting encoded object (usually a Python string).
    339 
    340    errors is passed to the encoder factory as argument if non-NULL. */
    341 
    342 static PyObject *
    343 _PyCodec_EncodeInternal(PyObject *object,
    344                         PyObject *encoder,
    345                         const char *encoding,
    346                         const char *errors)
    347 {
    348     PyObject *args = NULL, *result = NULL;
    349     PyObject *v;
    350 
    351     args = args_tuple(object, errors);
    352     if (args == NULL)
    353         goto onError;
    354 
    355     result = PyEval_CallObject(encoder,args);
    356     if (result == NULL)
    357         goto onError;
    358 
    359     if (!PyTuple_Check(result) ||
    360         PyTuple_GET_SIZE(result) != 2) {
    361         PyErr_SetString(PyExc_TypeError,
    362                         "encoder must return a tuple (object,integer)");
    363         goto onError;
    364     }
    365     v = PyTuple_GET_ITEM(result,0);
    366     Py_INCREF(v);
    367     /* We don't check or use the second (integer) entry. */
    368 
    369     Py_DECREF(args);
    370     Py_DECREF(encoder);
    371     Py_DECREF(result);
    372     return v;
    373 
    374  onError:
    375     Py_XDECREF(result);
    376     Py_XDECREF(args);
    377     Py_XDECREF(encoder);
    378     return NULL;
    379 }
    380 
    381 /* Decode an object (usually a Python string) using the given encoding
    382    and return an equivalent object (e.g. a Unicode object).
    383 
    384    errors is passed to the decoder factory as argument if non-NULL. */
    385 
    386 static PyObject *
    387 _PyCodec_DecodeInternal(PyObject *object,
    388                         PyObject *decoder,
    389                         const char *encoding,
    390                         const char *errors)
    391 {
    392     PyObject *args = NULL, *result = NULL;
    393     PyObject *v;
    394 
    395     args = args_tuple(object, errors);
    396     if (args == NULL)
    397         goto onError;
    398 
    399     result = PyEval_CallObject(decoder,args);
    400     if (result == NULL)
    401         goto onError;
    402     if (!PyTuple_Check(result) ||
    403         PyTuple_GET_SIZE(result) != 2) {
    404         PyErr_SetString(PyExc_TypeError,
    405                         "decoder must return a tuple (object,integer)");
    406         goto onError;
    407     }
    408     v = PyTuple_GET_ITEM(result,0);
    409     Py_INCREF(v);
    410     /* We don't check or use the second (integer) entry. */
    411 
    412     Py_DECREF(args);
    413     Py_DECREF(decoder);
    414     Py_DECREF(result);
    415     return v;
    416 
    417  onError:
    418     Py_XDECREF(args);
    419     Py_XDECREF(decoder);
    420     Py_XDECREF(result);
    421     return NULL;
    422 }
    423 
    424 /* Generic encoding/decoding API */
    425 PyObject *PyCodec_Encode(PyObject *object,
    426                          const char *encoding,
    427                          const char *errors)
    428 {
    429     PyObject *encoder;
    430 
    431     encoder = PyCodec_Encoder(encoding);
    432     if (encoder == NULL)
    433         return NULL;
    434 
    435     return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    436 }
    437 
    438 PyObject *PyCodec_Decode(PyObject *object,
    439                          const char *encoding,
    440                          const char *errors)
    441 {
    442     PyObject *decoder;
    443 
    444     decoder = PyCodec_Decoder(encoding);
    445     if (decoder == NULL)
    446         return NULL;
    447 
    448     return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    449 }
    450 
    451 /* Text encoding/decoding API */
    452 PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
    453                                        const char *alternate_command)
    454 {
    455     PyObject *codec;
    456     PyObject *attr;
    457     int is_text_codec;
    458 
    459     codec = _PyCodec_Lookup(encoding);
    460     if (codec == NULL)
    461         return NULL;
    462 
    463     /* Backwards compatibility: assume any raw tuple describes a text
    464      * encoding, and the same for anything lacking the private
    465      * attribute.
    466      */
    467     if (Py_Py3kWarningFlag && !PyTuple_CheckExact(codec)) {
    468         attr = PyObject_GetAttrString(codec, "_is_text_encoding");
    469         if (attr == NULL) {
    470             if (!PyErr_ExceptionMatches(PyExc_AttributeError))
    471                 goto onError;
    472             PyErr_Clear();
    473         } else {
    474             is_text_codec = PyObject_IsTrue(attr);
    475             Py_DECREF(attr);
    476             if (is_text_codec < 0)
    477                 goto onError;
    478             if (!is_text_codec) {
    479                 PyObject *msg = PyString_FromFormat(
    480                             "'%.400s' is not a text encoding; "
    481                             "use %s to handle arbitrary codecs",
    482                             encoding, alternate_command);
    483                 if (msg == NULL)
    484                     goto onError;
    485                 if (PyErr_WarnPy3k(PyString_AS_STRING(msg), 1) < 0) {
    486                     Py_DECREF(msg);
    487                     goto onError;
    488                 }
    489                 Py_DECREF(msg);
    490             }
    491         }
    492     }
    493 
    494     /* This appears to be a valid text encoding */
    495     return codec;
    496 
    497  onError:
    498     Py_DECREF(codec);
    499     return NULL;
    500 }
    501 
    502 
    503 static
    504 PyObject *codec_getitem_checked(const char *encoding,
    505                                 const char *alternate_command,
    506                                 int index)
    507 {
    508     PyObject *codec;
    509     PyObject *v;
    510 
    511     codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
    512     if (codec == NULL)
    513         return NULL;
    514 
    515     v = PyTuple_GET_ITEM(codec, index);
    516     Py_INCREF(v);
    517     Py_DECREF(codec);
    518     return v;
    519 }
    520 
    521 static PyObject * _PyCodec_TextEncoder(const char *encoding)
    522 {
    523     return codec_getitem_checked(encoding, "codecs.encode()", 0);
    524 }
    525 
    526 static PyObject * _PyCodec_TextDecoder(const char *encoding)
    527 {
    528     return codec_getitem_checked(encoding, "codecs.decode()", 1);
    529 }
    530 
    531 PyObject *_PyCodec_EncodeText(PyObject *object,
    532                               const char *encoding,
    533                               const char *errors)
    534 {
    535     PyObject *encoder;
    536 
    537     encoder = _PyCodec_TextEncoder(encoding);
    538     if (encoder == NULL)
    539         return NULL;
    540 
    541     return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    542 }
    543 
    544 PyObject *_PyCodec_DecodeText(PyObject *object,
    545                               const char *encoding,
    546                               const char *errors)
    547 {
    548     PyObject *decoder;
    549 
    550     decoder = _PyCodec_TextDecoder(encoding);
    551     if (decoder == NULL)
    552         return NULL;
    553 
    554     return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    555 }
    556 
    557 /* Register the error handling callback function error under the name
    558    name. This function will be called by the codec when it encounters
    559    an unencodable characters/undecodable bytes and doesn't know the
    560    callback name, when name is specified as the error parameter
    561    in the call to the encode/decode function.
    562    Return 0 on success, -1 on error */
    563 int PyCodec_RegisterError(const char *name, PyObject *error)
    564 {
    565     PyInterpreterState *interp = PyThreadState_GET()->interp;
    566     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
    567         return -1;
    568     if (!PyCallable_Check(error)) {
    569         PyErr_SetString(PyExc_TypeError, "handler must be callable");
    570         return -1;
    571     }
    572     return PyDict_SetItemString(interp->codec_error_registry,
    573                                 (char *)name, error);
    574 }
    575 
    576 /* Lookup the error handling callback function registered under the
    577    name error. As a special case NULL can be passed, in which case
    578    the error handling callback for strict encoding will be returned. */
    579 PyObject *PyCodec_LookupError(const char *name)
    580 {
    581     PyObject *handler = NULL;
    582 
    583     PyInterpreterState *interp = PyThreadState_GET()->interp;
    584     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
    585         return NULL;
    586 
    587     if (name==NULL)
    588         name = "strict";
    589     handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
    590     if (!handler)
    591         PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
    592     else
    593         Py_INCREF(handler);
    594     return handler;
    595 }
    596 
    597 static void wrong_exception_type(PyObject *exc)
    598 {
    599     PyObject *type = PyObject_GetAttrString(exc, "__class__");
    600     if (type != NULL) {
    601         PyObject *name = PyObject_GetAttrString(type, "__name__");
    602         Py_DECREF(type);
    603         if (name != NULL) {
    604             PyObject *string = PyObject_Str(name);
    605             Py_DECREF(name);
    606             if (string != NULL) {
    607                 PyErr_Format(PyExc_TypeError,
    608                     "don't know how to handle %.400s in error callback",
    609                     PyString_AS_STRING(string));
    610                 Py_DECREF(string);
    611             }
    612         }
    613     }
    614 }
    615 
    616 PyObject *PyCodec_StrictErrors(PyObject *exc)
    617 {
    618     if (PyExceptionInstance_Check(exc))
    619         PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    620     else
    621         PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
    622     return NULL;
    623 }
    624 
    625 
    626 #ifdef Py_USING_UNICODE
    627 PyObject *PyCodec_IgnoreErrors(PyObject *exc)
    628 {
    629     Py_ssize_t end;
    630 
    631     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
    632         if (PyUnicodeEncodeError_GetEnd(exc, &end))
    633             return NULL;
    634     }
    635     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
    636         if (PyUnicodeDecodeError_GetEnd(exc, &end))
    637             return NULL;
    638     }
    639     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
    640         if (PyUnicodeTranslateError_GetEnd(exc, &end))
    641             return NULL;
    642     }
    643     else {
    644         wrong_exception_type(exc);
    645         return NULL;
    646     }
    647     /* ouch: passing NULL, 0, pos gives None instead of u'' */
    648     return Py_BuildValue("(u#n)", &end, 0, end);
    649 }
    650 
    651 
    652 PyObject *PyCodec_ReplaceErrors(PyObject *exc)
    653 {
    654     PyObject *restuple;
    655     Py_ssize_t start;
    656     Py_ssize_t end;
    657     Py_ssize_t i;
    658 
    659     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
    660         PyObject *res;
    661         Py_UNICODE *p;
    662         if (PyUnicodeEncodeError_GetStart(exc, &start))
    663             return NULL;
    664         if (PyUnicodeEncodeError_GetEnd(exc, &end))
    665             return NULL;
    666         res = PyUnicode_FromUnicode(NULL, end-start);
    667         if (res == NULL)
    668             return NULL;
    669         for (p = PyUnicode_AS_UNICODE(res), i = start;
    670             i<end; ++p, ++i)
    671             *p = '?';
    672         restuple = Py_BuildValue("(On)", res, end);
    673         Py_DECREF(res);
    674         return restuple;
    675     }
    676     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
    677         Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER;
    678         if (PyUnicodeDecodeError_GetEnd(exc, &end))
    679             return NULL;
    680         return Py_BuildValue("(u#n)", &res, (Py_ssize_t)1, end);
    681     }
    682     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
    683         PyObject *res;
    684         Py_UNICODE *p;
    685         if (PyUnicodeTranslateError_GetStart(exc, &start))
    686             return NULL;
    687         if (PyUnicodeTranslateError_GetEnd(exc, &end))
    688             return NULL;
    689         res = PyUnicode_FromUnicode(NULL, end-start);
    690         if (res == NULL)
    691             return NULL;
    692         for (p = PyUnicode_AS_UNICODE(res), i = start;
    693             i<end; ++p, ++i)
    694             *p = Py_UNICODE_REPLACEMENT_CHARACTER;
    695         restuple = Py_BuildValue("(On)", res, end);
    696         Py_DECREF(res);
    697         return restuple;
    698     }
    699     else {
    700         wrong_exception_type(exc);
    701         return NULL;
    702     }
    703 }
    704 
    705 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
    706 {
    707     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
    708         PyObject *restuple;
    709         PyObject *object;
    710         Py_ssize_t start;
    711         Py_ssize_t end;
    712         PyObject *res;
    713         Py_UNICODE *p;
    714         Py_UNICODE *startp;
    715         Py_UNICODE *e;
    716         Py_UNICODE *outp;
    717         Py_ssize_t ressize;
    718         if (PyUnicodeEncodeError_GetStart(exc, &start))
    719             return NULL;
    720         if (PyUnicodeEncodeError_GetEnd(exc, &end))
    721             return NULL;
    722         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
    723             return NULL;
    724         startp = PyUnicode_AS_UNICODE(object);
    725         if (end - start > PY_SSIZE_T_MAX / (2+7+1)) {
    726             end = start + PY_SSIZE_T_MAX / (2+7+1);
    727 #ifndef Py_UNICODE_WIDE
    728             if (0xD800 <= startp[end - 1] && startp[end - 1] <= 0xDBFF)
    729                 end--;
    730 #endif
    731         }
    732         e = startp + end;
    733         for (p = startp+start, ressize = 0; p < e;) {
    734             Py_UCS4 ch = *p++;
    735 #ifndef Py_UNICODE_WIDE
    736             if ((0xD800 <= ch && ch <= 0xDBFF) &&
    737                 (p < e) &&
    738                 (0xDC00 <= *p && *p <= 0xDFFF)) {
    739                 ch = ((((ch & 0x03FF) << 10) |
    740                        ((Py_UCS4)*p++ & 0x03FF)) + 0x10000);
    741             }
    742 #endif
    743             if (ch < 10)
    744                 ressize += 2+1+1;
    745             else if (ch < 100)
    746                 ressize += 2+2+1;
    747             else if (ch < 1000)
    748                 ressize += 2+3+1;
    749             else if (ch < 10000)
    750                 ressize += 2+4+1;
    751             else if (ch < 100000)
    752                 ressize += 2+5+1;
    753             else if (ch < 1000000)
    754                 ressize += 2+6+1;
    755             else
    756                 ressize += 2+7+1;
    757         }
    758         /* allocate replacement */
    759         res = PyUnicode_FromUnicode(NULL, ressize);
    760         if (res == NULL) {
    761             Py_DECREF(object);
    762             return NULL;
    763         }
    764         /* generate replacement */
    765         for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); p < e;) {
    766             int digits;
    767             int base;
    768             Py_UCS4 ch = *p++;
    769 #ifndef Py_UNICODE_WIDE
    770             if ((0xD800 <= ch && ch <= 0xDBFF) &&
    771                 (p < startp+end) &&
    772                 (0xDC00 <= *p && *p <= 0xDFFF)) {
    773                 ch = ((((ch & 0x03FF) << 10) |
    774                        ((Py_UCS4)*p++ & 0x03FF)) + 0x10000);
    775             }
    776 #endif
    777             *outp++ = '&';
    778             *outp++ = '#';
    779             if (ch < 10) {
    780                 digits = 1;
    781                 base = 1;
    782             }
    783             else if (ch < 100) {
    784                 digits = 2;
    785                 base = 10;
    786             }
    787             else if (ch < 1000) {
    788                 digits = 3;
    789                 base = 100;
    790             }
    791             else if (ch < 10000) {
    792                 digits = 4;
    793                 base = 1000;
    794             }
    795             else if (ch < 100000) {
    796                 digits = 5;
    797                 base = 10000;
    798             }
    799             else if (ch < 1000000) {
    800                 digits = 6;
    801                 base = 100000;
    802             }
    803             else {
    804                 digits = 7;
    805                 base = 1000000;
    806             }
    807             while (digits-->0) {
    808                 *outp++ = '0' + ch/base;
    809                 ch %= base;
    810                 base /= 10;
    811             }
    812             *outp++ = ';';
    813         }
    814         restuple = Py_BuildValue("(On)", res, end);
    815         Py_DECREF(res);
    816         Py_DECREF(object);
    817         return restuple;
    818     }
    819     else {
    820         wrong_exception_type(exc);
    821         return NULL;
    822     }
    823 }
    824 
    825 static Py_UNICODE hexdigits[] = {
    826     '0', '1', '2', '3', '4', '5', '6', '7',
    827     '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
    828 };
    829 
    830 PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
    831 {
    832     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
    833         PyObject *restuple;
    834         PyObject *object;
    835         Py_ssize_t start;
    836         Py_ssize_t end;
    837         PyObject *res;
    838         Py_UNICODE *p;
    839         Py_UNICODE *startp;
    840         Py_UNICODE *outp;
    841         Py_ssize_t ressize;
    842         if (PyUnicodeEncodeError_GetStart(exc, &start))
    843             return NULL;
    844         if (PyUnicodeEncodeError_GetEnd(exc, &end))
    845             return NULL;
    846         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
    847             return NULL;
    848         if (end - start > PY_SSIZE_T_MAX / (1+1+8))
    849             end = start + PY_SSIZE_T_MAX / (1+1+8);
    850         startp = PyUnicode_AS_UNICODE(object);
    851         for (p = startp+start, ressize = 0; p < startp+end; ++p) {
    852 #ifdef Py_UNICODE_WIDE
    853             if (*p >= 0x00010000)
    854                 ressize += 1+1+8;
    855             else
    856 #endif
    857             if (*p >= 0x100) {
    858                 ressize += 1+1+4;
    859             }
    860             else
    861                 ressize += 1+1+2;
    862         }
    863         res = PyUnicode_FromUnicode(NULL, ressize);
    864         if (res == NULL) {
    865             Py_DECREF(object);
    866             return NULL;
    867         }
    868         for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
    869             p < startp+end; ++p) {
    870             Py_UNICODE c = *p;
    871             *outp++ = '\\';
    872 #ifdef Py_UNICODE_WIDE
    873             if (c >= 0x00010000) {
    874                 *outp++ = 'U';
    875                 *outp++ = hexdigits[(c>>28)&0xf];
    876                 *outp++ = hexdigits[(c>>24)&0xf];
    877                 *outp++ = hexdigits[(c>>20)&0xf];
    878                 *outp++ = hexdigits[(c>>16)&0xf];
    879                 *outp++ = hexdigits[(c>>12)&0xf];
    880                 *outp++ = hexdigits[(c>>8)&0xf];
    881             }
    882             else
    883 #endif
    884             if (c >= 0x100) {
    885                 *outp++ = 'u';
    886                 *outp++ = hexdigits[(c>>12)&0xf];
    887                 *outp++ = hexdigits[(c>>8)&0xf];
    888             }
    889             else
    890                 *outp++ = 'x';
    891             *outp++ = hexdigits[(c>>4)&0xf];
    892             *outp++ = hexdigits[c&0xf];
    893         }
    894 
    895         restuple = Py_BuildValue("(On)", res, end);
    896         Py_DECREF(res);
    897         Py_DECREF(object);
    898         return restuple;
    899     }
    900     else {
    901         wrong_exception_type(exc);
    902         return NULL;
    903     }
    904 }
    905 #endif
    906 
    907 static PyObject *strict_errors(PyObject *self, PyObject *exc)
    908 {
    909     return PyCodec_StrictErrors(exc);
    910 }
    911 
    912 
    913 #ifdef Py_USING_UNICODE
    914 static PyObject *ignore_errors(PyObject *self, PyObject *exc)
    915 {
    916     return PyCodec_IgnoreErrors(exc);
    917 }
    918 
    919 
    920 static PyObject *replace_errors(PyObject *self, PyObject *exc)
    921 {
    922     return PyCodec_ReplaceErrors(exc);
    923 }
    924 
    925 
    926 static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
    927 {
    928     return PyCodec_XMLCharRefReplaceErrors(exc);
    929 }
    930 
    931 
    932 static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
    933 {
    934     return PyCodec_BackslashReplaceErrors(exc);
    935 }
    936 #endif
    937 
    938 static int _PyCodecRegistry_Init(void)
    939 {
    940     static struct {
    941         char *name;
    942         PyMethodDef def;
    943     } methods[] =
    944     {
    945         {
    946             "strict",
    947             {
    948                 "strict_errors",
    949                 strict_errors,
    950                 METH_O,
    951                 PyDoc_STR("Implements the 'strict' error handling, which "
    952                           "raises a UnicodeError on coding errors.")
    953             }
    954         },
    955 #ifdef Py_USING_UNICODE
    956         {
    957             "ignore",
    958             {
    959                 "ignore_errors",
    960                 ignore_errors,
    961                 METH_O,
    962                 PyDoc_STR("Implements the 'ignore' error handling, which "
    963                           "ignores malformed data and continues.")
    964             }
    965         },
    966         {
    967             "replace",
    968             {
    969                 "replace_errors",
    970                 replace_errors,
    971                 METH_O,
    972                 PyDoc_STR("Implements the 'replace' error handling, which "
    973                           "replaces malformed data with a replacement marker.")
    974             }
    975         },
    976         {
    977             "xmlcharrefreplace",
    978             {
    979                 "xmlcharrefreplace_errors",
    980                 xmlcharrefreplace_errors,
    981                 METH_O,
    982                 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
    983                           "which replaces an unencodable character with the "
    984                           "appropriate XML character reference.")
    985             }
    986         },
    987         {
    988             "backslashreplace",
    989             {
    990                 "backslashreplace_errors",
    991                 backslashreplace_errors,
    992                 METH_O,
    993                 PyDoc_STR("Implements the 'backslashreplace' error handling, "
    994                           "which replaces an unencodable character with a "
    995                           "backslashed escape sequence.")
    996             }
    997         }
    998 #endif
    999     };
   1000 
   1001     PyInterpreterState *interp = PyThreadState_GET()->interp;
   1002     PyObject *mod;
   1003     unsigned i;
   1004 
   1005     if (interp->codec_search_path != NULL)
   1006         return 0;
   1007 
   1008     interp->codec_search_path = PyList_New(0);
   1009     interp->codec_search_cache = PyDict_New();
   1010     interp->codec_error_registry = PyDict_New();
   1011 
   1012     if (interp->codec_error_registry) {
   1013         for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) {
   1014             PyObject *func = PyCFunction_New(&methods[i].def, NULL);
   1015             int res;
   1016             if (!func)
   1017                 Py_FatalError("can't initialize codec error registry");
   1018             res = PyCodec_RegisterError(methods[i].name, func);
   1019             Py_DECREF(func);
   1020             if (res)
   1021                 Py_FatalError("can't initialize codec error registry");
   1022         }
   1023     }
   1024 
   1025     if (interp->codec_search_path == NULL ||
   1026         interp->codec_search_cache == NULL ||
   1027         interp->codec_error_registry == NULL)
   1028         Py_FatalError("can't initialize codec registry");
   1029 
   1030     mod = PyImport_ImportModuleLevel("encodings", NULL, NULL, NULL, 0);
   1031     if (mod == NULL) {
   1032         if (PyErr_ExceptionMatches(PyExc_ImportError)) {
   1033             /* Ignore ImportErrors... this is done so that
   1034                distributions can disable the encodings package. Note
   1035                that other errors are not masked, e.g. SystemErrors
   1036                raised to inform the user of an error in the Python
   1037                configuration are still reported back to the user. */
   1038             PyErr_Clear();
   1039             return 0;
   1040         }
   1041         return -1;
   1042     }
   1043     Py_DECREF(mod);
   1044     return 0;
   1045 }
   1046