Home | History | Annotate | Download | only in Modules
      1 /* ------------------------------------------------------------------------
      2 
      3    _codecs -- Provides access to the codec registry and the builtin
      4               codecs.
      5 
      6    This module should never be imported directly. The standard library
      7    module "codecs" wraps this builtin module for use within Python.
      8 
      9    The codec registry is accessible via:
     10 
     11      register(search_function) -> None
     12 
     13      lookup(encoding) -> CodecInfo object
     14 
     15    The builtin Unicode codecs use the following interface:
     16 
     17      <encoding>_encode(Unicode_object[,errors='strict']) ->
     18         (string object, bytes consumed)
     19 
     20      <encoding>_decode(char_buffer_obj[,errors='strict']) ->
     21         (Unicode object, bytes consumed)
     22 
     23    These <encoding>s are available: utf_8, unicode_escape,
     24    raw_unicode_escape, unicode_internal, latin_1, ascii (7-bit),
     25    mbcs (on win32).
     26 
     27 
     28 Written by Marc-Andre Lemburg (mal (at) lemburg.com).
     29 
     30 Copyright (c) Corporation for National Research Initiatives.
     31 
     32    ------------------------------------------------------------------------ */
     33 
     34 #define PY_SSIZE_T_CLEAN
     35 #include "Python.h"
     36 
     37 #ifdef MS_WINDOWS
     38 #include <windows.h>
     39 #endif
     40 
     41 /*[clinic input]
     42 module _codecs
     43 [clinic start generated code]*/
     44 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=e1390e3da3cb9deb]*/
     45 
     46 #include "clinic/_codecsmodule.c.h"
     47 
     48 /* --- Registry ----------------------------------------------------------- */
     49 
     50 /*[clinic input]
     51 _codecs.register
     52     search_function: object
     53     /
     54 
     55 Register a codec search function.
     56 
     57 Search functions are expected to take one argument, the encoding name in
     58 all lower case letters, and either return None, or a tuple of functions
     59 (encoder, decoder, stream_reader, stream_writer) (or a CodecInfo object).
     60 [clinic start generated code]*/
     61 
     62 static PyObject *
     63 _codecs_register(PyObject *module, PyObject *search_function)
     64 /*[clinic end generated code: output=d1bf21e99db7d6d3 input=369578467955cae4]*/
     65 {
     66     if (PyCodec_Register(search_function))
     67         return NULL;
     68 
     69     Py_RETURN_NONE;
     70 }
     71 
     72 /*[clinic input]
     73 _codecs.lookup
     74     encoding: str
     75     /
     76 
     77 Looks up a codec tuple in the Python codec registry and returns a CodecInfo object.
     78 [clinic start generated code]*/
     79 
     80 static PyObject *
     81 _codecs_lookup_impl(PyObject *module, const char *encoding)
     82 /*[clinic end generated code: output=9f0afa572080c36d input=3c572c0db3febe9c]*/
     83 {
     84     return _PyCodec_Lookup(encoding);
     85 }
     86 
     87 /*[clinic input]
     88 _codecs.encode
     89     obj: object
     90     encoding: str(c_default="NULL") = "utf-8"
     91     errors: str(c_default="NULL") = "strict"
     92 
     93 Encodes obj using the codec registered for encoding.
     94 
     95 The default encoding is 'utf-8'.  errors may be given to set a
     96 different error handling scheme.  Default is 'strict' meaning that encoding
     97 errors raise a ValueError.  Other possible values are 'ignore', 'replace'
     98 and 'backslashreplace' as well as any other name registered with
     99 codecs.register_error that can handle ValueErrors.
    100 [clinic start generated code]*/
    101 
    102 static PyObject *
    103 _codecs_encode_impl(PyObject *module, PyObject *obj, const char *encoding,
    104                     const char *errors)
    105 /*[clinic end generated code: output=385148eb9a067c86 input=cd5b685040ff61f0]*/
    106 {
    107     if (encoding == NULL)
    108         encoding = PyUnicode_GetDefaultEncoding();
    109 
    110     /* Encode via the codec registry */
    111     return PyCodec_Encode(obj, encoding, errors);
    112 }
    113 
    114 /*[clinic input]
    115 _codecs.decode
    116     obj: object
    117     encoding: str(c_default="NULL") = "utf-8"
    118     errors: str(c_default="NULL") = "strict"
    119 
    120 Decodes obj using the codec registered for encoding.
    121 
    122 Default encoding is 'utf-8'.  errors may be given to set a
    123 different error handling scheme.  Default is 'strict' meaning that encoding
    124 errors raise a ValueError.  Other possible values are 'ignore', 'replace'
    125 and 'backslashreplace' as well as any other name registered with
    126 codecs.register_error that can handle ValueErrors.
    127 [clinic start generated code]*/
    128 
    129 static PyObject *
    130 _codecs_decode_impl(PyObject *module, PyObject *obj, const char *encoding,
    131                     const char *errors)
    132 /*[clinic end generated code: output=679882417dc3a0bd input=7702c0cc2fa1add6]*/
    133 {
    134     if (encoding == NULL)
    135         encoding = PyUnicode_GetDefaultEncoding();
    136 
    137     /* Decode via the codec registry */
    138     return PyCodec_Decode(obj, encoding, errors);
    139 }
    140 
    141 /* --- Helpers ------------------------------------------------------------ */
    142 
    143 /*[clinic input]
    144 _codecs._forget_codec
    145 
    146     encoding: str
    147     /
    148 
    149 Purge the named codec from the internal codec lookup cache
    150 [clinic start generated code]*/
    151 
    152 static PyObject *
    153 _codecs__forget_codec_impl(PyObject *module, const char *encoding)
    154 /*[clinic end generated code: output=0bde9f0a5b084aa2 input=18d5d92d0e386c38]*/
    155 {
    156     if (_PyCodec_Forget(encoding) < 0) {
    157         return NULL;
    158     };
    159     Py_RETURN_NONE;
    160 }
    161 
    162 static
    163 PyObject *codec_tuple(PyObject *decoded,
    164                       Py_ssize_t len)
    165 {
    166     if (decoded == NULL)
    167         return NULL;
    168     return Py_BuildValue("Nn", decoded, len);
    169 }
    170 
    171 /* --- String codecs ------------------------------------------------------ */
    172 /*[clinic input]
    173 _codecs.escape_decode
    174     data: Py_buffer(accept={str, buffer})
    175     errors: str(accept={str, NoneType}) = NULL
    176     /
    177 [clinic start generated code]*/
    178 
    179 static PyObject *
    180 _codecs_escape_decode_impl(PyObject *module, Py_buffer *data,
    181                            const char *errors)
    182 /*[clinic end generated code: output=505200ba8056979a input=0018edfd99db714d]*/
    183 {
    184     PyObject *decoded = PyBytes_DecodeEscape(data->buf, data->len,
    185                                              errors, 0, NULL);
    186     return codec_tuple(decoded, data->len);
    187 }
    188 
    189 /*[clinic input]
    190 _codecs.escape_encode
    191     data: object(subclass_of='&PyBytes_Type')
    192     errors: str(accept={str, NoneType}) = NULL
    193     /
    194 [clinic start generated code]*/
    195 
    196 static PyObject *
    197 _codecs_escape_encode_impl(PyObject *module, PyObject *data,
    198                            const char *errors)
    199 /*[clinic end generated code: output=4af1d477834bab34 input=da9ded00992f32f2]*/
    200 {
    201     Py_ssize_t size;
    202     Py_ssize_t newsize;
    203     PyObject *v;
    204 
    205     size = PyBytes_GET_SIZE(data);
    206     if (size > PY_SSIZE_T_MAX / 4) {
    207         PyErr_SetString(PyExc_OverflowError,
    208             "string is too large to encode");
    209             return NULL;
    210     }
    211     newsize = 4*size;
    212     v = PyBytes_FromStringAndSize(NULL, newsize);
    213 
    214     if (v == NULL) {
    215         return NULL;
    216     }
    217     else {
    218         Py_ssize_t i;
    219         char c;
    220         char *p = PyBytes_AS_STRING(v);
    221 
    222         for (i = 0; i < size; i++) {
    223             /* There's at least enough room for a hex escape */
    224             assert(newsize - (p - PyBytes_AS_STRING(v)) >= 4);
    225             c = PyBytes_AS_STRING(data)[i];
    226             if (c == '\'' || c == '\\')
    227                 *p++ = '\\', *p++ = c;
    228             else if (c == '\t')
    229                 *p++ = '\\', *p++ = 't';
    230             else if (c == '\n')
    231                 *p++ = '\\', *p++ = 'n';
    232             else if (c == '\r')
    233                 *p++ = '\\', *p++ = 'r';
    234             else if (c < ' ' || c >= 0x7f) {
    235                 *p++ = '\\';
    236                 *p++ = 'x';
    237                 *p++ = Py_hexdigits[(c & 0xf0) >> 4];
    238                 *p++ = Py_hexdigits[c & 0xf];
    239             }
    240             else
    241                 *p++ = c;
    242         }
    243         *p = '\0';
    244         if (_PyBytes_Resize(&v, (p - PyBytes_AS_STRING(v)))) {
    245             return NULL;
    246         }
    247     }
    248 
    249     return codec_tuple(v, size);
    250 }
    251 
    252 /* --- Decoder ------------------------------------------------------------ */
    253 /*[clinic input]
    254 _codecs.unicode_internal_decode
    255     obj: object
    256     errors: str(accept={str, NoneType}) = NULL
    257     /
    258 [clinic start generated code]*/
    259 
    260 static PyObject *
    261 _codecs_unicode_internal_decode_impl(PyObject *module, PyObject *obj,
    262                                      const char *errors)
    263 /*[clinic end generated code: output=edbfe175e09eff9a input=8d57930aeda170c6]*/
    264 {
    265     if (PyUnicode_Check(obj)) {
    266         if (PyUnicode_READY(obj) < 0)
    267             return NULL;
    268         Py_INCREF(obj);
    269         return codec_tuple(obj, PyUnicode_GET_LENGTH(obj));
    270     }
    271     else {
    272         Py_buffer view;
    273         PyObject *result;
    274         if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) != 0)
    275             return NULL;
    276 
    277         result = codec_tuple(
    278                 _PyUnicode_DecodeUnicodeInternal(view.buf, view.len, errors),
    279                 view.len);
    280         PyBuffer_Release(&view);
    281         return result;
    282     }
    283 }
    284 
    285 /*[clinic input]
    286 _codecs.utf_7_decode
    287     data: Py_buffer
    288     errors: str(accept={str, NoneType}) = NULL
    289     final: int(c_default="0") = False
    290     /
    291 [clinic start generated code]*/
    292 
    293 static PyObject *
    294 _codecs_utf_7_decode_impl(PyObject *module, Py_buffer *data,
    295                           const char *errors, int final)
    296 /*[clinic end generated code: output=0cd3a944a32a4089 input=bc4d6247ecdb01e6]*/
    297 {
    298     Py_ssize_t consumed = data->len;
    299     PyObject *decoded = PyUnicode_DecodeUTF7Stateful(data->buf, data->len,
    300                                                      errors,
    301                                                      final ? NULL : &consumed);
    302     return codec_tuple(decoded, consumed);
    303 }
    304 
    305 /*[clinic input]
    306 _codecs.utf_8_decode
    307     data: Py_buffer
    308     errors: str(accept={str, NoneType}) = NULL
    309     final: int(c_default="0") = False
    310     /
    311 [clinic start generated code]*/
    312 
    313 static PyObject *
    314 _codecs_utf_8_decode_impl(PyObject *module, Py_buffer *data,
    315                           const char *errors, int final)
    316 /*[clinic end generated code: output=10f74dec8d9bb8bf input=39161d71e7422ee2]*/
    317 {
    318     Py_ssize_t consumed = data->len;
    319     PyObject *decoded = PyUnicode_DecodeUTF8Stateful(data->buf, data->len,
    320                                                      errors,
    321                                                      final ? NULL : &consumed);
    322     return codec_tuple(decoded, consumed);
    323 }
    324 
    325 /*[clinic input]
    326 _codecs.utf_16_decode
    327     data: Py_buffer
    328     errors: str(accept={str, NoneType}) = NULL
    329     final: int(c_default="0") = False
    330     /
    331 [clinic start generated code]*/
    332 
    333 static PyObject *
    334 _codecs_utf_16_decode_impl(PyObject *module, Py_buffer *data,
    335                            const char *errors, int final)
    336 /*[clinic end generated code: output=783b442abcbcc2d0 input=f3cf01d1461007ce]*/
    337 {
    338     int byteorder = 0;
    339     /* This is overwritten unless final is true. */
    340     Py_ssize_t consumed = data->len;
    341     PyObject *decoded = PyUnicode_DecodeUTF16Stateful(data->buf, data->len,
    342                                                       errors, &byteorder,
    343                                                       final ? NULL : &consumed);
    344     return codec_tuple(decoded, consumed);
    345 }
    346 
    347 /*[clinic input]
    348 _codecs.utf_16_le_decode
    349     data: Py_buffer
    350     errors: str(accept={str, NoneType}) = NULL
    351     final: int(c_default="0") = False
    352     /
    353 [clinic start generated code]*/
    354 
    355 static PyObject *
    356 _codecs_utf_16_le_decode_impl(PyObject *module, Py_buffer *data,
    357                               const char *errors, int final)
    358 /*[clinic end generated code: output=899b9e6364379dcd input=a77e3bf97335d94e]*/
    359 {
    360     int byteorder = -1;
    361     /* This is overwritten unless final is true. */
    362     Py_ssize_t consumed = data->len;
    363     PyObject *decoded = PyUnicode_DecodeUTF16Stateful(data->buf, data->len,
    364                                                       errors, &byteorder,
    365                                                       final ? NULL : &consumed);
    366     return codec_tuple(decoded, consumed);
    367 }
    368 
    369 /*[clinic input]
    370 _codecs.utf_16_be_decode
    371     data: Py_buffer
    372     errors: str(accept={str, NoneType}) = NULL
    373     final: int(c_default="0") = False
    374     /
    375 [clinic start generated code]*/
    376 
    377 static PyObject *
    378 _codecs_utf_16_be_decode_impl(PyObject *module, Py_buffer *data,
    379                               const char *errors, int final)
    380 /*[clinic end generated code: output=49f6465ea07669c8 input=606f69fae91b5563]*/
    381 {
    382     int byteorder = 1;
    383     /* This is overwritten unless final is true. */
    384     Py_ssize_t consumed = data->len;
    385     PyObject *decoded = PyUnicode_DecodeUTF16Stateful(data->buf, data->len,
    386                                                       errors, &byteorder,
    387                                                       final ? NULL : &consumed);
    388     return codec_tuple(decoded, consumed);
    389 }
    390 
    391 /* This non-standard version also provides access to the byteorder
    392    parameter of the builtin UTF-16 codec.
    393 
    394    It returns a tuple (unicode, bytesread, byteorder) with byteorder
    395    being the value in effect at the end of data.
    396 
    397 */
    398 /*[clinic input]
    399 _codecs.utf_16_ex_decode
    400     data: Py_buffer
    401     errors: str(accept={str, NoneType}) = NULL
    402     byteorder: int = 0
    403     final: int(c_default="0") = False
    404     /
    405 [clinic start generated code]*/
    406 
    407 static PyObject *
    408 _codecs_utf_16_ex_decode_impl(PyObject *module, Py_buffer *data,
    409                               const char *errors, int byteorder, int final)
    410 /*[clinic end generated code: output=0f385f251ecc1988 input=f6e7f697658c013e]*/
    411 {
    412     /* This is overwritten unless final is true. */
    413     Py_ssize_t consumed = data->len;
    414 
    415     PyObject *decoded = PyUnicode_DecodeUTF16Stateful(data->buf, data->len,
    416                                                       errors, &byteorder,
    417                                                       final ? NULL : &consumed);
    418     if (decoded == NULL)
    419         return NULL;
    420     return Py_BuildValue("Nni", decoded, consumed, byteorder);
    421 }
    422 
    423 /*[clinic input]
    424 _codecs.utf_32_decode
    425     data: Py_buffer
    426     errors: str(accept={str, NoneType}) = NULL
    427     final: int(c_default="0") = False
    428     /
    429 [clinic start generated code]*/
    430 
    431 static PyObject *
    432 _codecs_utf_32_decode_impl(PyObject *module, Py_buffer *data,
    433                            const char *errors, int final)
    434 /*[clinic end generated code: output=2fc961807f7b145f input=86d4f41c6c2e763d]*/
    435 {
    436     int byteorder = 0;
    437     /* This is overwritten unless final is true. */
    438     Py_ssize_t consumed = data->len;
    439     PyObject *decoded = PyUnicode_DecodeUTF32Stateful(data->buf, data->len,
    440                                                       errors, &byteorder,
    441                                                       final ? NULL : &consumed);
    442     return codec_tuple(decoded, consumed);
    443 }
    444 
    445 /*[clinic input]
    446 _codecs.utf_32_le_decode
    447     data: Py_buffer
    448     errors: str(accept={str, NoneType}) = NULL
    449     final: int(c_default="0") = False
    450     /
    451 [clinic start generated code]*/
    452 
    453 static PyObject *
    454 _codecs_utf_32_le_decode_impl(PyObject *module, Py_buffer *data,
    455                               const char *errors, int final)
    456 /*[clinic end generated code: output=ec8f46b67a94f3e6 input=d18b650772d188ba]*/
    457 {
    458     int byteorder = -1;
    459     /* This is overwritten unless final is true. */
    460     Py_ssize_t consumed = data->len;
    461     PyObject *decoded = PyUnicode_DecodeUTF32Stateful(data->buf, data->len,
    462                                                       errors, &byteorder,
    463                                                       final ? NULL : &consumed);
    464     return codec_tuple(decoded, consumed);
    465 }
    466 
    467 /*[clinic input]
    468 _codecs.utf_32_be_decode
    469     data: Py_buffer
    470     errors: str(accept={str, NoneType}) = NULL
    471     final: int(c_default="0") = False
    472     /
    473 [clinic start generated code]*/
    474 
    475 static PyObject *
    476 _codecs_utf_32_be_decode_impl(PyObject *module, Py_buffer *data,
    477                               const char *errors, int final)
    478 /*[clinic end generated code: output=ff82bae862c92c4e input=19c271b5d34926d8]*/
    479 {
    480     int byteorder = 1;
    481     /* This is overwritten unless final is true. */
    482     Py_ssize_t consumed = data->len;
    483     PyObject *decoded = PyUnicode_DecodeUTF32Stateful(data->buf, data->len,
    484                                                       errors, &byteorder,
    485                                                       final ? NULL : &consumed);
    486     return codec_tuple(decoded, consumed);
    487 }
    488 
    489 /* This non-standard version also provides access to the byteorder
    490    parameter of the builtin UTF-32 codec.
    491 
    492    It returns a tuple (unicode, bytesread, byteorder) with byteorder
    493    being the value in effect at the end of data.
    494 
    495 */
    496 /*[clinic input]
    497 _codecs.utf_32_ex_decode
    498     data: Py_buffer
    499     errors: str(accept={str, NoneType}) = NULL
    500     byteorder: int = 0
    501     final: int(c_default="0") = False
    502     /
    503 [clinic start generated code]*/
    504 
    505 static PyObject *
    506 _codecs_utf_32_ex_decode_impl(PyObject *module, Py_buffer *data,
    507                               const char *errors, int byteorder, int final)
    508 /*[clinic end generated code: output=6bfb177dceaf4848 input=4af3e6ccfe34a076]*/
    509 {
    510     Py_ssize_t consumed = data->len;
    511     PyObject *decoded = PyUnicode_DecodeUTF32Stateful(data->buf, data->len,
    512                                                       errors, &byteorder,
    513                                                       final ? NULL : &consumed);
    514     if (decoded == NULL)
    515         return NULL;
    516     return Py_BuildValue("Nni", decoded, consumed, byteorder);
    517 }
    518 
    519 /*[clinic input]
    520 _codecs.unicode_escape_decode
    521     data: Py_buffer(accept={str, buffer})
    522     errors: str(accept={str, NoneType}) = NULL
    523     /
    524 [clinic start generated code]*/
    525 
    526 static PyObject *
    527 _codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
    528                                    const char *errors)
    529 /*[clinic end generated code: output=3ca3c917176b82ab input=49fd27d06813a7f5]*/
    530 {
    531     PyObject *decoded = PyUnicode_DecodeUnicodeEscape(data->buf, data->len,
    532                                                       errors);
    533     return codec_tuple(decoded, data->len);
    534 }
    535 
    536 /*[clinic input]
    537 _codecs.raw_unicode_escape_decode
    538     data: Py_buffer(accept={str, buffer})
    539     errors: str(accept={str, NoneType}) = NULL
    540     /
    541 [clinic start generated code]*/
    542 
    543 static PyObject *
    544 _codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
    545                                        const char *errors)
    546 /*[clinic end generated code: output=c98eeb56028070a6 input=770903a211434ebc]*/
    547 {
    548     PyObject *decoded = PyUnicode_DecodeRawUnicodeEscape(data->buf, data->len,
    549                                                          errors);
    550     return codec_tuple(decoded, data->len);
    551 }
    552 
    553 /*[clinic input]
    554 _codecs.latin_1_decode
    555     data: Py_buffer
    556     errors: str(accept={str, NoneType}) = NULL
    557     /
    558 [clinic start generated code]*/
    559 
    560 static PyObject *
    561 _codecs_latin_1_decode_impl(PyObject *module, Py_buffer *data,
    562                             const char *errors)
    563 /*[clinic end generated code: output=07f3dfa3f72c7d8f input=5cad0f1759c618ec]*/
    564 {
    565     PyObject *decoded = PyUnicode_DecodeLatin1(data->buf, data->len, errors);
    566     return codec_tuple(decoded, data->len);
    567 }
    568 
    569 /*[clinic input]
    570 _codecs.ascii_decode
    571     data: Py_buffer
    572     errors: str(accept={str, NoneType}) = NULL
    573     /
    574 [clinic start generated code]*/
    575 
    576 static PyObject *
    577 _codecs_ascii_decode_impl(PyObject *module, Py_buffer *data,
    578                           const char *errors)
    579 /*[clinic end generated code: output=2627d72058d42429 input=ad1106f64037bd16]*/
    580 {
    581     PyObject *decoded = PyUnicode_DecodeASCII(data->buf, data->len, errors);
    582     return codec_tuple(decoded, data->len);
    583 }
    584 
    585 /*[clinic input]
    586 _codecs.charmap_decode
    587     data: Py_buffer
    588     errors: str(accept={str, NoneType}) = NULL
    589     mapping: object = NULL
    590     /
    591 [clinic start generated code]*/
    592 
    593 static PyObject *
    594 _codecs_charmap_decode_impl(PyObject *module, Py_buffer *data,
    595                             const char *errors, PyObject *mapping)
    596 /*[clinic end generated code: output=2c335b09778cf895 input=19712ca35c5a80e2]*/
    597 {
    598     PyObject *decoded;
    599 
    600     if (mapping == Py_None)
    601         mapping = NULL;
    602 
    603     decoded = PyUnicode_DecodeCharmap(data->buf, data->len, mapping, errors);
    604     return codec_tuple(decoded, data->len);
    605 }
    606 
    607 #ifdef MS_WINDOWS
    608 
    609 /*[clinic input]
    610 _codecs.mbcs_decode
    611     data: Py_buffer
    612     errors: str(accept={str, NoneType}) = NULL
    613     final: int(c_default="0") = False
    614     /
    615 [clinic start generated code]*/
    616 
    617 static PyObject *
    618 _codecs_mbcs_decode_impl(PyObject *module, Py_buffer *data,
    619                          const char *errors, int final)
    620 /*[clinic end generated code: output=39b65b8598938c4b input=d492c1ca64f4fa8a]*/
    621 {
    622     Py_ssize_t consumed = data->len;
    623     PyObject *decoded = PyUnicode_DecodeMBCSStateful(data->buf, data->len,
    624             errors, final ? NULL : &consumed);
    625     return codec_tuple(decoded, consumed);
    626 }
    627 
    628 /*[clinic input]
    629 _codecs.oem_decode
    630     data: Py_buffer
    631     errors: str(accept={str, NoneType}) = NULL
    632     final: int(c_default="0") = False
    633     /
    634 [clinic start generated code]*/
    635 
    636 static PyObject *
    637 _codecs_oem_decode_impl(PyObject *module, Py_buffer *data,
    638                         const char *errors, int final)
    639 /*[clinic end generated code: output=da1617612f3fcad8 input=95b8a92c446b03cd]*/
    640 {
    641     Py_ssize_t consumed = data->len;
    642     PyObject *decoded = PyUnicode_DecodeCodePageStateful(CP_OEMCP,
    643         data->buf, data->len, errors, final ? NULL : &consumed);
    644     return codec_tuple(decoded, consumed);
    645 }
    646 
    647 /*[clinic input]
    648 _codecs.code_page_decode
    649     codepage: int
    650     data: Py_buffer
    651     errors: str(accept={str, NoneType}) = NULL
    652     final: int(c_default="0") = False
    653     /
    654 [clinic start generated code]*/
    655 
    656 static PyObject *
    657 _codecs_code_page_decode_impl(PyObject *module, int codepage,
    658                               Py_buffer *data, const char *errors, int final)
    659 /*[clinic end generated code: output=53008ea967da3fff input=4f3152a304e21d51]*/
    660 {
    661     Py_ssize_t consumed = data->len;
    662     PyObject *decoded = PyUnicode_DecodeCodePageStateful(codepage,
    663                                                          data->buf, data->len,
    664                                                          errors,
    665                                                          final ? NULL : &consumed);
    666     return codec_tuple(decoded, consumed);
    667 }
    668 
    669 #endif /* MS_WINDOWS */
    670 
    671 /* --- Encoder ------------------------------------------------------------ */
    672 
    673 /*[clinic input]
    674 _codecs.readbuffer_encode
    675     data: Py_buffer(accept={str, buffer})
    676     errors: str(accept={str, NoneType}) = NULL
    677     /
    678 [clinic start generated code]*/
    679 
    680 static PyObject *
    681 _codecs_readbuffer_encode_impl(PyObject *module, Py_buffer *data,
    682                                const char *errors)
    683 /*[clinic end generated code: output=c645ea7cdb3d6e86 input=b7c322b89d4ab923]*/
    684 {
    685     PyObject *result = PyBytes_FromStringAndSize(data->buf, data->len);
    686     return codec_tuple(result, data->len);
    687 }
    688 
    689 /*[clinic input]
    690 _codecs.unicode_internal_encode
    691     obj: object
    692     errors: str(accept={str, NoneType}) = NULL
    693     /
    694 [clinic start generated code]*/
    695 
    696 static PyObject *
    697 _codecs_unicode_internal_encode_impl(PyObject *module, PyObject *obj,
    698                                      const char *errors)
    699 /*[clinic end generated code: output=a72507dde4ea558f input=8628f0280cf5ba61]*/
    700 {
    701     if (PyErr_WarnEx(PyExc_DeprecationWarning,
    702                      "unicode_internal codec has been deprecated",
    703                      1))
    704         return NULL;
    705 
    706     if (PyUnicode_Check(obj)) {
    707         Py_UNICODE *u;
    708         Py_ssize_t len, size;
    709 
    710         if (PyUnicode_READY(obj) < 0)
    711             return NULL;
    712 
    713         u = PyUnicode_AsUnicodeAndSize(obj, &len);
    714         if (u == NULL)
    715             return NULL;
    716         if ((size_t)len > (size_t)PY_SSIZE_T_MAX / sizeof(Py_UNICODE))
    717             return PyErr_NoMemory();
    718         size = len * sizeof(Py_UNICODE);
    719         return codec_tuple(PyBytes_FromStringAndSize((const char*)u, size),
    720                            PyUnicode_GET_LENGTH(obj));
    721     }
    722     else {
    723         Py_buffer view;
    724         PyObject *result;
    725         if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) != 0)
    726             return NULL;
    727         result = codec_tuple(PyBytes_FromStringAndSize(view.buf, view.len),
    728                              view.len);
    729         PyBuffer_Release(&view);
    730         return result;
    731     }
    732 }
    733 
    734 /*[clinic input]
    735 _codecs.utf_7_encode
    736     str: unicode
    737     errors: str(accept={str, NoneType}) = NULL
    738     /
    739 [clinic start generated code]*/
    740 
    741 static PyObject *
    742 _codecs_utf_7_encode_impl(PyObject *module, PyObject *str,
    743                           const char *errors)
    744 /*[clinic end generated code: output=0feda21ffc921bc8 input=d1a47579e79cbe15]*/
    745 {
    746     return codec_tuple(_PyUnicode_EncodeUTF7(str, 0, 0, errors),
    747                        PyUnicode_GET_LENGTH(str));
    748 }
    749 
    750 /*[clinic input]
    751 _codecs.utf_8_encode
    752     str: unicode
    753     errors: str(accept={str, NoneType}) = NULL
    754     /
    755 [clinic start generated code]*/
    756 
    757 static PyObject *
    758 _codecs_utf_8_encode_impl(PyObject *module, PyObject *str,
    759                           const char *errors)
    760 /*[clinic end generated code: output=02bf47332b9c796c input=42e3ba73c4392eef]*/
    761 {
    762     return codec_tuple(_PyUnicode_AsUTF8String(str, errors),
    763                        PyUnicode_GET_LENGTH(str));
    764 }
    765 
    766 /* This version provides access to the byteorder parameter of the
    767    builtin UTF-16 codecs as optional third argument. It defaults to 0
    768    which means: use the native byte order and prepend the data with a
    769    BOM mark.
    770 
    771 */
    772 
    773 /*[clinic input]
    774 _codecs.utf_16_encode
    775     str: unicode
    776     errors: str(accept={str, NoneType}) = NULL
    777     byteorder: int = 0
    778     /
    779 [clinic start generated code]*/
    780 
    781 static PyObject *
    782 _codecs_utf_16_encode_impl(PyObject *module, PyObject *str,
    783                            const char *errors, int byteorder)
    784 /*[clinic end generated code: output=c654e13efa2e64e4 input=ff46416b04edb944]*/
    785 {
    786     return codec_tuple(_PyUnicode_EncodeUTF16(str, errors, byteorder),
    787                        PyUnicode_GET_LENGTH(str));
    788 }
    789 
    790 /*[clinic input]
    791 _codecs.utf_16_le_encode
    792     str: unicode
    793     errors: str(accept={str, NoneType}) = NULL
    794     /
    795 [clinic start generated code]*/
    796 
    797 static PyObject *
    798 _codecs_utf_16_le_encode_impl(PyObject *module, PyObject *str,
    799                               const char *errors)
    800 /*[clinic end generated code: output=431b01e55f2d4995 input=cb385455ea8f2fe0]*/
    801 {
    802     return codec_tuple(_PyUnicode_EncodeUTF16(str, errors, -1),
    803                        PyUnicode_GET_LENGTH(str));
    804 }
    805 
    806 /*[clinic input]
    807 _codecs.utf_16_be_encode
    808     str: unicode
    809     errors: str(accept={str, NoneType}) = NULL
    810     /
    811 [clinic start generated code]*/
    812 
    813 static PyObject *
    814 _codecs_utf_16_be_encode_impl(PyObject *module, PyObject *str,
    815                               const char *errors)
    816 /*[clinic end generated code: output=96886a6fd54dcae3 input=9119997066bdaefd]*/
    817 {
    818     return codec_tuple(_PyUnicode_EncodeUTF16(str, errors, +1),
    819                        PyUnicode_GET_LENGTH(str));
    820 }
    821 
/* This version provides access to the byteorder parameter of the
   builtin UTF-32 codecs as an optional third argument. It defaults to 0,
   which means: use the native byte order and prepend a byte order mark
   (BOM) to the data.

*/
    828 
    829 /*[clinic input]
    830 _codecs.utf_32_encode
    831     str: unicode
    832     errors: str(accept={str, NoneType}) = NULL
    833     byteorder: int = 0
    834     /
    835 [clinic start generated code]*/
    836 
    837 static PyObject *
    838 _codecs_utf_32_encode_impl(PyObject *module, PyObject *str,
    839                            const char *errors, int byteorder)
    840 /*[clinic end generated code: output=5c760da0c09a8b83 input=c5e77da82fbe5c2a]*/
    841 {
    842     return codec_tuple(_PyUnicode_EncodeUTF32(str, errors, byteorder),
    843                        PyUnicode_GET_LENGTH(str));
    844 }
    845 
    846 /*[clinic input]
    847 _codecs.utf_32_le_encode
    848     str: unicode
    849     errors: str(accept={str, NoneType}) = NULL
    850     /
    851 [clinic start generated code]*/
    852 
    853 static PyObject *
    854 _codecs_utf_32_le_encode_impl(PyObject *module, PyObject *str,
    855                               const char *errors)
    856 /*[clinic end generated code: output=b65cd176de8e36d6 input=9993b25fe0877848]*/
    857 {
    858     return codec_tuple(_PyUnicode_EncodeUTF32(str, errors, -1),
    859                        PyUnicode_GET_LENGTH(str));
    860 }
    861 
    862 /*[clinic input]
    863 _codecs.utf_32_be_encode
    864     str: unicode
    865     errors: str(accept={str, NoneType}) = NULL
    866     /
    867 [clinic start generated code]*/
    868 
    869 static PyObject *
    870 _codecs_utf_32_be_encode_impl(PyObject *module, PyObject *str,
    871                               const char *errors)
    872 /*[clinic end generated code: output=1d9e71a9358709e9 input=d3e0ccaa02920431]*/
    873 {
    874     return codec_tuple(_PyUnicode_EncodeUTF32(str, errors, +1),
    875                        PyUnicode_GET_LENGTH(str));
    876 }
    877 
    878 /*[clinic input]
    879 _codecs.unicode_escape_encode
    880     str: unicode
    881     errors: str(accept={str, NoneType}) = NULL
    882     /
    883 [clinic start generated code]*/
    884 
    885 static PyObject *
    886 _codecs_unicode_escape_encode_impl(PyObject *module, PyObject *str,
    887                                    const char *errors)
    888 /*[clinic end generated code: output=66271b30bc4f7a3c input=65d9eefca65b455a]*/
    889 {
    890     return codec_tuple(PyUnicode_AsUnicodeEscapeString(str),
    891                        PyUnicode_GET_LENGTH(str));
    892 }
    893 
    894 /*[clinic input]
    895 _codecs.raw_unicode_escape_encode
    896     str: unicode
    897     errors: str(accept={str, NoneType}) = NULL
    898     /
    899 [clinic start generated code]*/
    900 
    901 static PyObject *
    902 _codecs_raw_unicode_escape_encode_impl(PyObject *module, PyObject *str,
    903                                        const char *errors)
    904 /*[clinic end generated code: output=a66a806ed01c830a input=5aa33e4a133391ab]*/
    905 {
    906     return codec_tuple(PyUnicode_AsRawUnicodeEscapeString(str),
    907                        PyUnicode_GET_LENGTH(str));
    908 }
    909 
    910 /*[clinic input]
    911 _codecs.latin_1_encode
    912     str: unicode
    913     errors: str(accept={str, NoneType}) = NULL
    914     /
    915 [clinic start generated code]*/
    916 
    917 static PyObject *
    918 _codecs_latin_1_encode_impl(PyObject *module, PyObject *str,
    919                             const char *errors)
    920 /*[clinic end generated code: output=2c28c83a27884e08 input=30b11c9e49a65150]*/
    921 {
    922     return codec_tuple(_PyUnicode_AsLatin1String(str, errors),
    923                        PyUnicode_GET_LENGTH(str));
    924 }
    925 
    926 /*[clinic input]
    927 _codecs.ascii_encode
    928     str: unicode
    929     errors: str(accept={str, NoneType}) = NULL
    930     /
    931 [clinic start generated code]*/
    932 
    933 static PyObject *
    934 _codecs_ascii_encode_impl(PyObject *module, PyObject *str,
    935                           const char *errors)
    936 /*[clinic end generated code: output=b5e035182d33befc input=843a1d268e6dfa8e]*/
    937 {
    938     return codec_tuple(_PyUnicode_AsASCIIString(str, errors),
    939                        PyUnicode_GET_LENGTH(str));
    940 }
    941 
    942 /*[clinic input]
    943 _codecs.charmap_encode
    944     str: unicode
    945     errors: str(accept={str, NoneType}) = NULL
    946     mapping: object = NULL
    947     /
    948 [clinic start generated code]*/
    949 
    950 static PyObject *
    951 _codecs_charmap_encode_impl(PyObject *module, PyObject *str,
    952                             const char *errors, PyObject *mapping)
    953 /*[clinic end generated code: output=047476f48495a9e9 input=0752cde07a6d6d00]*/
    954 {
    955     if (mapping == Py_None)
    956         mapping = NULL;
    957 
    958     return codec_tuple(_PyUnicode_EncodeCharmap(str, mapping, errors),
    959                        PyUnicode_GET_LENGTH(str));
    960 }
    961 
    962 /*[clinic input]
    963 _codecs.charmap_build
    964     map: unicode
    965     /
    966 [clinic start generated code]*/
    967 
    968 static PyObject *
    969 _codecs_charmap_build_impl(PyObject *module, PyObject *map)
    970 /*[clinic end generated code: output=bb073c27031db9ac input=d91a91d1717dbc6d]*/
    971 {
    972     return PyUnicode_BuildEncodingMap(map);
    973 }
    974 
    975 #ifdef MS_WINDOWS
    976 
    977 /*[clinic input]
    978 _codecs.mbcs_encode
    979     str: unicode
    980     errors: str(accept={str, NoneType}) = NULL
    981     /
    982 [clinic start generated code]*/
    983 
    984 static PyObject *
    985 _codecs_mbcs_encode_impl(PyObject *module, PyObject *str, const char *errors)
    986 /*[clinic end generated code: output=76e2e170c966c080 input=de471e0815947553]*/
    987 {
    988     return codec_tuple(PyUnicode_EncodeCodePage(CP_ACP, str, errors),
    989                        PyUnicode_GET_LENGTH(str));
    990 }
    991 
    992 /*[clinic input]
    993 _codecs.oem_encode
    994     str: unicode
    995     errors: str(accept={str, NoneType}) = NULL
    996     /
    997 [clinic start generated code]*/
    998 
    999 static PyObject *
   1000 _codecs_oem_encode_impl(PyObject *module, PyObject *str, const char *errors)
   1001 /*[clinic end generated code: output=65d5982c737de649 input=3fc5f0028aad3cda]*/
   1002 {
   1003     return codec_tuple(PyUnicode_EncodeCodePage(CP_OEMCP, str, errors),
   1004         PyUnicode_GET_LENGTH(str));
   1005 }
   1006 
   1007 /*[clinic input]
   1008 _codecs.code_page_encode
   1009     code_page: int
   1010     str: unicode
   1011     errors: str(accept={str, NoneType}) = NULL
   1012     /
   1013 [clinic start generated code]*/
   1014 
   1015 static PyObject *
   1016 _codecs_code_page_encode_impl(PyObject *module, int code_page, PyObject *str,
   1017                               const char *errors)
   1018 /*[clinic end generated code: output=45673f6085657a9e input=786421ae617d680b]*/
   1019 {
   1020     return codec_tuple(PyUnicode_EncodeCodePage(code_page, str, errors),
   1021                        PyUnicode_GET_LENGTH(str));
   1022 }
   1023 
   1024 #endif /* MS_WINDOWS */
   1025 
   1026 /* --- Error handler registry --------------------------------------------- */
   1027 
   1028 /*[clinic input]
   1029 _codecs.register_error
   1030     errors: str
   1031     handler: object
   1032     /
   1033 
   1034 Register the specified error handler under the name errors.
   1035 
   1036 handler must be a callable object, that will be called with an exception
   1037 instance containing information about the location of the encoding/decoding
   1038 error and must return a (replacement, new position) tuple.
   1039 [clinic start generated code]*/
   1040 
   1041 static PyObject *
   1042 _codecs_register_error_impl(PyObject *module, const char *errors,
   1043                             PyObject *handler)
   1044 /*[clinic end generated code: output=fa2f7d1879b3067d input=5e6709203c2e33fe]*/
   1045 {
   1046     if (PyCodec_RegisterError(errors, handler))
   1047         return NULL;
   1048     Py_RETURN_NONE;
   1049 }
   1050 
   1051 /*[clinic input]
   1052 _codecs.lookup_error
   1053     name: str
   1054     /
   1055 
   1056 lookup_error(errors) -> handler
   1057 
   1058 Return the error handler for the specified error handling name or raise a
   1059 LookupError, if no handler exists under this name.
   1060 [clinic start generated code]*/
   1061 
   1062 static PyObject *
   1063 _codecs_lookup_error_impl(PyObject *module, const char *name)
   1064 /*[clinic end generated code: output=087f05dc0c9a98cc input=4775dd65e6235aba]*/
   1065 {
   1066     return PyCodec_LookupError(name);
   1067 }
   1068 
   1069 /* --- Module API --------------------------------------------------------- */
   1070 
   1071 static PyMethodDef _codecs_functions[] = {
   1072     _CODECS_REGISTER_METHODDEF
   1073     _CODECS_LOOKUP_METHODDEF
   1074     _CODECS_ENCODE_METHODDEF
   1075     _CODECS_DECODE_METHODDEF
   1076     _CODECS_ESCAPE_ENCODE_METHODDEF
   1077     _CODECS_ESCAPE_DECODE_METHODDEF
   1078     _CODECS_UTF_8_ENCODE_METHODDEF
   1079     _CODECS_UTF_8_DECODE_METHODDEF
   1080     _CODECS_UTF_7_ENCODE_METHODDEF
   1081     _CODECS_UTF_7_DECODE_METHODDEF
   1082     _CODECS_UTF_16_ENCODE_METHODDEF
   1083     _CODECS_UTF_16_LE_ENCODE_METHODDEF
   1084     _CODECS_UTF_16_BE_ENCODE_METHODDEF
   1085     _CODECS_UTF_16_DECODE_METHODDEF
   1086     _CODECS_UTF_16_LE_DECODE_METHODDEF
   1087     _CODECS_UTF_16_BE_DECODE_METHODDEF
   1088     _CODECS_UTF_16_EX_DECODE_METHODDEF
   1089     _CODECS_UTF_32_ENCODE_METHODDEF
   1090     _CODECS_UTF_32_LE_ENCODE_METHODDEF
   1091     _CODECS_UTF_32_BE_ENCODE_METHODDEF
   1092     _CODECS_UTF_32_DECODE_METHODDEF
   1093     _CODECS_UTF_32_LE_DECODE_METHODDEF
   1094     _CODECS_UTF_32_BE_DECODE_METHODDEF
   1095     _CODECS_UTF_32_EX_DECODE_METHODDEF
   1096     _CODECS_UNICODE_ESCAPE_ENCODE_METHODDEF
   1097     _CODECS_UNICODE_ESCAPE_DECODE_METHODDEF
   1098     _CODECS_UNICODE_INTERNAL_ENCODE_METHODDEF
   1099     _CODECS_UNICODE_INTERNAL_DECODE_METHODDEF
   1100     _CODECS_RAW_UNICODE_ESCAPE_ENCODE_METHODDEF
   1101     _CODECS_RAW_UNICODE_ESCAPE_DECODE_METHODDEF
   1102     _CODECS_LATIN_1_ENCODE_METHODDEF
   1103     _CODECS_LATIN_1_DECODE_METHODDEF
   1104     _CODECS_ASCII_ENCODE_METHODDEF
   1105     _CODECS_ASCII_DECODE_METHODDEF
   1106     _CODECS_CHARMAP_ENCODE_METHODDEF
   1107     _CODECS_CHARMAP_DECODE_METHODDEF
   1108     _CODECS_CHARMAP_BUILD_METHODDEF
   1109     _CODECS_READBUFFER_ENCODE_METHODDEF
   1110     _CODECS_MBCS_ENCODE_METHODDEF
   1111     _CODECS_MBCS_DECODE_METHODDEF
   1112     _CODECS_OEM_ENCODE_METHODDEF
   1113     _CODECS_OEM_DECODE_METHODDEF
   1114     _CODECS_CODE_PAGE_ENCODE_METHODDEF
   1115     _CODECS_CODE_PAGE_DECODE_METHODDEF
   1116     _CODECS_REGISTER_ERROR_METHODDEF
   1117     _CODECS_LOOKUP_ERROR_METHODDEF
   1118     _CODECS__FORGET_CODEC_METHODDEF
   1119     {NULL, NULL}                /* sentinel */
   1120 };
   1121 
   1122 static struct PyModuleDef codecsmodule = {
   1123         PyModuleDef_HEAD_INIT,
   1124         "_codecs",
   1125         NULL,
   1126         -1,
   1127         _codecs_functions,
   1128         NULL,
   1129         NULL,
   1130         NULL,
   1131         NULL
   1132 };
   1133 
   1134 PyMODINIT_FUNC
   1135 PyInit__codecs(void)
   1136 {
   1137         return PyModule_Create(&codecsmodule);
   1138 }
   1139