Home | History | Annotate | Download | only in Objects
      1 /*
      2 
      3 Unicode implementation based on original code by Fredrik Lundh,
      4 modified by Marc-Andre Lemburg <mal (at) lemburg.com>.
      5 
      6 Major speed upgrades to the method implementations at the Reykjavik
      7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
      8 
      9 Copyright (c) Corporation for National Research Initiatives.
     10 
     11 --------------------------------------------------------------------
     12 The original string type implementation is:
     13 
     14   Copyright (c) 1999 by Secret Labs AB
     15   Copyright (c) 1999 by Fredrik Lundh
     16 
     17 By obtaining, using, and/or copying this software and/or its
     18 associated documentation, you agree that you have read, understood,
     19 and will comply with the following terms and conditions:
     20 
     21 Permission to use, copy, modify, and distribute this software and its
     22 associated documentation for any purpose and without fee is hereby
     23 granted, provided that the above copyright notice appears in all
     24 copies, and that both that copyright notice and this permission notice
     25 appear in supporting documentation, and that the name of Secret Labs
     26 AB or the author not be used in advertising or publicity pertaining to
     27 distribution of the software without specific, written prior
     28 permission.
     29 
     30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
     31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
     32 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
     33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
     36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     37 --------------------------------------------------------------------
     38 
     39 */
     40 
     41 #define PY_SSIZE_T_CLEAN
     42 #include "Python.h"
     43 #include "ucnhash.h"
     44 #include "bytes_methods.h"
     45 #include "stringlib/eq.h"
     46 
     47 #ifdef MS_WINDOWS
     48 #include <windows.h>
     49 #endif
     50 
     51 /*[clinic input]
     52 class str "PyUnicodeObject *" "&PyUnicode_Type"
     53 [clinic start generated code]*/
     54 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
     55 
     56 /* --- Globals ------------------------------------------------------------
     57 
     58 NOTE: In the interpreter's initialization phase, some globals are currently
     59       initialized dynamically as needed. In the process Unicode objects may
     60       be created before the Unicode type is ready.
     61 
     62 */
     63 
     64 
     65 #ifdef __cplusplus
     66 extern "C" {
     67 #endif
     68 
     /* Largest valid code point, U+10FFFF (1,114,111), as of Unicode 6.0. */
     #define MAX_UNICODE 0x10ffff

     /* Object check used by the asserting macros below: debug builds run the
        consistency checker (without the content scan), other builds only test
        the type. */
     #ifndef Py_DEBUG
     #  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
     #else
     #  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
     #endif

     /* --- Unchecked field accessors -------------------------------------
        Each one expands to a plain member access on the object cast to the
        struct that declares the field. */
     #define _PyUnicode_LENGTH(op) \
         (((PyASCIIObject *)(op))->length)
     #define _PyUnicode_STATE(op) \
         (((PyASCIIObject *)(op))->state)
     #define _PyUnicode_HASH(op) \
         (((PyASCIIObject *)(op))->hash)
     #define _PyUnicode_WSTR(op) \
         (((PyASCIIObject*)(op))->wstr)
     #define _PyUnicode_WSTR_LENGTH(op) \
         (((PyCompactUnicodeObject*)(op))->wstr_length)
     #define _PyUnicode_UTF8(op) \
         (((PyCompactUnicodeObject*)(op))->utf8)
     #define _PyUnicode_UTF8_LENGTH(op) \
         (((PyCompactUnicodeObject*)(op))->utf8_length)
     #define _PyUnicode_DATA_ANY(op) \
         (((PyUnicodeObject*)(op))->data.any)

     /* --- Checked accessors ---------------------------------------------
        These assert _PyUnicode_CHECK(op) before touching the object. */
     #define _PyUnicode_KIND(op) \
         (assert(_PyUnicode_CHECK(op)), \
          ((PyASCIIObject *)(op))->state.kind)
     #define _PyUnicode_GET_LENGTH(op) \
         (assert(_PyUnicode_CHECK(op)), \
          ((PyASCIIObject *)(op))->length)

     /* UTF-8 buffer of a ready string.  For a compact ASCII object this is
        the memory located directly after the PyASCIIObject header; otherwise
        it is the utf8 field. */
     #define PyUnicode_UTF8(op) \
         (assert(_PyUnicode_CHECK(op)), \
          assert(PyUnicode_IS_READY(op)), \
          PyUnicode_IS_COMPACT_ASCII(op) ? \
              ((char*)((PyASCIIObject*)(op) + 1)) : \
              _PyUnicode_UTF8(op))

     /* Length of the UTF-8 buffer of a ready string.  For a compact ASCII
        object this is the string length; otherwise the utf8_length field. */
     #define PyUnicode_UTF8_LENGTH(op) \
         (assert(_PyUnicode_CHECK(op)), \
          assert(PyUnicode_IS_READY(op)), \
          PyUnicode_IS_COMPACT_ASCII(op) ? \
              ((PyASCIIObject*)(op))->length : \
              _PyUnicode_UTF8_LENGTH(op))
    112 
    /* File-local replacement for PyUnicode_READY: asserts the object check,
       evaluates to 0 when the string is already ready and otherwise to the
       result of _PyUnicode_Ready(op). */
    #undef PyUnicode_READY
    #define PyUnicode_READY(op) \
        (assert(_PyUnicode_CHECK(op)), \
         (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))

    /* True if the utf8 pointer of `op` is the same address as its character
       data.  Must not be used on compact ASCII objects (asserted). */
    #define _PyUnicode_SHARE_UTF8(op) \
        (assert(_PyUnicode_CHECK(op)), \
         assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
         (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
    /* True if the wstr pointer of `op` is the same address as its character
       data, i.e. the object holds no separate wchar_t buffer.
       The expansion refers only to the macro argument; it must not depend on
       a variable named `unicode` being in scope at the call site. */
    #define _PyUnicode_SHARE_WSTR(op)                       \
        (assert(_PyUnicode_CHECK(op)),                      \
         (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
    127 
    /* Non-zero when the object has its own UTF-8 block: it is not compact
       ASCII, the utf8 pointer is set, and that pointer is not the character
       data itself (i.e. it is not shared with other data). */
    #define _PyUnicode_HAS_UTF8_MEMORY(op) \
        (!PyUnicode_IS_COMPACT_ASCII(op) \
         && _PyUnicode_UTF8(op) \
         && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

    /* Non-zero when the object has its own wstr block: the wstr pointer is
       set and either the string is not ready yet or the pointer is not the
       character data itself (i.e. it is not shared with other data). */
    #define _PyUnicode_HAS_WSTR_MEMORY(op) \
        (_PyUnicode_WSTR(op) \
         && (!PyUnicode_IS_READY(op) \
             || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
    141 
    /* Element-wise widening/narrowing copy between character buffers.

       from_type / to_type  valid type names of the source and target elements
       begin, end           delimit the source range ("from_type *")
       to                   start of the destination buffer ("to_type *")

       Each source element is cast to to_type and stored.  The bulk of the
       range is handled four elements per iteration; the remaining 0..3
       elements are copied one at a time. */
    #define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
        do { \
            to_type *_dst = (to_type *)(to); \
            const from_type *_src = (from_type *)(begin); \
            const from_type *_src_end = (from_type *)(end); \
            Py_ssize_t _count = (_src_end) - (_src); \
            const from_type *_quad_end = \
                _src + _Py_SIZE_ROUND_DOWN(_count, 4); \
            for (; _src < (_quad_end); _src += 4, _dst += 4) { \
                _dst[0] = (to_type) _src[0]; \
                _dst[1] = (to_type) _src[1]; \
                _dst[2] = (to_type) _src[2]; \
                _dst[3] = (to_type) _src[3]; \
            } \
            for (; _src < (_src_end); _src++, _dst++) \
                *_dst = (to_type) *_src; \
        } while (0)
    165 
    /* Platform-dependent overallocation divisor. */
    #ifndef MS_WINDOWS
       /* On Linux, overallocating by 25% works best */
    #  define OVERALLOCATE_FACTOR 4
    #else
       /* On Windows, overallocating by 50% works best */
    #  define OVERALLOCATE_FACTOR 2
    #endif
    173 
    174 /* This dictionary holds all interned unicode strings.  Note that references
    175    to strings in this dictionary are *not* counted in the string's ob_refcnt.
    176    When the interned string reaches a refcnt of 0 the string deallocation
    177    function will delete the reference from this dictionary.
    178 
    179    Another way to look at this is to say that the actual reference
    180    count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
    181 */
    182 static PyObject *interned = NULL;
    183 
    184 /* The empty Unicode object is shared to improve performance. */
    185 static PyObject *unicode_empty = NULL;
    186 
    /* Take a new reference to the shared empty string, creating it with
       PyUnicode_New(0, 0) on first use.  If that creation fails,
       unicode_empty is left NULL and no reference is taken. */
    #define _Py_INCREF_UNICODE_EMPTY() \
        do { \
            if (unicode_empty == NULL) { \
                unicode_empty = PyUnicode_New(0, 0); \
                if (unicode_empty != NULL) { \
                    Py_INCREF(unicode_empty); \
                    assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
                } \
            } \
            else { \
                Py_INCREF(unicode_empty); \
            } \
        } while (0)

    /* Return the shared empty string from the enclosing function after
       taking a reference via _Py_INCREF_UNICODE_EMPTY(); the returned value
       is NULL when the empty string could not be created. */
    #define _Py_RETURN_UNICODE_EMPTY() \
        do { \
            _Py_INCREF_UNICODE_EMPTY(); \
            return unicode_empty; \
        } while (0)
    205 
    206 /* Forward declaration */
    207 static inline int
    208 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
    209 
    210 /* List of static strings. */
    211 static _Py_Identifier *static_strings = NULL;
    212 
    213 /* Single character Unicode strings in the Latin-1 range are being
    214    shared as well. */
    215 static PyObject *unicode_latin1[256] = {NULL};
    216 
    /* Fast detection of the most frequent whitespace characters.
       128-entry table indexed by ASCII code: 1 for whitespace, 0 otherwise
       (every entry not listed below is zero-initialized). */
    const unsigned char _Py_ascii_whitespace[128] = {
        [0x09] = 1,     /* CHARACTER TABULATION */
        [0x0A] = 1,     /* LINE FEED */
        [0x0B] = 1,     /* LINE TABULATION */
        [0x0C] = 1,     /* FORM FEED */
        [0x0D] = 1,     /* CARRIAGE RETURN */
        [0x1C] = 1,     /* FILE SEPARATOR */
        [0x1D] = 1,     /* GROUP SEPARATOR */
        [0x1E] = 1,     /* RECORD SEPARATOR */
        [0x1F] = 1,     /* UNIT SEPARATOR */
        [0x20] = 1      /* SPACE */
    };
    247 
    248 /* forward */
    249 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
    250 static PyObject* get_latin1_char(unsigned char ch);
    251 static int unicode_modifiable(PyObject *unicode);
    252 
    253 
    254 static PyObject *
    255 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
    256 static PyObject *
    257 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
    258 static PyObject *
    259 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
    260 
    261 static PyObject *
    262 unicode_encode_call_errorhandler(const char *errors,
    263        PyObject **errorHandler,const char *encoding, const char *reason,
    264        PyObject *unicode, PyObject **exceptionObject,
    265        Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
    266 
    267 static void
    268 raise_encode_exception(PyObject **exceptionObject,
    269                        const char *encoding,
    270                        PyObject *unicode,
    271                        Py_ssize_t startpos, Py_ssize_t endpos,
    272                        const char *reason);
    273 
    /* Fast detection of line break characters.
       128-entry table indexed by ASCII code: 1 for a line break, 0 otherwise
       (every entry not listed below is zero-initialized). */
    static const unsigned char ascii_linebreak[128] = {
        [0x0A] = 1,     /* LINE FEED */
        [0x0B] = 1,     /* LINE TABULATION */
        [0x0C] = 1,     /* FORM FEED */
        [0x0D] = 1,     /* CARRIAGE RETURN */
        [0x1C] = 1,     /* FILE SEPARATOR */
        [0x1D] = 1,     /* GROUP SEPARATOR */
        [0x1E] = 1      /* RECORD SEPARATOR */
    };
    301 
    302 #include "clinic/unicodeobject.c.h"
    303 
    /* Codec error handlers recognised by name.  _Py_ERROR_UNKNOWN is the
       zero value and is never returned by get_error_handler();
       _Py_ERROR_OTHER stands for any name not listed here. */
    typedef enum {
        _Py_ERROR_UNKNOWN=0,
        _Py_ERROR_STRICT,
        _Py_ERROR_SURROGATEESCAPE,
        _Py_ERROR_REPLACE,
        _Py_ERROR_IGNORE,
        _Py_ERROR_BACKSLASHREPLACE,
        _Py_ERROR_SURROGATEPASS,
        _Py_ERROR_XMLCHARREFREPLACE,
        _Py_ERROR_OTHER
    } _Py_error_handler;

    /* Map an error handler name to its _Py_error_handler value.

       errors: NUL-terminated handler name, or NULL.

       Returns _Py_ERROR_STRICT for NULL or "strict", the matching enum value
       for the other built-in names (exact, case-sensitive comparison), and
       _Py_ERROR_OTHER for anything else. */
    static _Py_error_handler
    get_error_handler(const char *errors)
    {
        static const struct {
            const char *name;
            _Py_error_handler handler;
        } builtin_handlers[] = {
            {"strict",            _Py_ERROR_STRICT},
            {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
            {"replace",           _Py_ERROR_REPLACE},
            {"ignore",            _Py_ERROR_IGNORE},
            {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
            {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
            {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
        };
        size_t idx;

        /* No handler given means the default, strict. */
        if (errors == NULL) {
            return _Py_ERROR_STRICT;
        }
        for (idx = 0;
             idx < sizeof(builtin_handlers) / sizeof(builtin_handlers[0]);
             idx++)
        {
            if (strcmp(errors, builtin_handlers[idx].name) == 0) {
                return builtin_handlers[idx].handler;
            }
        }
        return _Py_ERROR_OTHER;
    }
    342 
    343 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
    344    This function is kept for backward compatibility with the old API. */
    345 Py_UNICODE
    346 PyUnicode_GetMax(void)
    347 {
    348 #ifdef Py_UNICODE_WIDE
    349     return 0x10FFFF;
    350 #else
    351     /* This is actually an illegal character, so it should
    352        not be passed to unichr. */
    353     return 0xFFFF;
    354 #endif
    355 }
    356 
    357 #ifdef Py_DEBUG
        /* Consistency checker for a str object; compiled only when Py_DEBUG
           is defined.

           op:            object to check; must pass PyUnicode_Check().
           check_content: when non-zero (and the kind is not
                          PyUnicode_WCHAR_KIND) the characters are scanned as
                          well, to verify that the narrowest possible kind is
                          in use and that the buffer ends with a 0 character.

           Every violated invariant trips an assert().  The function itself
           always returns 1, which lets callers wrap the call in assert(). */
    358 int
    359 _PyUnicode_CheckConsistency(PyObject *op, int check_content)
    360 {
    361     PyASCIIObject *ascii;
    362     unsigned int kind;
    363 
    364     assert(PyUnicode_Check(op));
    365 
    366     ascii = (PyASCIIObject *)op;
    367     kind = ascii->state.kind;
    368 
    369     if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
                /* compact ASCII: always 1-byte kind and ready */
    370         assert(kind == PyUnicode_1BYTE_KIND);
    371         assert(ascii->state.ready == 1);
    372     }
    373     else {
    374         PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    375         void *data;
    376 
    377         if (ascii->state.compact == 1) {
                    /* compact, non-ASCII: the character data starts right after
                       the PyCompactUnicodeObject header and is never used as
                       the utf8 buffer */
    378             data = compact + 1;
    379             assert(kind == PyUnicode_1BYTE_KIND
    380                    || kind == PyUnicode_2BYTE_KIND
    381                    || kind == PyUnicode_4BYTE_KIND);
    382             assert(ascii->state.ascii == 0);
    383             assert(ascii->state.ready == 1);
    384             assert (compact->utf8 != data);
    385         }
    386         else {
                    /* non-compact: the character data is reached through
                       data.any */
    387             PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    388 
    389             data = unicode->data.any;
    390             if (kind == PyUnicode_WCHAR_KIND) {
                        /* not ready yet: only wstr is set; length, hash, flags,
                           data and utf8 must still hold their initial values */
    391                 assert(ascii->length == 0);
    392                 assert(ascii->hash == -1);
    393                 assert(ascii->state.compact == 0);
    394                 assert(ascii->state.ascii == 0);
    395                 assert(ascii->state.ready == 0);
    396                 assert(ascii->state.interned == SSTATE_NOT_INTERNED);
    397                 assert(ascii->wstr != NULL);
    398                 assert(data == NULL);
    399                 assert(compact->utf8 == NULL);
    400             }
    401             else {
                        /* ready, non-compact: utf8 aliases the data exactly
                           when the string is ASCII */
    402                 assert(kind == PyUnicode_1BYTE_KIND
    403                        || kind == PyUnicode_2BYTE_KIND
    404                        || kind == PyUnicode_4BYTE_KIND);
    405                 assert(ascii->state.compact == 0);
    406                 assert(ascii->state.ready == 1);
    407                 assert(data != NULL);
    408                 if (ascii->state.ascii) {
    409                     assert (compact->utf8 == data);
    410                     assert (compact->utf8_length == ascii->length);
    411                 }
    412                 else
    413                     assert (compact->utf8 != data);
    414             }
    415         }
                /* For ready strings, wstr must alias the data when the kind has
                   the same width as wchar_t (2-byte kind if SIZEOF_WCHAR_T is
                   2, 4-byte kind otherwise) and must not alias it for any other
                   kind. */
    416         if (kind != PyUnicode_WCHAR_KIND) {
    417             if (
    418 #if SIZEOF_WCHAR_T == 2
    419                 kind == PyUnicode_2BYTE_KIND
    420 #else
    421                 kind == PyUnicode_4BYTE_KIND
    422 #endif
    423                )
    424             {
    425                 assert(ascii->wstr == data);
    426                 assert(compact->wstr_length == ascii->length);
    427             } else
    428                 assert(ascii->wstr != data);
    429         }
    430 
                /* a missing buffer must come with a zero length */
    431         if (compact->utf8 == NULL)
    432             assert(compact->utf8_length == 0);
    433         if (ascii->wstr == NULL)
    434             assert(compact->wstr_length == 0);
    435     }
    436     /* check that the best kind is used */
    437     if (check_content && kind != PyUnicode_WCHAR_KIND)
    438     {
    439         Py_ssize_t i;
    440         Py_UCS4 maxchar = 0;
    441         void *data;
    442         Py_UCS4 ch;
    443 
    444         data = PyUnicode_DATA(ascii);
                /* find the largest code point stored in the string */
    445         for (i=0; i < ascii->length; i++)
    446         {
    447             ch = PyUnicode_READ(kind, data, i);
    448             if (ch > maxchar)
    449                 maxchar = ch;
    450         }
                /* the largest code point must fall in the range that requires
                   exactly this kind (and, for 1-byte, this ascii flag) */
    451         if (kind == PyUnicode_1BYTE_KIND) {
    452             if (ascii->state.ascii == 0) {
    453                 assert(maxchar >= 128);
    454                 assert(maxchar <= 255);
    455             }
    456             else
    457                 assert(maxchar < 128);
    458         }
    459         else if (kind == PyUnicode_2BYTE_KIND) {
    460             assert(maxchar >= 0x100);
    461             assert(maxchar <= 0xFFFF);
    462         }
    463         else {
    464             assert(maxchar >= 0x10000);
    465             assert(maxchar <= MAX_UNICODE);
    466         }
                /* the element just past the last character must be 0 */
    467         assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    468     }
    469     return 1;
    470 }
    471 #endif
    472 
    473 static PyObject*
    474 unicode_result_wchar(PyObject *unicode)
    475 {
    476 #ifndef Py_DEBUG
    477     Py_ssize_t len;
    478 
    479     len = _PyUnicode_WSTR_LENGTH(unicode);
    480     if (len == 0) {
    481         Py_DECREF(unicode);
    482         _Py_RETURN_UNICODE_EMPTY();
    483     }
    484 
    485     if (len == 1) {
    486         wchar_t ch = _PyUnicode_WSTR(unicode)[0];
    487         if ((Py_UCS4)ch < 256) {
    488             PyObject *latin1_char = get_latin1_char((unsigned char)ch);
    489             Py_DECREF(unicode);
    490             return latin1_char;
    491         }
    492     }
    493 
    494     if (_PyUnicode_Ready(unicode) < 0) {
    495         Py_DECREF(unicode);
    496         return NULL;
    497     }
    498 #else
    499     assert(Py_REFCNT(unicode) == 1);
    500 
    501     /* don't make the result ready in debug mode to ensure that the caller
    502        makes the string ready before using it */
    503     assert(_PyUnicode_CheckConsistency(unicode, 1));
    504 #endif
    505     return unicode;
    506 }
    507 
    508 static PyObject*
    509 unicode_result_ready(PyObject *unicode)
    510 {
    511     Py_ssize_t length;
    512 
    513     length = PyUnicode_GET_LENGTH(unicode);
    514     if (length == 0) {
    515         if (unicode != unicode_empty) {
    516             Py_DECREF(unicode);
    517             _Py_RETURN_UNICODE_EMPTY();
    518         }
    519         return unicode_empty;
    520     }
    521 
    522     if (length == 1) {
    523         void *data = PyUnicode_DATA(unicode);
    524         int kind = PyUnicode_KIND(unicode);
    525         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
    526         if (ch < 256) {
    527             PyObject *latin1_char = unicode_latin1[ch];
    528             if (latin1_char != NULL) {
    529                 if (unicode != latin1_char) {
    530                     Py_INCREF(latin1_char);
    531                     Py_DECREF(unicode);
    532                 }
    533                 return latin1_char;
    534             }
    535             else {
    536                 assert(_PyUnicode_CheckConsistency(unicode, 1));
    537                 Py_INCREF(unicode);
    538                 unicode_latin1[ch] = unicode;
    539                 return unicode;
    540             }
    541         }
    542     }
    543 
    544     assert(_PyUnicode_CheckConsistency(unicode, 1));
    545     return unicode;
    546 }
    547 
    548 static PyObject*
    549 unicode_result(PyObject *unicode)
    550 {
    551     assert(_PyUnicode_CHECK(unicode));
    552     if (PyUnicode_IS_READY(unicode))
    553         return unicode_result_ready(unicode);
    554     else
    555         return unicode_result_wchar(unicode);
    556 }
    557 
    558 static PyObject*
    559 unicode_result_unchanged(PyObject *unicode)
    560 {
    561     if (PyUnicode_CheckExact(unicode)) {
    562         if (PyUnicode_READY(unicode) == -1)
    563             return NULL;
    564         Py_INCREF(unicode);
    565         return unicode;
    566     }
    567     else
    568         /* Subtype -- return genuine unicode string with the same value. */
    569         return _PyUnicode_Copy(unicode);
    570 }
    571 
    572 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
    573    ASCII, Latin1, UTF-8, etc. */
    574 static char*
    575 backslashreplace(_PyBytesWriter *writer, char *str,
    576                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
    577 {
    578     Py_ssize_t size, i;
    579     Py_UCS4 ch;
    580     enum PyUnicode_Kind kind;
    581     void *data;
    582 
    583     assert(PyUnicode_IS_READY(unicode));
    584     kind = PyUnicode_KIND(unicode);
    585     data = PyUnicode_DATA(unicode);
    586 
    587     size = 0;
    588     /* determine replacement size */
    589     for (i = collstart; i < collend; ++i) {
    590         Py_ssize_t incr;
    591 
    592         ch = PyUnicode_READ(kind, data, i);
    593         if (ch < 0x100)
    594             incr = 2+2;
    595         else if (ch < 0x10000)
    596             incr = 2+4;
    597         else {
    598             assert(ch <= MAX_UNICODE);
    599             incr = 2+8;
    600         }
    601         if (size > PY_SSIZE_T_MAX - incr) {
    602             PyErr_SetString(PyExc_OverflowError,
    603                             "encoded result is too long for a Python string");
    604             return NULL;
    605         }
    606         size += incr;
    607     }
    608 
    609     str = _PyBytesWriter_Prepare(writer, str, size);
    610     if (str == NULL)
    611         return NULL;
    612 
    613     /* generate replacement */
    614     for (i = collstart; i < collend; ++i) {
    615         ch = PyUnicode_READ(kind, data, i);
    616         *str++ = '\\';
    617         if (ch >= 0x00010000) {
    618             *str++ = 'U';
    619             *str++ = Py_hexdigits[(ch>>28)&0xf];
    620             *str++ = Py_hexdigits[(ch>>24)&0xf];
    621             *str++ = Py_hexdigits[(ch>>20)&0xf];
    622             *str++ = Py_hexdigits[(ch>>16)&0xf];
    623             *str++ = Py_hexdigits[(ch>>12)&0xf];
    624             *str++ = Py_hexdigits[(ch>>8)&0xf];
    625         }
    626         else if (ch >= 0x100) {
    627             *str++ = 'u';
    628             *str++ = Py_hexdigits[(ch>>12)&0xf];
    629             *str++ = Py_hexdigits[(ch>>8)&0xf];
    630         }
    631         else
    632             *str++ = 'x';
    633         *str++ = Py_hexdigits[(ch>>4)&0xf];
    634         *str++ = Py_hexdigits[ch&0xf];
    635     }
    636     return str;
    637 }
    638 
    639 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
    640    ASCII, Latin1, UTF-8, etc. */
    641 static char*
    642 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
    643                   PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
    644 {
    645     Py_ssize_t size, i;
    646     Py_UCS4 ch;
    647     enum PyUnicode_Kind kind;
    648     void *data;
    649 
    650     assert(PyUnicode_IS_READY(unicode));
    651     kind = PyUnicode_KIND(unicode);
    652     data = PyUnicode_DATA(unicode);
    653 
    654     size = 0;
    655     /* determine replacement size */
    656     for (i = collstart; i < collend; ++i) {
    657         Py_ssize_t incr;
    658 
    659         ch = PyUnicode_READ(kind, data, i);
    660         if (ch < 10)
    661             incr = 2+1+1;
    662         else if (ch < 100)
    663             incr = 2+2+1;
    664         else if (ch < 1000)
    665             incr = 2+3+1;
    666         else if (ch < 10000)
    667             incr = 2+4+1;
    668         else if (ch < 100000)
    669             incr = 2+5+1;
    670         else if (ch < 1000000)
    671             incr = 2+6+1;
    672         else {
    673             assert(ch <= MAX_UNICODE);
    674             incr = 2+7+1;
    675         }
    676         if (size > PY_SSIZE_T_MAX - incr) {
    677             PyErr_SetString(PyExc_OverflowError,
    678                             "encoded result is too long for a Python string");
    679             return NULL;
    680         }
    681         size += incr;
    682     }
    683 
    684     str = _PyBytesWriter_Prepare(writer, str, size);
    685     if (str == NULL)
    686         return NULL;
    687 
    688     /* generate replacement */
    689     for (i = collstart; i < collend; ++i) {
    690         str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
    691     }
    692     return str;
    693 }
    694 
    695 /* --- Bloom Filters ----------------------------------------------------- */
    696 
    697 /* stuff to implement simple "bloom filters" for Unicode characters.
    698    to keep things simple, we use a single bitmask, using the low
    699    log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
    700 
    701 /* the linebreak mask is set up by Unicode_Init below */
    702 
    703 #if LONG_BIT >= 128
    704 #define BLOOM_WIDTH 128
    705 #elif LONG_BIT >= 64
    706 #define BLOOM_WIDTH 64
    707 #elif LONG_BIT >= 32
    708 #define BLOOM_WIDTH 32
    709 #else
    710 #error "LONG_BIT is smaller than 32"
    711 #endif
    712 
    713 #define BLOOM_MASK unsigned long
    714 
    715 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
    716 
    /* Non-zero if the bit selected by the low bits of `ch` is set in `mask`,
       i.e. `ch` may be in the set the mask was built from; zero means it is
       definitely not.  Both arguments are parenthesized so that compound
       expressions such as `a | b` can be passed as the mask. */
    #define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

    /* Line break test: table lookup for code points below 128, otherwise the
       bloom mask is consulted before the full Py_UNICODE_ISLINEBREAK() test. */
    #define BLOOM_LINEBREAK(ch)                                             \
        ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
         (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
    722 
    723 static inline BLOOM_MASK
    724 make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
    725 {
    726 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
    727     do {                                               \
    728         TYPE *data = (TYPE *)PTR;                      \
    729         TYPE *end = data + LEN;                        \
    730         Py_UCS4 ch;                                    \
    731         for (; data != end; data++) {                  \
    732             ch = *data;                                \
    733             MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
    734         }                                              \
    735         break;                                         \
    736     } while (0)
    737 
    738     /* calculate simple bloom-style bitmask for a given unicode string */
    739 
    740     BLOOM_MASK mask;
    741 
    742     mask = 0;
    743     switch (kind) {
    744     case PyUnicode_1BYTE_KIND:
    745         BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
    746         break;
    747     case PyUnicode_2BYTE_KIND:
    748         BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
    749         break;
    750     case PyUnicode_4BYTE_KIND:
    751         BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
    752         break;
    753     default:
    754         assert(0);
    755     }
    756     return mask;
    757 
    758 #undef BLOOM_UPDATE
    759 }
    760 
    761 static int
    762 ensure_unicode(PyObject *obj)
    763 {
    764     if (!PyUnicode_Check(obj)) {
    765         PyErr_Format(PyExc_TypeError,
    766                      "must be str, not %.100s",
    767                      Py_TYPE(obj)->tp_name);
    768         return -1;
    769     }
    770     return PyUnicode_READY(obj);
    771 }
    772 
    773 /* Compilation of templated routines */
    774 
    775 #include "stringlib/asciilib.h"
    776 #include "stringlib/fastsearch.h"
    777 #include "stringlib/partition.h"
    778 #include "stringlib/split.h"
    779 #include "stringlib/count.h"
    780 #include "stringlib/find.h"
    781 #include "stringlib/find_max_char.h"
    782 #include "stringlib/localeutil.h"
    783 #include "stringlib/undef.h"
    784 
    785 #include "stringlib/ucs1lib.h"
    786 #include "stringlib/fastsearch.h"
    787 #include "stringlib/partition.h"
    788 #include "stringlib/split.h"
    789 #include "stringlib/count.h"
    790 #include "stringlib/find.h"
    791 #include "stringlib/replace.h"
    792 #include "stringlib/find_max_char.h"
    793 #include "stringlib/localeutil.h"
    794 #include "stringlib/undef.h"
    795 
    796 #include "stringlib/ucs2lib.h"
    797 #include "stringlib/fastsearch.h"
    798 #include "stringlib/partition.h"
    799 #include "stringlib/split.h"
    800 #include "stringlib/count.h"
    801 #include "stringlib/find.h"
    802 #include "stringlib/replace.h"
    803 #include "stringlib/find_max_char.h"
    804 #include "stringlib/localeutil.h"
    805 #include "stringlib/undef.h"
    806 
    807 #include "stringlib/ucs4lib.h"
    808 #include "stringlib/fastsearch.h"
    809 #include "stringlib/partition.h"
    810 #include "stringlib/split.h"
    811 #include "stringlib/count.h"
    812 #include "stringlib/find.h"
    813 #include "stringlib/replace.h"
    814 #include "stringlib/find_max_char.h"
    815 #include "stringlib/localeutil.h"
    816 #include "stringlib/undef.h"
    817 
    818 #include "stringlib/unicodedefs.h"
    819 #include "stringlib/fastsearch.h"
    820 #include "stringlib/count.h"
    821 #include "stringlib/find.h"
    822 #include "stringlib/undef.h"
    823 
    824 /* --- Unicode Object ----------------------------------------------------- */
    825 
    826 static PyObject *
    827 fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
    828 
    829 static inline Py_ssize_t
    830 findchar(const void *s, int kind,
    831          Py_ssize_t size, Py_UCS4 ch,
    832          int direction)
    833 {
    834     switch (kind) {
    835     case PyUnicode_1BYTE_KIND:
    836         if ((Py_UCS1) ch != ch)
    837             return -1;
    838         if (direction > 0)
    839             return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
    840         else
    841             return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
    842     case PyUnicode_2BYTE_KIND:
    843         if ((Py_UCS2) ch != ch)
    844             return -1;
    845         if (direction > 0)
    846             return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
    847         else
    848             return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
    849     case PyUnicode_4BYTE_KIND:
    850         if (direction > 0)
    851             return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
    852         else
    853             return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
    854     default:
    855         assert(0);
    856         return -1;
    857     }
    858 }
    859 
    860 #ifdef Py_DEBUG
    861 /* Fill the data of a Unicode string with invalid characters to detect bugs
    862    earlier.
    863 
    864    _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
    865    ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
    866    invalid character in Unicode 6.0. */
    867 static void
    868 unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
    869 {
    870     int kind = PyUnicode_KIND(unicode);
    871     Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
    872     Py_ssize_t length = _PyUnicode_LENGTH(unicode);
    873     if (length <= old_length)
    874         return;
    875     memset(data + old_length * kind, 0xff, (length - old_length) * kind);
    876 }
    877 #endif
    878 
    879 static PyObject*
    880 resize_compact(PyObject *unicode, Py_ssize_t length)
    881 {
    882     Py_ssize_t char_size;
    883     Py_ssize_t struct_size;
    884     Py_ssize_t new_size;
    885     int share_wstr;
    886     PyObject *new_unicode;
    887 #ifdef Py_DEBUG
    888     Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
    889 #endif
    890 
    891     assert(unicode_modifiable(unicode));
    892     assert(PyUnicode_IS_READY(unicode));
    893     assert(PyUnicode_IS_COMPACT(unicode));
    894 
    895     char_size = PyUnicode_KIND(unicode);
    896     if (PyUnicode_IS_ASCII(unicode))
    897         struct_size = sizeof(PyASCIIObject);
    898     else
    899         struct_size = sizeof(PyCompactUnicodeObject);
    900     share_wstr = _PyUnicode_SHARE_WSTR(unicode);
    901 
    902     if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
    903         PyErr_NoMemory();
    904         return NULL;
    905     }
    906     new_size = (struct_size + (length + 1) * char_size);
    907 
    908     if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
    909         PyObject_DEL(_PyUnicode_UTF8(unicode));
    910         _PyUnicode_UTF8(unicode) = NULL;
    911         _PyUnicode_UTF8_LENGTH(unicode) = 0;
    912     }
    913     _Py_DEC_REFTOTAL;
    914     _Py_ForgetReference(unicode);
    915 
    916     new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
    917     if (new_unicode == NULL) {
    918         _Py_NewReference(unicode);
    919         PyErr_NoMemory();
    920         return NULL;
    921     }
    922     unicode = new_unicode;
    923     _Py_NewReference(unicode);
    924 
    925     _PyUnicode_LENGTH(unicode) = length;
    926     if (share_wstr) {
    927         _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
    928         if (!PyUnicode_IS_ASCII(unicode))
    929             _PyUnicode_WSTR_LENGTH(unicode) = length;
    930     }
    931     else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
    932         PyObject_DEL(_PyUnicode_WSTR(unicode));
    933         _PyUnicode_WSTR(unicode) = NULL;
    934         if (!PyUnicode_IS_ASCII(unicode))
    935             _PyUnicode_WSTR_LENGTH(unicode) = 0;
    936     }
    937 #ifdef Py_DEBUG
    938     unicode_fill_invalid(unicode, old_length);
    939 #endif
    940     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
    941                     length, 0);
    942     assert(_PyUnicode_CheckConsistency(unicode, 0));
    943     return unicode;
    944 }
    945 
    946 static int
    947 resize_inplace(PyObject *unicode, Py_ssize_t length)
    948 {
    949     wchar_t *wstr;
    950     Py_ssize_t new_size;
    951     assert(!PyUnicode_IS_COMPACT(unicode));
    952     assert(Py_REFCNT(unicode) == 1);
    953 
    954     if (PyUnicode_IS_READY(unicode)) {
    955         Py_ssize_t char_size;
    956         int share_wstr, share_utf8;
    957         void *data;
    958 #ifdef Py_DEBUG
    959         Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
    960 #endif
    961 
    962         data = _PyUnicode_DATA_ANY(unicode);
    963         char_size = PyUnicode_KIND(unicode);
    964         share_wstr = _PyUnicode_SHARE_WSTR(unicode);
    965         share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
    966 
    967         if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
    968             PyErr_NoMemory();
    969             return -1;
    970         }
    971         new_size = (length + 1) * char_size;
    972 
    973         if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
    974         {
    975             PyObject_DEL(_PyUnicode_UTF8(unicode));
    976             _PyUnicode_UTF8(unicode) = NULL;
    977             _PyUnicode_UTF8_LENGTH(unicode) = 0;
    978         }
    979 
    980         data = (PyObject *)PyObject_REALLOC(data, new_size);
    981         if (data == NULL) {
    982             PyErr_NoMemory();
    983             return -1;
    984         }
    985         _PyUnicode_DATA_ANY(unicode) = data;
    986         if (share_wstr) {
    987             _PyUnicode_WSTR(unicode) = data;
    988             _PyUnicode_WSTR_LENGTH(unicode) = length;
    989         }
    990         if (share_utf8) {
    991             _PyUnicode_UTF8(unicode) = data;
    992             _PyUnicode_UTF8_LENGTH(unicode) = length;
    993         }
    994         _PyUnicode_LENGTH(unicode) = length;
    995         PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
    996 #ifdef Py_DEBUG
    997         unicode_fill_invalid(unicode, old_length);
    998 #endif
    999         if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
   1000             assert(_PyUnicode_CheckConsistency(unicode, 0));
   1001             return 0;
   1002         }
   1003     }
   1004     assert(_PyUnicode_WSTR(unicode) != NULL);
   1005 
   1006     /* check for integer overflow */
   1007     if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
   1008         PyErr_NoMemory();
   1009         return -1;
   1010     }
   1011     new_size = sizeof(wchar_t) * (length + 1);
   1012     wstr =  _PyUnicode_WSTR(unicode);
   1013     wstr = PyObject_REALLOC(wstr, new_size);
   1014     if (!wstr) {
   1015         PyErr_NoMemory();
   1016         return -1;
   1017     }
   1018     _PyUnicode_WSTR(unicode) = wstr;
   1019     _PyUnicode_WSTR(unicode)[length] = 0;
   1020     _PyUnicode_WSTR_LENGTH(unicode) = length;
   1021     assert(_PyUnicode_CheckConsistency(unicode, 0));
   1022     return 0;
   1023 }
   1024 
   1025 static PyObject*
   1026 resize_copy(PyObject *unicode, Py_ssize_t length)
   1027 {
   1028     Py_ssize_t copy_length;
   1029     if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
   1030         PyObject *copy;
   1031 
   1032         if (PyUnicode_READY(unicode) == -1)
   1033             return NULL;
   1034 
   1035         copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
   1036         if (copy == NULL)
   1037             return NULL;
   1038 
   1039         copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
   1040         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
   1041         return copy;
   1042     }
   1043     else {
   1044         PyObject *w;
   1045 
   1046         w = (PyObject*)_PyUnicode_New(length);
   1047         if (w == NULL)
   1048             return NULL;
   1049         copy_length = _PyUnicode_WSTR_LENGTH(unicode);
   1050         copy_length = Py_MIN(copy_length, length);
   1051         memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
   1052                   copy_length * sizeof(wchar_t));
   1053         return w;
   1054     }
   1055 }
   1056 
   1057 /* We allocate one more byte to make sure the string is
   1058    Ux0000 terminated; some code (e.g. new_identifier)
   1059    relies on that.
   1060 
   1061    XXX This allocator could further be enhanced by assuring that the
   1062    free list never reduces its size below 1.
   1063 
   1064 */
   1065 
   1066 static PyUnicodeObject *
   1067 _PyUnicode_New(Py_ssize_t length)
   1068 {
   1069     PyUnicodeObject *unicode;
   1070     size_t new_size;
   1071 
   1072     /* Optimization for empty strings */
   1073     if (length == 0 && unicode_empty != NULL) {
   1074         Py_INCREF(unicode_empty);
   1075         return (PyUnicodeObject*)unicode_empty;
   1076     }
   1077 
   1078     /* Ensure we won't overflow the size. */
   1079     if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
   1080         return (PyUnicodeObject *)PyErr_NoMemory();
   1081     }
   1082     if (length < 0) {
   1083         PyErr_SetString(PyExc_SystemError,
   1084                         "Negative size passed to _PyUnicode_New");
   1085         return NULL;
   1086     }
   1087 
   1088     unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
   1089     if (unicode == NULL)
   1090         return NULL;
   1091     new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
   1092 
   1093     _PyUnicode_WSTR_LENGTH(unicode) = length;
   1094     _PyUnicode_HASH(unicode) = -1;
   1095     _PyUnicode_STATE(unicode).interned = 0;
   1096     _PyUnicode_STATE(unicode).kind = 0;
   1097     _PyUnicode_STATE(unicode).compact = 0;
   1098     _PyUnicode_STATE(unicode).ready = 0;
   1099     _PyUnicode_STATE(unicode).ascii = 0;
   1100     _PyUnicode_DATA_ANY(unicode) = NULL;
   1101     _PyUnicode_LENGTH(unicode) = 0;
   1102     _PyUnicode_UTF8(unicode) = NULL;
   1103     _PyUnicode_UTF8_LENGTH(unicode) = 0;
   1104 
   1105     _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
   1106     if (!_PyUnicode_WSTR(unicode)) {
   1107         Py_DECREF(unicode);
   1108         PyErr_NoMemory();
   1109         return NULL;
   1110     }
   1111 
   1112     /* Initialize the first element to guard against cases where
   1113      * the caller fails before initializing str -- unicode_resize()
   1114      * reads str[0], and the Keep-Alive optimization can keep memory
   1115      * allocated for str alive across a call to unicode_dealloc(unicode).
   1116      * We don't want unicode_resize to read uninitialized memory in
   1117      * that case.
   1118      */
   1119     _PyUnicode_WSTR(unicode)[0] = 0;
   1120     _PyUnicode_WSTR(unicode)[length] = 0;
   1121 
   1122     assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
   1123     return unicode;
   1124 }
   1125 
   1126 static const char*
   1127 unicode_kind_name(PyObject *unicode)
   1128 {
   1129     /* don't check consistency: unicode_kind_name() is called from
   1130        _PyUnicode_Dump() */
   1131     if (!PyUnicode_IS_COMPACT(unicode))
   1132     {
   1133         if (!PyUnicode_IS_READY(unicode))
   1134             return "wstr";
   1135         switch (PyUnicode_KIND(unicode))
   1136         {
   1137         case PyUnicode_1BYTE_KIND:
   1138             if (PyUnicode_IS_ASCII(unicode))
   1139                 return "legacy ascii";
   1140             else
   1141                 return "legacy latin1";
   1142         case PyUnicode_2BYTE_KIND:
   1143             return "legacy UCS2";
   1144         case PyUnicode_4BYTE_KIND:
   1145             return "legacy UCS4";
   1146         default:
   1147             return "<legacy invalid kind>";
   1148         }
   1149     }
   1150     assert(PyUnicode_IS_READY(unicode));
   1151     switch (PyUnicode_KIND(unicode)) {
   1152     case PyUnicode_1BYTE_KIND:
   1153         if (PyUnicode_IS_ASCII(unicode))
   1154             return "ascii";
   1155         else
   1156             return "latin1";
   1157     case PyUnicode_2BYTE_KIND:
   1158         return "UCS2";
   1159     case PyUnicode_4BYTE_KIND:
   1160         return "UCS4";
   1161     default:
   1162         return "<invalid compact kind>";
   1163     }
   1164 }
   1165 
   1166 #ifdef Py_DEBUG
   1167 /* Functions wrapping macros for use in debugger */
   1168 char *_PyUnicode_utf8(void *unicode){
   1169     return PyUnicode_UTF8(unicode);
   1170 }
   1171 
   1172 void *_PyUnicode_compact_data(void *unicode) {
   1173     return _PyUnicode_COMPACT_DATA(unicode);
   1174 }
   1175 void *_PyUnicode_data(void *unicode){
   1176     printf("obj %p\n", unicode);
   1177     printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
   1178     printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
   1179     printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
   1180     printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
   1181     printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
   1182     return PyUnicode_DATA(unicode);
   1183 }
   1184 
   1185 void
   1186 _PyUnicode_Dump(PyObject *op)
   1187 {
   1188     PyASCIIObject *ascii = (PyASCIIObject *)op;
   1189     PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
   1190     PyUnicodeObject *unicode = (PyUnicodeObject *)op;
   1191     void *data;
   1192 
   1193     if (ascii->state.compact)
   1194     {
   1195         if (ascii->state.ascii)
   1196             data = (ascii + 1);
   1197         else
   1198             data = (compact + 1);
   1199     }
   1200     else
   1201         data = unicode->data.any;
   1202     printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
   1203            unicode_kind_name(op), ascii->length);
   1204 
   1205     if (ascii->wstr == data)
   1206         printf("shared ");
   1207     printf("wstr=%p", ascii->wstr);
   1208 
   1209     if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
   1210         printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
   1211         if (!ascii->state.compact && compact->utf8 == unicode->data.any)
   1212             printf("shared ");
   1213         printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
   1214                compact->utf8, compact->utf8_length);
   1215     }
   1216     printf(", data=%p\n", data);
   1217 }
   1218 #endif
   1219 
   1220 PyObject *
   1221 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
   1222 {
   1223     PyObject *obj;
   1224     PyCompactUnicodeObject *unicode;
   1225     void *data;
   1226     enum PyUnicode_Kind kind;
   1227     int is_sharing, is_ascii;
   1228     Py_ssize_t char_size;
   1229     Py_ssize_t struct_size;
   1230 
   1231     /* Optimization for empty strings */
   1232     if (size == 0 && unicode_empty != NULL) {
   1233         Py_INCREF(unicode_empty);
   1234         return unicode_empty;
   1235     }
   1236 
   1237     is_ascii = 0;
   1238     is_sharing = 0;
   1239     struct_size = sizeof(PyCompactUnicodeObject);
   1240     if (maxchar < 128) {
   1241         kind = PyUnicode_1BYTE_KIND;
   1242         char_size = 1;
   1243         is_ascii = 1;
   1244         struct_size = sizeof(PyASCIIObject);
   1245     }
   1246     else if (maxchar < 256) {
   1247         kind = PyUnicode_1BYTE_KIND;
   1248         char_size = 1;
   1249     }
   1250     else if (maxchar < 65536) {
   1251         kind = PyUnicode_2BYTE_KIND;
   1252         char_size = 2;
   1253         if (sizeof(wchar_t) == 2)
   1254             is_sharing = 1;
   1255     }
   1256     else {
   1257         if (maxchar > MAX_UNICODE) {
   1258             PyErr_SetString(PyExc_SystemError,
   1259                             "invalid maximum character passed to PyUnicode_New");
   1260             return NULL;
   1261         }
   1262         kind = PyUnicode_4BYTE_KIND;
   1263         char_size = 4;
   1264         if (sizeof(wchar_t) == 4)
   1265             is_sharing = 1;
   1266     }
   1267 
   1268     /* Ensure we won't overflow the size. */
   1269     if (size < 0) {
   1270         PyErr_SetString(PyExc_SystemError,
   1271                         "Negative size passed to PyUnicode_New");
   1272         return NULL;
   1273     }
   1274     if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
   1275         return PyErr_NoMemory();
   1276 
   1277     /* Duplicated allocation code from _PyObject_New() instead of a call to
   1278      * PyObject_New() so we are able to allocate space for the object and
   1279      * it's data buffer.
   1280      */
   1281     obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
   1282     if (obj == NULL)
   1283         return PyErr_NoMemory();
   1284     obj = PyObject_INIT(obj, &PyUnicode_Type);
   1285     if (obj == NULL)
   1286         return NULL;
   1287 
   1288     unicode = (PyCompactUnicodeObject *)obj;
   1289     if (is_ascii)
   1290         data = ((PyASCIIObject*)obj) + 1;
   1291     else
   1292         data = unicode + 1;
   1293     _PyUnicode_LENGTH(unicode) = size;
   1294     _PyUnicode_HASH(unicode) = -1;
   1295     _PyUnicode_STATE(unicode).interned = 0;
   1296     _PyUnicode_STATE(unicode).kind = kind;
   1297     _PyUnicode_STATE(unicode).compact = 1;
   1298     _PyUnicode_STATE(unicode).ready = 1;
   1299     _PyUnicode_STATE(unicode).ascii = is_ascii;
   1300     if (is_ascii) {
   1301         ((char*)data)[size] = 0;
   1302         _PyUnicode_WSTR(unicode) = NULL;
   1303     }
   1304     else if (kind == PyUnicode_1BYTE_KIND) {
   1305         ((char*)data)[size] = 0;
   1306         _PyUnicode_WSTR(unicode) = NULL;
   1307         _PyUnicode_WSTR_LENGTH(unicode) = 0;
   1308         unicode->utf8 = NULL;
   1309         unicode->utf8_length = 0;
   1310     }
   1311     else {
   1312         unicode->utf8 = NULL;
   1313         unicode->utf8_length = 0;
   1314         if (kind == PyUnicode_2BYTE_KIND)
   1315             ((Py_UCS2*)data)[size] = 0;
   1316         else /* kind == PyUnicode_4BYTE_KIND */
   1317             ((Py_UCS4*)data)[size] = 0;
   1318         if (is_sharing) {
   1319             _PyUnicode_WSTR_LENGTH(unicode) = size;
   1320             _PyUnicode_WSTR(unicode) = (wchar_t *)data;
   1321         }
   1322         else {
   1323             _PyUnicode_WSTR_LENGTH(unicode) = 0;
   1324             _PyUnicode_WSTR(unicode) = NULL;
   1325         }
   1326     }
   1327 #ifdef Py_DEBUG
   1328     unicode_fill_invalid((PyObject*)unicode, 0);
   1329 #endif
   1330     assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
   1331     return obj;
   1332 }
   1333 
   1334 #if SIZEOF_WCHAR_T == 2
   1335 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   1336    will decode surrogate pairs, the other conversions are implemented as macros
   1337    for efficiency.
   1338 
   1339    This function assumes that unicode can hold one more code point than wstr
   1340    characters for a terminating null character. */
   1341 static void
   1342 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
   1343                               PyObject *unicode)
   1344 {
   1345     const wchar_t *iter;
   1346     Py_UCS4 *ucs4_out;
   1347 
   1348     assert(unicode != NULL);
   1349     assert(_PyUnicode_CHECK(unicode));
   1350     assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
   1351     ucs4_out = PyUnicode_4BYTE_DATA(unicode);
   1352 
   1353     for (iter = begin; iter < end; ) {
   1354         assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
   1355                            _PyUnicode_GET_LENGTH(unicode)));
   1356         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
   1357             && (iter+1) < end
   1358             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
   1359         {
   1360             *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
   1361             iter += 2;
   1362         }
   1363         else {
   1364             *ucs4_out++ = *iter;
   1365             iter++;
   1366         }
   1367     }
   1368     assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
   1369                         _PyUnicode_GET_LENGTH(unicode)));
   1370 
   1371 }
   1372 #endif
   1373 
   1374 static int
   1375 unicode_check_modifiable(PyObject *unicode)
   1376 {
   1377     if (!unicode_modifiable(unicode)) {
   1378         PyErr_SetString(PyExc_SystemError,
   1379                         "Cannot modify a string currently used");
   1380         return -1;
   1381     }
   1382     return 0;
   1383 }
   1384 
   1385 static int
   1386 _copy_characters(PyObject *to, Py_ssize_t to_start,
   1387                  PyObject *from, Py_ssize_t from_start,
   1388                  Py_ssize_t how_many, int check_maxchar)
   1389 {
   1390     unsigned int from_kind, to_kind;
   1391     void *from_data, *to_data;
   1392 
   1393     assert(0 <= how_many);
   1394     assert(0 <= from_start);
   1395     assert(0 <= to_start);
   1396     assert(PyUnicode_Check(from));
   1397     assert(PyUnicode_IS_READY(from));
   1398     assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
   1399 
   1400     assert(PyUnicode_Check(to));
   1401     assert(PyUnicode_IS_READY(to));
   1402     assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
   1403 
   1404     if (how_many == 0)
   1405         return 0;
   1406 
   1407     from_kind = PyUnicode_KIND(from);
   1408     from_data = PyUnicode_DATA(from);
   1409     to_kind = PyUnicode_KIND(to);
   1410     to_data = PyUnicode_DATA(to);
   1411 
   1412 #ifdef Py_DEBUG
   1413     if (!check_maxchar
   1414         && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
   1415     {
   1416         const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
   1417         Py_UCS4 ch;
   1418         Py_ssize_t i;
   1419         for (i=0; i < how_many; i++) {
   1420             ch = PyUnicode_READ(from_kind, from_data, from_start + i);
   1421             assert(ch <= to_maxchar);
   1422         }
   1423     }
   1424 #endif
   1425 
   1426     if (from_kind == to_kind) {
   1427         if (check_maxchar
   1428             && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
   1429         {
   1430             /* Writing Latin-1 characters into an ASCII string requires to
   1431                check that all written characters are pure ASCII */
   1432             Py_UCS4 max_char;
   1433             max_char = ucs1lib_find_max_char(from_data,
   1434                                              (Py_UCS1*)from_data + how_many);
   1435             if (max_char >= 128)
   1436                 return -1;
   1437         }
   1438         memcpy((char*)to_data + to_kind * to_start,
   1439                   (char*)from_data + from_kind * from_start,
   1440                   to_kind * how_many);
   1441     }
   1442     else if (from_kind == PyUnicode_1BYTE_KIND
   1443              && to_kind == PyUnicode_2BYTE_KIND)
   1444     {
   1445         _PyUnicode_CONVERT_BYTES(
   1446             Py_UCS1, Py_UCS2,
   1447             PyUnicode_1BYTE_DATA(from) + from_start,
   1448             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
   1449             PyUnicode_2BYTE_DATA(to) + to_start
   1450             );
   1451     }
   1452     else if (from_kind == PyUnicode_1BYTE_KIND
   1453              && to_kind == PyUnicode_4BYTE_KIND)
   1454     {
   1455         _PyUnicode_CONVERT_BYTES(
   1456             Py_UCS1, Py_UCS4,
   1457             PyUnicode_1BYTE_DATA(from) + from_start,
   1458             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
   1459             PyUnicode_4BYTE_DATA(to) + to_start
   1460             );
   1461     }
   1462     else if (from_kind == PyUnicode_2BYTE_KIND
   1463              && to_kind == PyUnicode_4BYTE_KIND)
   1464     {
   1465         _PyUnicode_CONVERT_BYTES(
   1466             Py_UCS2, Py_UCS4,
   1467             PyUnicode_2BYTE_DATA(from) + from_start,
   1468             PyUnicode_2BYTE_DATA(from) + from_start + how_many,
   1469             PyUnicode_4BYTE_DATA(to) + to_start
   1470             );
   1471     }
   1472     else {
   1473         assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
   1474 
   1475         if (!check_maxchar) {
   1476             if (from_kind == PyUnicode_2BYTE_KIND
   1477                 && to_kind == PyUnicode_1BYTE_KIND)
   1478             {
   1479                 _PyUnicode_CONVERT_BYTES(
   1480                     Py_UCS2, Py_UCS1,
   1481                     PyUnicode_2BYTE_DATA(from) + from_start,
   1482                     PyUnicode_2BYTE_DATA(from) + from_start + how_many,
   1483                     PyUnicode_1BYTE_DATA(to) + to_start
   1484                     );
   1485             }
   1486             else if (from_kind == PyUnicode_4BYTE_KIND
   1487                      && to_kind == PyUnicode_1BYTE_KIND)
   1488             {
   1489                 _PyUnicode_CONVERT_BYTES(
   1490                     Py_UCS4, Py_UCS1,
   1491                     PyUnicode_4BYTE_DATA(from) + from_start,
   1492                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
   1493                     PyUnicode_1BYTE_DATA(to) + to_start
   1494                     );
   1495             }
   1496             else if (from_kind == PyUnicode_4BYTE_KIND
   1497                      && to_kind == PyUnicode_2BYTE_KIND)
   1498             {
   1499                 _PyUnicode_CONVERT_BYTES(
   1500                     Py_UCS4, Py_UCS2,
   1501                     PyUnicode_4BYTE_DATA(from) + from_start,
   1502                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
   1503                     PyUnicode_2BYTE_DATA(to) + to_start
   1504                     );
   1505             }
   1506             else {
   1507                 assert(0);
   1508                 return -1;
   1509             }
   1510         }
   1511         else {
   1512             const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
   1513             Py_UCS4 ch;
   1514             Py_ssize_t i;
   1515 
   1516             for (i=0; i < how_many; i++) {
   1517                 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
   1518                 if (ch > to_maxchar)
   1519                     return -1;
   1520                 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
   1521             }
   1522         }
   1523     }
   1524     return 0;
   1525 }
   1526 
   1527 void
   1528 _PyUnicode_FastCopyCharacters(
   1529     PyObject *to, Py_ssize_t to_start,
   1530     PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
   1531 {
   1532     (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
   1533 }
   1534 
   1535 Py_ssize_t
   1536 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
   1537                          PyObject *from, Py_ssize_t from_start,
   1538                          Py_ssize_t how_many)
   1539 {
   1540     int err;
   1541 
   1542     if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
   1543         PyErr_BadInternalCall();
   1544         return -1;
   1545     }
   1546 
   1547     if (PyUnicode_READY(from) == -1)
   1548         return -1;
   1549     if (PyUnicode_READY(to) == -1)
   1550         return -1;
   1551 
   1552     if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
   1553         PyErr_SetString(PyExc_IndexError, "string index out of range");
   1554         return -1;
   1555     }
   1556     if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
   1557         PyErr_SetString(PyExc_IndexError, "string index out of range");
   1558         return -1;
   1559     }
   1560     if (how_many < 0) {
   1561         PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
   1562         return -1;
   1563     }
   1564     how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
   1565     if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
   1566         PyErr_Format(PyExc_SystemError,
   1567                      "Cannot write %zi characters at %zi "
   1568                      "in a string of %zi characters",
   1569                      how_many, to_start, PyUnicode_GET_LENGTH(to));
   1570         return -1;
   1571     }
   1572 
   1573     if (how_many == 0)
   1574         return 0;
   1575 
   1576     if (unicode_check_modifiable(to))
   1577         return -1;
   1578 
   1579     err = _copy_characters(to, to_start, from, from_start, how_many, 1);
   1580     if (err) {
   1581         PyErr_Format(PyExc_SystemError,
   1582                      "Cannot copy %s characters "
   1583                      "into a string of %s characters",
   1584                      unicode_kind_name(from),
   1585                      unicode_kind_name(to));
   1586         return -1;
   1587     }
   1588     return how_many;
   1589 }
   1590 
   1591 /* Find the maximum code point and count the number of surrogate pairs so a
   1592    correct string length can be computed before converting a string to UCS4.
   1593    This function counts single surrogates as a character and not as a pair.
   1594 
   1595    Return 0 on success, or -1 on error. */
   1596 static int
   1597 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
   1598                         Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
   1599 {
   1600     const wchar_t *iter;
   1601     Py_UCS4 ch;
   1602 
   1603     assert(num_surrogates != NULL && maxchar != NULL);
   1604     *num_surrogates = 0;
   1605     *maxchar = 0;
   1606 
   1607     for (iter = begin; iter < end; ) {
   1608 #if SIZEOF_WCHAR_T == 2
   1609         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
   1610             && (iter+1) < end
   1611             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
   1612         {
   1613             ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
   1614             ++(*num_surrogates);
   1615             iter += 2;
   1616         }
   1617         else
   1618 #endif
   1619         {
   1620             ch = *iter;
   1621             iter++;
   1622         }
   1623         if (ch > *maxchar) {
   1624             *maxchar = ch;
   1625             if (*maxchar > MAX_UNICODE) {
   1626                 PyErr_Format(PyExc_ValueError,
   1627                              "character U+%x is not in range [U+0000; U+10ffff]",
   1628                              ch);
   1629                 return -1;
   1630             }
   1631         }
   1632     }
   1633     return 0;
   1634 }
   1635 
/* Convert a legacy string that only has a wchar_t representation (kind
   PyUnicode_WCHAR_KIND, no data buffer yet) into its canonical form.

   The wstr buffer is scanned once with find_maxchar_surrogates(); the
   largest code point then selects the 1-byte, 2-byte or 4-byte layout.
   Depending on sizeof(wchar_t), the wstr buffer is either reused as the
   data buffer or converted into a newly allocated buffer and freed.

   Return 0 on success.  Return -1 with an exception set when the scan
   fails or when a buffer cannot be allocated. */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* One pass over the wchar_t data yields both the widest code point and
       the number of surrogate pairs. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* Latin-1 range: one byte per character, plus one for the
           terminating null character. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the data buffer doubles as the UTF-8 form. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        /* The wchar_t buffer has been converted and is no longer needed. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        /* Narrow each 4-byte unit into a new 2-byte buffer (length + 1 for
           the terminating null character), then drop the wchar_t buffer. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bit, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Every surrogate pair collapses into a single character. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        /* Guard the 4 * (length + 1) size computation against overflow. */
        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: reuse the wstr buffer as the
           data buffer instead of copying it. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        /* Both variants above end by null-terminating the 4-byte data. */
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
   1764 
   1765 static void
   1766 unicode_dealloc(PyObject *unicode)
   1767 {
   1768     switch (PyUnicode_CHECK_INTERNED(unicode)) {
   1769     case SSTATE_NOT_INTERNED:
   1770         break;
   1771 
   1772     case SSTATE_INTERNED_MORTAL:
   1773         /* revive dead object temporarily for DelItem */
   1774         Py_REFCNT(unicode) = 3;
   1775         if (PyDict_DelItem(interned, unicode) != 0)
   1776             Py_FatalError(
   1777                 "deletion of interned string failed");
   1778         break;
   1779 
   1780     case SSTATE_INTERNED_IMMORTAL:
   1781         Py_FatalError("Immortal interned string died.");
   1782 
   1783     default:
   1784         Py_FatalError("Inconsistent interned string state.");
   1785     }
   1786 
   1787     if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
   1788         PyObject_DEL(_PyUnicode_WSTR(unicode));
   1789     if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
   1790         PyObject_DEL(_PyUnicode_UTF8(unicode));
   1791     if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
   1792         PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
   1793 
   1794     Py_TYPE(unicode)->tp_free(unicode);
   1795 }
   1796 
   1797 #ifdef Py_DEBUG
   1798 static int
   1799 unicode_is_singleton(PyObject *unicode)
   1800 {
   1801     PyASCIIObject *ascii = (PyASCIIObject *)unicode;
   1802     if (unicode == unicode_empty)
   1803         return 1;
   1804     if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
   1805     {
   1806         Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
   1807         if (ch < 256 && unicode_latin1[ch] == unicode)
   1808             return 1;
   1809     }
   1810     return 0;
   1811 }
   1812 #endif
   1813 
   1814 static int
   1815 unicode_modifiable(PyObject *unicode)
   1816 {
   1817     assert(_PyUnicode_CHECK(unicode));
   1818     if (Py_REFCNT(unicode) != 1)
   1819         return 0;
   1820     if (_PyUnicode_HASH(unicode) != -1)
   1821         return 0;
   1822     if (PyUnicode_CHECK_INTERNED(unicode))
   1823         return 0;
   1824     if (!PyUnicode_CheckExact(unicode))
   1825         return 0;
   1826 #ifdef Py_DEBUG
   1827     /* singleton refcount is greater than 1 */
   1828     assert(!unicode_is_singleton(unicode));
   1829 #endif
   1830     return 1;
   1831 }
   1832 
   1833 static int
   1834 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
   1835 {
   1836     PyObject *unicode;
   1837     Py_ssize_t old_length;
   1838 
   1839     assert(p_unicode != NULL);
   1840     unicode = *p_unicode;
   1841 
   1842     assert(unicode != NULL);
   1843     assert(PyUnicode_Check(unicode));
   1844     assert(0 <= length);
   1845 
   1846     if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
   1847         old_length = PyUnicode_WSTR_LENGTH(unicode);
   1848     else
   1849         old_length = PyUnicode_GET_LENGTH(unicode);
   1850     if (old_length == length)
   1851         return 0;
   1852 
   1853     if (length == 0) {
   1854         _Py_INCREF_UNICODE_EMPTY();
   1855         if (!unicode_empty)
   1856             return -1;
   1857         Py_SETREF(*p_unicode, unicode_empty);
   1858         return 0;
   1859     }
   1860 
   1861     if (!unicode_modifiable(unicode)) {
   1862         PyObject *copy = resize_copy(unicode, length);
   1863         if (copy == NULL)
   1864             return -1;
   1865         Py_SETREF(*p_unicode, copy);
   1866         return 0;
   1867     }
   1868 
   1869     if (PyUnicode_IS_COMPACT(unicode)) {
   1870         PyObject *new_unicode = resize_compact(unicode, length);
   1871         if (new_unicode == NULL)
   1872             return -1;
   1873         *p_unicode = new_unicode;
   1874         return 0;
   1875     }
   1876     return resize_inplace(unicode, length);
   1877 }
   1878 
   1879 int
   1880 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
   1881 {
   1882     PyObject *unicode;
   1883     if (p_unicode == NULL) {
   1884         PyErr_BadInternalCall();
   1885         return -1;
   1886     }
   1887     unicode = *p_unicode;
   1888     if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
   1889     {
   1890         PyErr_BadInternalCall();
   1891         return -1;
   1892     }
   1893     return unicode_resize(p_unicode, length);
   1894 }
   1895 
   1896 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
   1897 
   1898    WARNING: The function doesn't copy the terminating null character and
   1899    doesn't check the maximum character (may write a latin1 character in an
   1900    ASCII string). */
   1901 static void
   1902 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
   1903                    const char *str, Py_ssize_t len)
   1904 {
   1905     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
   1906     void *data = PyUnicode_DATA(unicode);
   1907     const char *end = str + len;
   1908 
   1909     switch (kind) {
   1910     case PyUnicode_1BYTE_KIND: {
   1911         assert(index + len <= PyUnicode_GET_LENGTH(unicode));
   1912 #ifdef Py_DEBUG
   1913         if (PyUnicode_IS_ASCII(unicode)) {
   1914             Py_UCS4 maxchar = ucs1lib_find_max_char(
   1915                 (const Py_UCS1*)str,
   1916                 (const Py_UCS1*)str + len);
   1917             assert(maxchar < 128);
   1918         }
   1919 #endif
   1920         memcpy((char *) data + index, str, len);
   1921         break;
   1922     }
   1923     case PyUnicode_2BYTE_KIND: {
   1924         Py_UCS2 *start = (Py_UCS2 *)data + index;
   1925         Py_UCS2 *ucs2 = start;
   1926         assert(index <= PyUnicode_GET_LENGTH(unicode));
   1927 
   1928         for (; str < end; ++ucs2, ++str)
   1929             *ucs2 = (Py_UCS2)*str;
   1930 
   1931         assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
   1932         break;
   1933     }
   1934     default: {
   1935         Py_UCS4 *start = (Py_UCS4 *)data + index;
   1936         Py_UCS4 *ucs4 = start;
   1937         assert(kind == PyUnicode_4BYTE_KIND);
   1938         assert(index <= PyUnicode_GET_LENGTH(unicode));
   1939 
   1940         for (; str < end; ++ucs4, ++str)
   1941             *ucs4 = (Py_UCS4)*str;
   1942 
   1943         assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
   1944     }
   1945     }
   1946 }
   1947 
   1948 static PyObject*
   1949 get_latin1_char(unsigned char ch)
   1950 {
   1951     PyObject *unicode = unicode_latin1[ch];
   1952     if (!unicode) {
   1953         unicode = PyUnicode_New(1, ch);
   1954         if (!unicode)
   1955             return NULL;
   1956         PyUnicode_1BYTE_DATA(unicode)[0] = ch;
   1957         assert(_PyUnicode_CheckConsistency(unicode, 1));
   1958         unicode_latin1[ch] = unicode;
   1959     }
   1960     Py_INCREF(unicode);
   1961     return unicode;
   1962 }
   1963 
   1964 static PyObject*
   1965 unicode_char(Py_UCS4 ch)
   1966 {
   1967     PyObject *unicode;
   1968 
   1969     assert(ch <= MAX_UNICODE);
   1970 
   1971     if (ch < 256)
   1972         return get_latin1_char(ch);
   1973 
   1974     unicode = PyUnicode_New(1, ch);
   1975     if (unicode == NULL)
   1976         return NULL;
   1977     switch (PyUnicode_KIND(unicode)) {
   1978     case PyUnicode_1BYTE_KIND:
   1979         PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
   1980         break;
   1981     case PyUnicode_2BYTE_KIND:
   1982         PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
   1983         break;
   1984     default:
   1985         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
   1986         PyUnicode_4BYTE_DATA(unicode)[0] = ch;
   1987     }
   1988     assert(_PyUnicode_CheckConsistency(unicode, 1));
   1989     return unicode;
   1990 }
   1991 
   1992 PyObject *
   1993 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
   1994 {
   1995     PyObject *unicode;
   1996     Py_UCS4 maxchar = 0;
   1997     Py_ssize_t num_surrogates;
   1998 
   1999     if (u == NULL)
   2000         return (PyObject*)_PyUnicode_New(size);
   2001 
   2002     /* If the Unicode data is known at construction time, we can apply
   2003        some optimizations which share commonly used objects. */
   2004 
   2005     /* Optimization for empty strings */
   2006     if (size == 0)
   2007         _Py_RETURN_UNICODE_EMPTY();
   2008 
   2009     /* Single character Unicode objects in the Latin-1 range are
   2010        shared when using this constructor */
   2011     if (size == 1 && (Py_UCS4)*u < 256)
   2012         return get_latin1_char((unsigned char)*u);
   2013 
   2014     /* If not empty and not single character, copy the Unicode data
   2015        into the new object */
   2016     if (find_maxchar_surrogates(u, u + size,
   2017                                 &maxchar, &num_surrogates) == -1)
   2018         return NULL;
   2019 
   2020     unicode = PyUnicode_New(size - num_surrogates, maxchar);
   2021     if (!unicode)
   2022         return NULL;
   2023 
   2024     switch (PyUnicode_KIND(unicode)) {
   2025     case PyUnicode_1BYTE_KIND:
   2026         _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
   2027                                 u, u + size, PyUnicode_1BYTE_DATA(unicode));
   2028         break;
   2029     case PyUnicode_2BYTE_KIND:
   2030 #if Py_UNICODE_SIZE == 2
   2031         memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
   2032 #else
   2033         _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
   2034                                 u, u + size, PyUnicode_2BYTE_DATA(unicode));
   2035 #endif
   2036         break;
   2037     case PyUnicode_4BYTE_KIND:
   2038 #if SIZEOF_WCHAR_T == 2
   2039         /* This is the only case which has to process surrogates, thus
   2040            a simple copy loop is not enough and we need a function. */
   2041         unicode_convert_wchar_to_ucs4(u, u + size, unicode);
   2042 #else
   2043         assert(num_surrogates == 0);
   2044         memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
   2045 #endif
   2046         break;
   2047     default:
   2048         assert(0 && "Impossible state");
   2049     }
   2050 
   2051     return unicode_result(unicode);
   2052 }
   2053 
   2054 PyObject *
   2055 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
   2056 {
   2057     if (size < 0) {
   2058         PyErr_SetString(PyExc_SystemError,
   2059                         "Negative size passed to PyUnicode_FromStringAndSize");
   2060         return NULL;
   2061     }
   2062     if (u != NULL)
   2063         return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
   2064     else
   2065         return (PyObject *)_PyUnicode_New(size);
   2066 }
   2067 
   2068 PyObject *
   2069 PyUnicode_FromString(const char *u)
   2070 {
   2071     size_t size = strlen(u);
   2072     if (size > PY_SSIZE_T_MAX) {
   2073         PyErr_SetString(PyExc_OverflowError, "input too long");
   2074         return NULL;
   2075     }
   2076     return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
   2077 }
   2078 
   2079 PyObject *
   2080 _PyUnicode_FromId(_Py_Identifier *id)
   2081 {
   2082     if (!id->object) {
   2083         id->object = PyUnicode_DecodeUTF8Stateful(id->string,
   2084                                                   strlen(id->string),
   2085                                                   NULL, NULL);
   2086         if (!id->object)
   2087             return NULL;
   2088         PyUnicode_InternInPlace(&id->object);
   2089         assert(!id->next);
   2090         id->next = static_strings;
   2091         static_strings = id;
   2092     }
   2093     return id->object;
   2094 }
   2095 
   2096 void
   2097 _PyUnicode_ClearStaticStrings()
   2098 {
   2099     _Py_Identifier *tmp, *s = static_strings;
   2100     while (s) {
   2101         Py_CLEAR(s->object);
   2102         tmp = s->next;
   2103         s->next = NULL;
   2104         s = tmp;
   2105     }
   2106     static_strings = NULL;
   2107 }
   2108 
   2109 /* Internal function, doesn't check maximum character */
   2110 
   2111 PyObject*
   2112 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
   2113 {
   2114     const unsigned char *s = (const unsigned char *)buffer;
   2115     PyObject *unicode;
   2116     if (size == 1) {
   2117 #ifdef Py_DEBUG
   2118         assert((unsigned char)s[0] < 128);
   2119 #endif
   2120         return get_latin1_char(s[0]);
   2121     }
   2122     unicode = PyUnicode_New(size, 127);
   2123     if (!unicode)
   2124         return NULL;
   2125     memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
   2126     assert(_PyUnicode_CheckConsistency(unicode, 1));
   2127     return unicode;
   2128 }
   2129 
   2130 static Py_UCS4
   2131 kind_maxchar_limit(unsigned int kind)
   2132 {
   2133     switch (kind) {
   2134     case PyUnicode_1BYTE_KIND:
   2135         return 0x80;
   2136     case PyUnicode_2BYTE_KIND:
   2137         return 0x100;
   2138     case PyUnicode_4BYTE_KIND:
   2139         return 0x10000;
   2140     default:
   2141         assert(0 && "invalid kind");
   2142         return MAX_UNICODE;
   2143     }
   2144 }
   2145 
   2146 static inline Py_UCS4
   2147 align_maxchar(Py_UCS4 maxchar)
   2148 {
   2149     if (maxchar <= 127)
   2150         return 127;
   2151     else if (maxchar <= 255)
   2152         return 255;
   2153     else if (maxchar <= 65535)
   2154         return 65535;
   2155     else
   2156         return MAX_UNICODE;
   2157 }
   2158 
   2159 static PyObject*
   2160 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
   2161 {
   2162     PyObject *res;
   2163     unsigned char max_char;
   2164 
   2165     if (size == 0)
   2166         _Py_RETURN_UNICODE_EMPTY();
   2167     assert(size > 0);
   2168     if (size == 1)
   2169         return get_latin1_char(u[0]);
   2170 
   2171     max_char = ucs1lib_find_max_char(u, u + size);
   2172     res = PyUnicode_New(size, max_char);
   2173     if (!res)
   2174         return NULL;
   2175     memcpy(PyUnicode_1BYTE_DATA(res), u, size);
   2176     assert(_PyUnicode_CheckConsistency(res, 1));
   2177     return res;
   2178 }
   2179 
   2180 static PyObject*
   2181 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
   2182 {
   2183     PyObject *res;
   2184     Py_UCS2 max_char;
   2185 
   2186     if (size == 0)
   2187         _Py_RETURN_UNICODE_EMPTY();
   2188     assert(size > 0);
   2189     if (size == 1)
   2190         return unicode_char(u[0]);
   2191 
   2192     max_char = ucs2lib_find_max_char(u, u + size);
   2193     res = PyUnicode_New(size, max_char);
   2194     if (!res)
   2195         return NULL;
   2196     if (max_char >= 256)
   2197         memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
   2198     else {
   2199         _PyUnicode_CONVERT_BYTES(
   2200             Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
   2201     }
   2202     assert(_PyUnicode_CheckConsistency(res, 1));
   2203     return res;
   2204 }
   2205 
   2206 static PyObject*
   2207 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
   2208 {
   2209     PyObject *res;
   2210     Py_UCS4 max_char;
   2211 
   2212     if (size == 0)
   2213         _Py_RETURN_UNICODE_EMPTY();
   2214     assert(size > 0);
   2215     if (size == 1)
   2216         return unicode_char(u[0]);
   2217 
   2218     max_char = ucs4lib_find_max_char(u, u + size);
   2219     res = PyUnicode_New(size, max_char);
   2220     if (!res)
   2221         return NULL;
   2222     if (max_char < 256)
   2223         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
   2224                                  PyUnicode_1BYTE_DATA(res));
   2225     else if (max_char < 0x10000)
   2226         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
   2227                                  PyUnicode_2BYTE_DATA(res));
   2228     else
   2229         memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
   2230     assert(_PyUnicode_CheckConsistency(res, 1));
   2231     return res;
   2232 }
   2233 
   2234 PyObject*
   2235 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
   2236 {
   2237     if (size < 0) {
   2238         PyErr_SetString(PyExc_ValueError, "size must be positive");
   2239         return NULL;
   2240     }
   2241     switch (kind) {
   2242     case PyUnicode_1BYTE_KIND:
   2243         return _PyUnicode_FromUCS1(buffer, size);
   2244     case PyUnicode_2BYTE_KIND:
   2245         return _PyUnicode_FromUCS2(buffer, size);
   2246     case PyUnicode_4BYTE_KIND:
   2247         return _PyUnicode_FromUCS4(buffer, size);
   2248     default:
   2249         PyErr_SetString(PyExc_SystemError, "invalid kind");
   2250         return NULL;
   2251     }
   2252 }
   2253 
   2254 Py_UCS4
   2255 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
   2256 {
   2257     enum PyUnicode_Kind kind;
   2258     void *startptr, *endptr;
   2259 
   2260     assert(PyUnicode_IS_READY(unicode));
   2261     assert(0 <= start);
   2262     assert(end <= PyUnicode_GET_LENGTH(unicode));
   2263     assert(start <= end);
   2264 
   2265     if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
   2266         return PyUnicode_MAX_CHAR_VALUE(unicode);
   2267 
   2268     if (start == end)
   2269         return 127;
   2270 
   2271     if (PyUnicode_IS_ASCII(unicode))
   2272         return 127;
   2273 
   2274     kind = PyUnicode_KIND(unicode);
   2275     startptr = PyUnicode_DATA(unicode);
   2276     endptr = (char *)startptr + end * kind;
   2277     startptr = (char *)startptr + start * kind;
   2278     switch(kind) {
   2279     case PyUnicode_1BYTE_KIND:
   2280         return ucs1lib_find_max_char(startptr, endptr);
   2281     case PyUnicode_2BYTE_KIND:
   2282         return ucs2lib_find_max_char(startptr, endptr);
   2283     case PyUnicode_4BYTE_KIND:
   2284         return ucs4lib_find_max_char(startptr, endptr);
   2285     default:
   2286         assert(0);
   2287         return 0;
   2288     }
   2289 }
   2290 
   2291 /* Ensure that a string uses the most efficient storage, if it is not the
   2292    case: create a new string with of the right kind. Write NULL into *p_unicode
   2293    on error. */
   2294 static void
   2295 unicode_adjust_maxchar(PyObject **p_unicode)
   2296 {
   2297     PyObject *unicode, *copy;
   2298     Py_UCS4 max_char;
   2299     Py_ssize_t len;
   2300     unsigned int kind;
   2301 
   2302     assert(p_unicode != NULL);
   2303     unicode = *p_unicode;
   2304     assert(PyUnicode_IS_READY(unicode));
   2305     if (PyUnicode_IS_ASCII(unicode))
   2306         return;
   2307 
   2308     len = PyUnicode_GET_LENGTH(unicode);
   2309     kind = PyUnicode_KIND(unicode);
   2310     if (kind == PyUnicode_1BYTE_KIND) {
   2311         const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
   2312         max_char = ucs1lib_find_max_char(u, u + len);
   2313         if (max_char >= 128)
   2314             return;
   2315     }
   2316     else if (kind == PyUnicode_2BYTE_KIND) {
   2317         const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
   2318         max_char = ucs2lib_find_max_char(u, u + len);
   2319         if (max_char >= 256)
   2320             return;
   2321     }
   2322     else {
   2323         const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
   2324         assert(kind == PyUnicode_4BYTE_KIND);
   2325         max_char = ucs4lib_find_max_char(u, u + len);
   2326         if (max_char >= 0x10000)
   2327             return;
   2328     }
   2329     copy = PyUnicode_New(len, max_char);
   2330     if (copy != NULL)
   2331         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
   2332     Py_DECREF(unicode);
   2333     *p_unicode = copy;
   2334 }
   2335 
   2336 PyObject*
   2337 _PyUnicode_Copy(PyObject *unicode)
   2338 {
   2339     Py_ssize_t length;
   2340     PyObject *copy;
   2341 
   2342     if (!PyUnicode_Check(unicode)) {
   2343         PyErr_BadInternalCall();
   2344         return NULL;
   2345     }
   2346     if (PyUnicode_READY(unicode) == -1)
   2347         return NULL;
   2348 
   2349     length = PyUnicode_GET_LENGTH(unicode);
   2350     copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
   2351     if (!copy)
   2352         return NULL;
   2353     assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
   2354 
   2355     memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
   2356               length * PyUnicode_KIND(unicode));
   2357     assert(_PyUnicode_CheckConsistency(copy, 1));
   2358     return copy;
   2359 }
   2360 
   2361 
   2362 /* Widen Unicode objects to larger buffers. Don't write terminating null
   2363    character. Return NULL on error. */
   2364 
   2365 void*
   2366 _PyUnicode_AsKind(PyObject *s, unsigned int kind)
   2367 {
   2368     Py_ssize_t len;
   2369     void *result;
   2370     unsigned int skind;
   2371 
   2372     if (PyUnicode_READY(s) == -1)
   2373         return NULL;
   2374 
   2375     len = PyUnicode_GET_LENGTH(s);
   2376     skind = PyUnicode_KIND(s);
   2377     if (skind >= kind) {
   2378         PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
   2379         return NULL;
   2380     }
   2381     switch (kind) {
   2382     case PyUnicode_2BYTE_KIND:
   2383         result = PyMem_New(Py_UCS2, len);
   2384         if (!result)
   2385             return PyErr_NoMemory();
   2386         assert(skind == PyUnicode_1BYTE_KIND);
   2387         _PyUnicode_CONVERT_BYTES(
   2388             Py_UCS1, Py_UCS2,
   2389             PyUnicode_1BYTE_DATA(s),
   2390             PyUnicode_1BYTE_DATA(s) + len,
   2391             result);
   2392         return result;
   2393     case PyUnicode_4BYTE_KIND:
   2394         result = PyMem_New(Py_UCS4, len);
   2395         if (!result)
   2396             return PyErr_NoMemory();
   2397         if (skind == PyUnicode_2BYTE_KIND) {
   2398             _PyUnicode_CONVERT_BYTES(
   2399                 Py_UCS2, Py_UCS4,
   2400                 PyUnicode_2BYTE_DATA(s),
   2401                 PyUnicode_2BYTE_DATA(s) + len,
   2402                 result);
   2403         }
   2404         else {
   2405             assert(skind == PyUnicode_1BYTE_KIND);
   2406             _PyUnicode_CONVERT_BYTES(
   2407                 Py_UCS1, Py_UCS4,
   2408                 PyUnicode_1BYTE_DATA(s),
   2409                 PyUnicode_1BYTE_DATA(s) + len,
   2410                 result);
   2411         }
   2412         return result;
   2413     default:
   2414         break;
   2415     }
   2416     PyErr_SetString(PyExc_SystemError, "invalid kind");
   2417     return NULL;
   2418 }
   2419 
   2420 static Py_UCS4*
   2421 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
   2422         int copy_null)
   2423 {
   2424     int kind;
   2425     void *data;
   2426     Py_ssize_t len, targetlen;
   2427     if (PyUnicode_READY(string) == -1)
   2428         return NULL;
   2429     kind = PyUnicode_KIND(string);
   2430     data = PyUnicode_DATA(string);
   2431     len = PyUnicode_GET_LENGTH(string);
   2432     targetlen = len;
   2433     if (copy_null)
   2434         targetlen++;
   2435     if (!target) {
   2436         target = PyMem_New(Py_UCS4, targetlen);
   2437         if (!target) {
   2438             PyErr_NoMemory();
   2439             return NULL;
   2440         }
   2441     }
   2442     else {
   2443         if (targetsize < targetlen) {
   2444             PyErr_Format(PyExc_SystemError,
   2445                          "string is longer than the buffer");
   2446             if (copy_null && 0 < targetsize)
   2447                 target[0] = 0;
   2448             return NULL;
   2449         }
   2450     }
   2451     if (kind == PyUnicode_1BYTE_KIND) {
   2452         Py_UCS1 *start = (Py_UCS1 *) data;
   2453         _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
   2454     }
   2455     else if (kind == PyUnicode_2BYTE_KIND) {
   2456         Py_UCS2 *start = (Py_UCS2 *) data;
   2457         _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
   2458     }
   2459     else {
   2460         assert(kind == PyUnicode_4BYTE_KIND);
   2461         memcpy(target, data, len * sizeof(Py_UCS4));
   2462     }
   2463     if (copy_null)
   2464         target[len] = 0;
   2465     return target;
   2466 }
   2467 
   2468 Py_UCS4*
   2469 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
   2470                  int copy_null)
   2471 {
   2472     if (target == NULL || targetsize < 0) {
   2473         PyErr_BadInternalCall();
   2474         return NULL;
   2475     }
   2476     return as_ucs4(string, target, targetsize, copy_null);
   2477 }
   2478 
   2479 Py_UCS4*
   2480 PyUnicode_AsUCS4Copy(PyObject *string)
   2481 {
   2482     return as_ucs4(string, NULL, 0, 1);
   2483 }
   2484 
   2485 #ifdef HAVE_WCHAR_H
   2486 
   2487 PyObject *
   2488 PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
   2489 {
   2490     if (w == NULL) {
   2491         if (size == 0)
   2492             _Py_RETURN_UNICODE_EMPTY();
   2493         PyErr_BadInternalCall();
   2494         return NULL;
   2495     }
   2496 
   2497     if (size == -1) {
   2498         size = wcslen(w);
   2499     }
   2500 
   2501     return PyUnicode_FromUnicode(w, size);
   2502 }
   2503 
   2504 #endif /* HAVE_WCHAR_H */
   2505 
   2506 /* maximum number of characters required for output of %lld or %p.
   2507    We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   2508    plus 1 for the sign.  53/22 is an upper bound for log10(256). */
   2509 #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
   2510 
   2511 static int
   2512 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
   2513                              Py_ssize_t width, Py_ssize_t precision)
   2514 {
   2515     Py_ssize_t length, fill, arglen;
   2516     Py_UCS4 maxchar;
   2517 
   2518     if (PyUnicode_READY(str) == -1)
   2519         return -1;
   2520 
   2521     length = PyUnicode_GET_LENGTH(str);
   2522     if ((precision == -1 || precision >= length)
   2523         && width <= length)
   2524         return _PyUnicodeWriter_WriteStr(writer, str);
   2525 
   2526     if (precision != -1)
   2527         length = Py_MIN(precision, length);
   2528 
   2529     arglen = Py_MAX(length, width);
   2530     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
   2531         maxchar = _PyUnicode_FindMaxChar(str, 0, length);
   2532     else
   2533         maxchar = writer->maxchar;
   2534 
   2535     if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
   2536         return -1;
   2537 
   2538     if (width > length) {
   2539         fill = width - length;
   2540         if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
   2541             return -1;
   2542         writer->pos += fill;
   2543     }
   2544 
   2545     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
   2546                                   str, 0, length);
   2547     writer->pos += length;
   2548     return 0;
   2549 }
   2550 
   2551 static int
   2552 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
   2553                               Py_ssize_t width, Py_ssize_t precision)
   2554 {
   2555     /* UTF-8 */
   2556     Py_ssize_t length;
   2557     PyObject *unicode;
   2558     int res;
   2559 
   2560     length = strlen(str);
   2561     if (precision != -1)
   2562         length = Py_MIN(length, precision);
   2563     unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
   2564     if (unicode == NULL)
   2565         return -1;
   2566 
   2567     res = unicode_fromformat_write_str(writer, unicode, width, -1);
   2568     Py_DECREF(unicode);
   2569     return res;
   2570 }
   2571 
   2572 static const char*
   2573 unicode_fromformat_arg(_PyUnicodeWriter *writer,
   2574                        const char *f, va_list *vargs)
   2575 {
   2576     const char *p;
   2577     Py_ssize_t len;
   2578     int zeropad;
   2579     Py_ssize_t width;
   2580     Py_ssize_t precision;
   2581     int longflag;
   2582     int longlongflag;
   2583     int size_tflag;
   2584     Py_ssize_t fill;
   2585 
   2586     p = f;
   2587     f++;
   2588     zeropad = 0;
   2589     if (*f == '0') {
   2590         zeropad = 1;
   2591         f++;
   2592     }
   2593 
   2594     /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
   2595     width = -1;
   2596     if (Py_ISDIGIT((unsigned)*f)) {
   2597         width = *f - '0';
   2598         f++;
   2599         while (Py_ISDIGIT((unsigned)*f)) {
   2600             if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
   2601                 PyErr_SetString(PyExc_ValueError,
   2602                                 "width too big");
   2603                 return NULL;
   2604             }
   2605             width = (width * 10) + (*f - '0');
   2606             f++;
   2607         }
   2608     }
   2609     precision = -1;
   2610     if (*f == '.') {
   2611         f++;
   2612         if (Py_ISDIGIT((unsigned)*f)) {
   2613             precision = (*f - '0');
   2614             f++;
   2615             while (Py_ISDIGIT((unsigned)*f)) {
   2616                 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
   2617                     PyErr_SetString(PyExc_ValueError,
   2618                                     "precision too big");
   2619                     return NULL;
   2620                 }
   2621                 precision = (precision * 10) + (*f - '0');
   2622                 f++;
   2623             }
   2624         }
   2625         if (*f == '%') {
   2626             /* "%.3%s" => f points to "3" */
   2627             f--;
   2628         }
   2629     }
   2630     if (*f == '\0') {
   2631         /* bogus format "%.123" => go backward, f points to "3" */
   2632         f--;
   2633     }
   2634 
   2635     /* Handle %ld, %lu, %lld and %llu. */
   2636     longflag = 0;
   2637     longlongflag = 0;
   2638     size_tflag = 0;
   2639     if (*f == 'l') {
   2640         if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
   2641             longflag = 1;
   2642             ++f;
   2643         }
   2644         else if (f[1] == 'l' &&
   2645                  (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
   2646             longlongflag = 1;
   2647             f += 2;
   2648         }
   2649     }
   2650     /* handle the size_t flag. */
   2651     else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
   2652         size_tflag = 1;
   2653         ++f;
   2654     }
   2655 
   2656     if (f[1] == '\0')
   2657         writer->overallocate = 0;
   2658 
   2659     switch (*f) {
   2660     case 'c':
   2661     {
   2662         int ordinal = va_arg(*vargs, int);
   2663         if (ordinal < 0 || ordinal > MAX_UNICODE) {
   2664             PyErr_SetString(PyExc_OverflowError,
   2665                             "character argument not in range(0x110000)");
   2666             return NULL;
   2667         }
   2668         if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
   2669             return NULL;
   2670         break;
   2671     }
   2672 
   2673     case 'i':
   2674     case 'd':
   2675     case 'u':
   2676     case 'x':
   2677     {
   2678         /* used by sprintf */
   2679         char buffer[MAX_LONG_LONG_CHARS];
   2680         Py_ssize_t arglen;
   2681 
   2682         if (*f == 'u') {
   2683             if (longflag)
   2684                 len = sprintf(buffer, "%lu",
   2685                         va_arg(*vargs, unsigned long));
   2686             else if (longlongflag)
   2687                 len = sprintf(buffer, "%llu",
   2688                         va_arg(*vargs, unsigned long long));
   2689             else if (size_tflag)
   2690                 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
   2691                         va_arg(*vargs, size_t));
   2692             else
   2693                 len = sprintf(buffer, "%u",
   2694                         va_arg(*vargs, unsigned int));
   2695         }
   2696         else if (*f == 'x') {
   2697             len = sprintf(buffer, "%x", va_arg(*vargs, int));
   2698         }
   2699         else {
   2700             if (longflag)
   2701                 len = sprintf(buffer, "%li",
   2702                         va_arg(*vargs, long));
   2703             else if (longlongflag)
   2704                 len = sprintf(buffer, "%lli",
   2705                         va_arg(*vargs, long long));
   2706             else if (size_tflag)
   2707                 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
   2708                         va_arg(*vargs, Py_ssize_t));
   2709             else
   2710                 len = sprintf(buffer, "%i",
   2711                         va_arg(*vargs, int));
   2712         }
   2713         assert(len >= 0);
   2714 
   2715         if (precision < len)
   2716             precision = len;
   2717 
   2718         arglen = Py_MAX(precision, width);
   2719         if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
   2720             return NULL;
   2721 
   2722         if (width > precision) {
   2723             Py_UCS4 fillchar;
   2724             fill = width - precision;
   2725             fillchar = zeropad?'0':' ';
   2726             if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
   2727                 return NULL;
   2728             writer->pos += fill;
   2729         }
   2730         if (precision > len) {
   2731             fill = precision - len;
   2732             if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
   2733                 return NULL;
   2734             writer->pos += fill;
   2735         }
   2736 
   2737         if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
   2738             return NULL;
   2739         break;
   2740     }
   2741 
   2742     case 'p':
   2743     {
   2744         char number[MAX_LONG_LONG_CHARS];
   2745 
   2746         len = sprintf(number, "%p", va_arg(*vargs, void*));
   2747         assert(len >= 0);
   2748 
   2749         /* %p is ill-defined:  ensure leading 0x. */
   2750         if (number[1] == 'X')
   2751             number[1] = 'x';
   2752         else if (number[1] != 'x') {
   2753             memmove(number + 2, number,
   2754                     strlen(number) + 1);
   2755             number[0] = '0';
   2756             number[1] = 'x';
   2757             len += 2;
   2758         }
   2759 
   2760         if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
   2761             return NULL;
   2762         break;
   2763     }
   2764 
   2765     case 's':
   2766     {
   2767         /* UTF-8 */
   2768         const char *s = va_arg(*vargs, const char*);
   2769         if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
   2770             return NULL;
   2771         break;
   2772     }
   2773 
   2774     case 'U':
   2775     {
   2776         PyObject *obj = va_arg(*vargs, PyObject *);
   2777         assert(obj && _PyUnicode_CHECK(obj));
   2778 
   2779         if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
   2780             return NULL;
   2781         break;
   2782     }
   2783 
   2784     case 'V':
   2785     {
   2786         PyObject *obj = va_arg(*vargs, PyObject *);
   2787         const char *str = va_arg(*vargs, const char *);
   2788         if (obj) {
   2789             assert(_PyUnicode_CHECK(obj));
   2790             if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
   2791                 return NULL;
   2792         }
   2793         else {
   2794             assert(str != NULL);
   2795             if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
   2796                 return NULL;
   2797         }
   2798         break;
   2799     }
   2800 
   2801     case 'S':
   2802     {
   2803         PyObject *obj = va_arg(*vargs, PyObject *);
   2804         PyObject *str;
   2805         assert(obj);
   2806         str = PyObject_Str(obj);
   2807         if (!str)
   2808             return NULL;
   2809         if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
   2810             Py_DECREF(str);
   2811             return NULL;
   2812         }
   2813         Py_DECREF(str);
   2814         break;
   2815     }
   2816 
   2817     case 'R':
   2818     {
   2819         PyObject *obj = va_arg(*vargs, PyObject *);
   2820         PyObject *repr;
   2821         assert(obj);
   2822         repr = PyObject_Repr(obj);
   2823         if (!repr)
   2824             return NULL;
   2825         if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
   2826             Py_DECREF(repr);
   2827             return NULL;
   2828         }
   2829         Py_DECREF(repr);
   2830         break;
   2831     }
   2832 
   2833     case 'A':
   2834     {
   2835         PyObject *obj = va_arg(*vargs, PyObject *);
   2836         PyObject *ascii;
   2837         assert(obj);
   2838         ascii = PyObject_ASCII(obj);
   2839         if (!ascii)
   2840             return NULL;
   2841         if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
   2842             Py_DECREF(ascii);
   2843             return NULL;
   2844         }
   2845         Py_DECREF(ascii);
   2846         break;
   2847     }
   2848 
   2849     case '%':
   2850         if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
   2851             return NULL;
   2852         break;
   2853 
   2854     default:
   2855         /* if we stumble upon an unknown formatting code, copy the rest
   2856            of the format string to the output string. (we cannot just
   2857            skip the code, since there's no way to know what's in the
   2858            argument list) */
   2859         len = strlen(p);
   2860         if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
   2861             return NULL;
   2862         f = p+len;
   2863         return f;
   2864     }
   2865 
   2866     f++;
   2867     return f;
   2868 }
   2869 
   2870 PyObject *
   2871 PyUnicode_FromFormatV(const char *format, va_list vargs)
   2872 {
   2873     va_list vargs2;
   2874     const char *f;
   2875     _PyUnicodeWriter writer;
   2876 
   2877     _PyUnicodeWriter_Init(&writer);
   2878     writer.min_length = strlen(format) + 100;
   2879     writer.overallocate = 1;
   2880 
   2881     // Copy varags to be able to pass a reference to a subfunction.
   2882     va_copy(vargs2, vargs);
   2883 
   2884     for (f = format; *f; ) {
   2885         if (*f == '%') {
   2886             f = unicode_fromformat_arg(&writer, f, &vargs2);
   2887             if (f == NULL)
   2888                 goto fail;
   2889         }
   2890         else {
   2891             const char *p;
   2892             Py_ssize_t len;
   2893 
   2894             p = f;
   2895             do
   2896             {
   2897                 if ((unsigned char)*p > 127) {
   2898                     PyErr_Format(PyExc_ValueError,
   2899                         "PyUnicode_FromFormatV() expects an ASCII-encoded format "
   2900                         "string, got a non-ASCII byte: 0x%02x",
   2901                         (unsigned char)*p);
   2902                     goto fail;
   2903                 }
   2904                 p++;
   2905             }
   2906             while (*p != '\0' && *p != '%');
   2907             len = p - f;
   2908 
   2909             if (*p == '\0')
   2910                 writer.overallocate = 0;
   2911 
   2912             if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
   2913                 goto fail;
   2914 
   2915             f = p;
   2916         }
   2917     }
   2918     va_end(vargs2);
   2919     return _PyUnicodeWriter_Finish(&writer);
   2920 
   2921   fail:
   2922     va_end(vargs2);
   2923     _PyUnicodeWriter_Dealloc(&writer);
   2924     return NULL;
   2925 }
   2926 
   2927 PyObject *
   2928 PyUnicode_FromFormat(const char *format, ...)
   2929 {
   2930     PyObject* ret;
   2931     va_list vargs;
   2932 
   2933 #ifdef HAVE_STDARG_PROTOTYPES
   2934     va_start(vargs, format);
   2935 #else
   2936     va_start(vargs);
   2937 #endif
   2938     ret = PyUnicode_FromFormatV(format, vargs);
   2939     va_end(vargs);
   2940     return ret;
   2941 }
   2942 
   2943 #ifdef HAVE_WCHAR_H
   2944 
   2945 /* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
   2946    convert a Unicode object to a wide character string.
   2947 
   2948    - If w is NULL: return the number of wide characters (including the null
   2949      character) required to convert the unicode object. Ignore size argument.
   2950 
   2951    - Otherwise: return the number of wide characters (excluding the null
   2952      character) written into w. Write at most size wide characters (including
   2953      the null character). */
   2954 static Py_ssize_t
   2955 unicode_aswidechar(PyObject *unicode,
   2956                    wchar_t *w,
   2957                    Py_ssize_t size)
   2958 {
   2959     Py_ssize_t res;
   2960     const wchar_t *wstr;
   2961 
   2962     wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
   2963     if (wstr == NULL)
   2964         return -1;
   2965 
   2966     if (w != NULL) {
   2967         if (size > res)
   2968             size = res + 1;
   2969         else
   2970             res = size;
   2971         memcpy(w, wstr, size * sizeof(wchar_t));
   2972         return res;
   2973     }
   2974     else
   2975         return res + 1;
   2976 }
   2977 
   2978 Py_ssize_t
   2979 PyUnicode_AsWideChar(PyObject *unicode,
   2980                      wchar_t *w,
   2981                      Py_ssize_t size)
   2982 {
   2983     if (unicode == NULL) {
   2984         PyErr_BadInternalCall();
   2985         return -1;
   2986     }
   2987     return unicode_aswidechar(unicode, w, size);
   2988 }
   2989 
   2990 wchar_t*
   2991 PyUnicode_AsWideCharString(PyObject *unicode,
   2992                            Py_ssize_t *size)
   2993 {
   2994     wchar_t* buffer;
   2995     Py_ssize_t buflen;
   2996 
   2997     if (unicode == NULL) {
   2998         PyErr_BadInternalCall();
   2999         return NULL;
   3000     }
   3001 
   3002     buflen = unicode_aswidechar(unicode, NULL, 0);
   3003     if (buflen == -1)
   3004         return NULL;
   3005     buffer = PyMem_NEW(wchar_t, buflen);
   3006     if (buffer == NULL) {
   3007         PyErr_NoMemory();
   3008         return NULL;
   3009     }
   3010     buflen = unicode_aswidechar(unicode, buffer, buflen);
   3011     if (buflen == -1) {
   3012         PyMem_FREE(buffer);
   3013         return NULL;
   3014     }
   3015     if (size != NULL)
   3016         *size = buflen;
   3017     return buffer;
   3018 }
   3019 
   3020 #endif /* HAVE_WCHAR_H */
   3021 
   3022 PyObject *
   3023 PyUnicode_FromOrdinal(int ordinal)
   3024 {
   3025     if (ordinal < 0 || ordinal > MAX_UNICODE) {
   3026         PyErr_SetString(PyExc_ValueError,
   3027                         "chr() arg not in range(0x110000)");
   3028         return NULL;
   3029     }
   3030 
   3031     return unicode_char((Py_UCS4)ordinal);
   3032 }
   3033 
   3034 PyObject *
   3035 PyUnicode_FromObject(PyObject *obj)
   3036 {
   3037     /* XXX Perhaps we should make this API an alias of
   3038        PyObject_Str() instead ?! */
   3039     if (PyUnicode_CheckExact(obj)) {
   3040         if (PyUnicode_READY(obj) == -1)
   3041             return NULL;
   3042         Py_INCREF(obj);
   3043         return obj;
   3044     }
   3045     if (PyUnicode_Check(obj)) {
   3046         /* For a Unicode subtype that's not a Unicode object,
   3047            return a true Unicode object with the same data. */
   3048         return _PyUnicode_Copy(obj);
   3049     }
   3050     PyErr_Format(PyExc_TypeError,
   3051                  "Can't convert '%.100s' object to str implicitly",
   3052                  Py_TYPE(obj)->tp_name);
   3053     return NULL;
   3054 }
   3055 
   3056 PyObject *
   3057 PyUnicode_FromEncodedObject(PyObject *obj,
   3058                             const char *encoding,
   3059                             const char *errors)
   3060 {
   3061     Py_buffer buffer;
   3062     PyObject *v;
   3063 
   3064     if (obj == NULL) {
   3065         PyErr_BadInternalCall();
   3066         return NULL;
   3067     }
   3068 
   3069     /* Decoding bytes objects is the most common case and should be fast */
   3070     if (PyBytes_Check(obj)) {
   3071         if (PyBytes_GET_SIZE(obj) == 0)
   3072             _Py_RETURN_UNICODE_EMPTY();
   3073         v = PyUnicode_Decode(
   3074                 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
   3075                 encoding, errors);
   3076         return v;
   3077     }
   3078 
   3079     if (PyUnicode_Check(obj)) {
   3080         PyErr_SetString(PyExc_TypeError,
   3081                         "decoding str is not supported");
   3082         return NULL;
   3083     }
   3084 
   3085     /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
   3086     if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
   3087         PyErr_Format(PyExc_TypeError,
   3088                      "decoding to str: need a bytes-like object, %.80s found",
   3089                      Py_TYPE(obj)->tp_name);
   3090         return NULL;
   3091     }
   3092 
   3093     if (buffer.len == 0) {
   3094         PyBuffer_Release(&buffer);
   3095         _Py_RETURN_UNICODE_EMPTY();
   3096     }
   3097 
   3098     v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
   3099     PyBuffer_Release(&buffer);
   3100     return v;
   3101 }
   3102 
   3103 /* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   3104    also convert to lowercase. Return 1 on success, or 0 on error (encoding is
   3105    longer than lower_len-1). */
   3106 int
   3107 _Py_normalize_encoding(const char *encoding,
   3108                        char *lower,
   3109                        size_t lower_len)
   3110 {
   3111     const char *e;
   3112     char *l;
   3113     char *l_end;
   3114     int punct;
   3115 
   3116     assert(encoding != NULL);
   3117 
   3118     e = encoding;
   3119     l = lower;
   3120     l_end = &lower[lower_len - 1];
   3121     punct = 0;
   3122     while (1) {
   3123         char c = *e;
   3124         if (c == 0) {
   3125             break;
   3126         }
   3127 
   3128         if (Py_ISALNUM(c) || c == '.') {
   3129             if (punct && l != lower) {
   3130                 if (l == l_end) {
   3131                     return 0;
   3132                 }
   3133                 *l++ = '_';
   3134             }
   3135             punct = 0;
   3136 
   3137             if (l == l_end) {
   3138                 return 0;
   3139             }
   3140             *l++ = Py_TOLOWER(c);
   3141         }
   3142         else {
   3143             punct = 1;
   3144         }
   3145 
   3146         e++;
   3147     }
   3148     *l = '\0';
   3149     return 1;
   3150 }
   3151 
   3152 PyObject *
   3153 PyUnicode_Decode(const char *s,
   3154                  Py_ssize_t size,
   3155                  const char *encoding,
   3156                  const char *errors)
   3157 {
   3158     PyObject *buffer = NULL, *unicode;
   3159     Py_buffer info;
   3160     char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
   3161 
   3162     if (encoding == NULL) {
   3163         return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
   3164     }
   3165 
   3166     /* Shortcuts for common default encodings */
   3167     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
   3168         char *lower = buflower;
   3169 
   3170         /* Fast paths */
   3171         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
   3172             lower += 3;
   3173             if (*lower == '_') {
   3174                 /* Match "utf8" and "utf_8" */
   3175                 lower++;
   3176             }
   3177 
   3178             if (lower[0] == '8' && lower[1] == 0) {
   3179                 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
   3180             }
   3181             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
   3182                 return PyUnicode_DecodeUTF16(s, size, errors, 0);
   3183             }
   3184             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
   3185                 return PyUnicode_DecodeUTF32(s, size, errors, 0);
   3186             }
   3187         }
   3188         else {
   3189             if (strcmp(lower, "ascii") == 0
   3190                 || strcmp(lower, "us_ascii") == 0) {
   3191                 return PyUnicode_DecodeASCII(s, size, errors);
   3192             }
   3193     #ifdef MS_WINDOWS
   3194             else if (strcmp(lower, "mbcs") == 0) {
   3195                 return PyUnicode_DecodeMBCS(s, size, errors);
   3196             }
   3197     #endif
   3198             else if (strcmp(lower, "latin1") == 0
   3199                      || strcmp(lower, "latin_1") == 0
   3200                      || strcmp(lower, "iso_8859_1") == 0
   3201                      || strcmp(lower, "iso8859_1") == 0) {
   3202                 return PyUnicode_DecodeLatin1(s, size, errors);
   3203             }
   3204         }
   3205     }
   3206 
   3207     /* Decode via the codec registry */
   3208     buffer = NULL;
   3209     if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
   3210         goto onError;
   3211     buffer = PyMemoryView_FromBuffer(&info);
   3212     if (buffer == NULL)
   3213         goto onError;
   3214     unicode = _PyCodec_DecodeText(buffer, encoding, errors);
   3215     if (unicode == NULL)
   3216         goto onError;
   3217     if (!PyUnicode_Check(unicode)) {
   3218         PyErr_Format(PyExc_TypeError,
   3219                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
   3220                      "use codecs.decode() to decode to arbitrary types",
   3221                      encoding,
   3222                      Py_TYPE(unicode)->tp_name);
   3223         Py_DECREF(unicode);
   3224         goto onError;
   3225     }
   3226     Py_DECREF(buffer);
   3227     return unicode_result(unicode);
   3228 
   3229   onError:
   3230     Py_XDECREF(buffer);
   3231     return NULL;
   3232 }
   3233 
   3234 PyObject *
   3235 PyUnicode_AsDecodedObject(PyObject *unicode,
   3236                           const char *encoding,
   3237                           const char *errors)
   3238 {
   3239     if (!PyUnicode_Check(unicode)) {
   3240         PyErr_BadArgument();
   3241         return NULL;
   3242     }
   3243 
   3244     if (PyErr_WarnEx(PyExc_DeprecationWarning,
   3245                      "PyUnicode_AsDecodedObject() is deprecated; "
   3246                      "use PyCodec_Decode() to decode from str", 1) < 0)
   3247         return NULL;
   3248 
   3249     if (encoding == NULL)
   3250         encoding = PyUnicode_GetDefaultEncoding();
   3251 
   3252     /* Decode via the codec registry */
   3253     return PyCodec_Decode(unicode, encoding, errors);
   3254 }
   3255 
   3256 PyObject *
   3257 PyUnicode_AsDecodedUnicode(PyObject *unicode,
   3258                            const char *encoding,
   3259                            const char *errors)
   3260 {
   3261     PyObject *v;
   3262 
   3263     if (!PyUnicode_Check(unicode)) {
   3264         PyErr_BadArgument();
   3265         goto onError;
   3266     }
   3267 
   3268     if (PyErr_WarnEx(PyExc_DeprecationWarning,
   3269                      "PyUnicode_AsDecodedUnicode() is deprecated; "
   3270                      "use PyCodec_Decode() to decode from str to str", 1) < 0)
   3271         return NULL;
   3272 
   3273     if (encoding == NULL)
   3274         encoding = PyUnicode_GetDefaultEncoding();
   3275 
   3276     /* Decode via the codec registry */
   3277     v = PyCodec_Decode(unicode, encoding, errors);
   3278     if (v == NULL)
   3279         goto onError;
   3280     if (!PyUnicode_Check(v)) {
   3281         PyErr_Format(PyExc_TypeError,
   3282                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
   3283                      "use codecs.decode() to decode to arbitrary types",
   3284                      encoding,
   3285                      Py_TYPE(unicode)->tp_name);
   3286         Py_DECREF(v);
   3287         goto onError;
   3288     }
   3289     return unicode_result(v);
   3290 
   3291   onError:
   3292     return NULL;
   3293 }
   3294 
   3295 PyObject *
   3296 PyUnicode_Encode(const Py_UNICODE *s,
   3297                  Py_ssize_t size,
   3298                  const char *encoding,
   3299                  const char *errors)
   3300 {
   3301     PyObject *v, *unicode;
   3302 
   3303     unicode = PyUnicode_FromUnicode(s, size);
   3304     if (unicode == NULL)
   3305         return NULL;
   3306     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
   3307     Py_DECREF(unicode);
   3308     return v;
   3309 }
   3310 
   3311 PyObject *
   3312 PyUnicode_AsEncodedObject(PyObject *unicode,
   3313                           const char *encoding,
   3314                           const char *errors)
   3315 {
   3316     PyObject *v;
   3317 
   3318     if (!PyUnicode_Check(unicode)) {
   3319         PyErr_BadArgument();
   3320         goto onError;
   3321     }
   3322 
   3323     if (PyErr_WarnEx(PyExc_DeprecationWarning,
   3324                      "PyUnicode_AsEncodedObject() is deprecated; "
   3325                      "use PyUnicode_AsEncodedString() to encode from str to bytes "
   3326                      "or PyCodec_Encode() for generic encoding", 1) < 0)
   3327         return NULL;
   3328 
   3329     if (encoding == NULL)
   3330         encoding = PyUnicode_GetDefaultEncoding();
   3331 
   3332     /* Encode via the codec registry */
   3333     v = PyCodec_Encode(unicode, encoding, errors);
   3334     if (v == NULL)
   3335         goto onError;
   3336     return v;
   3337 
   3338   onError:
   3339     return NULL;
   3340 }
   3341 
   3342 static size_t
   3343 wcstombs_errorpos(const wchar_t *wstr)
   3344 {
   3345     size_t len;
   3346 #if SIZEOF_WCHAR_T == 2
   3347     wchar_t buf[3];
   3348 #else
   3349     wchar_t buf[2];
   3350 #endif
   3351     char outbuf[MB_LEN_MAX];
   3352     const wchar_t *start, *previous;
   3353 
   3354 #if SIZEOF_WCHAR_T == 2
   3355     buf[2] = 0;
   3356 #else
   3357     buf[1] = 0;
   3358 #endif
   3359     start = wstr;
   3360     while (*wstr != L'\0')
   3361     {
   3362         previous = wstr;
   3363 #if SIZEOF_WCHAR_T == 2
   3364         if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0])
   3365             && Py_UNICODE_IS_LOW_SURROGATE(wstr[1]))
   3366         {
   3367             buf[0] = wstr[0];
   3368             buf[1] = wstr[1];
   3369             wstr += 2;
   3370         }
   3371         else {
   3372             buf[0] = *wstr;
   3373             buf[1] = 0;
   3374             wstr++;
   3375         }
   3376 #else
   3377         buf[0] = *wstr;
   3378         wstr++;
   3379 #endif
   3380         len = wcstombs(outbuf, buf, sizeof(outbuf));
   3381         if (len == (size_t)-1)
   3382             return previous - start;
   3383     }
   3384 
   3385     /* failed to find the unencodable character */
   3386     return 0;
   3387 }
   3388 
   3389 static int
   3390 locale_error_handler(const char *errors, int *surrogateescape)
   3391 {
   3392     _Py_error_handler error_handler = get_error_handler(errors);
   3393     switch (error_handler)
   3394     {
   3395     case _Py_ERROR_STRICT:
   3396         *surrogateescape = 0;
   3397         return 0;
   3398     case _Py_ERROR_SURROGATEESCAPE:
   3399         *surrogateescape = 1;
   3400         return 0;
   3401     default:
   3402         PyErr_Format(PyExc_ValueError,
   3403                      "only 'strict' and 'surrogateescape' error handlers "
   3404                      "are supported, not '%s'",
   3405                      errors);
   3406         return -1;
   3407     }
   3408 }
   3409 
/* Encode a str object to bytes using the C library's locale conversion.

   errors must name the 'strict' or 'surrogateescape' handler (validated by
   locale_error_handler()).  Strings containing an embedded null character
   are rejected with ValueError.  Returns a new bytes object, or NULL with
   an exception set; unencodable characters are reported through a
   UnicodeEncodeError whose codec name is "locale". */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    Py_ssize_t wlen, wlen2;
    wchar_t *wstr;
    PyObject *bytes = NULL;
    char *errmsg;
    PyObject *reason = NULL;
    PyObject *exc;
    size_t error_pos;
    int surrogateescape;

    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
    if (wstr == NULL)
        return NULL;

    /* wcslen() stops at the first null, so a shorter length means the
       string has an embedded null character. */
    wlen2 = wcslen(wstr);
    if (wlen2 != wlen) {
        PyMem_Free(wstr);
        PyErr_SetString(PyExc_ValueError, "embedded null character");
        return NULL;
    }

    if (surrogateescape) {
        /* "surrogateescape" error handler */
        char *str;

        str = Py_EncodeLocale(wstr, &error_pos);
        if (str == NULL) {
            /* error_pos == (size_t)-1 is treated as out-of-memory; any
               other value is the position of the unencodable character. */
            if (error_pos == (size_t)-1) {
                PyErr_NoMemory();
                PyMem_Free(wstr);
                return NULL;
            }
            else {
                goto encode_error;
            }
        }
        PyMem_Free(wstr);

        bytes = PyBytes_FromString(str);
        PyMem_Free(str);
    }
    else {
        /* strict mode */
        size_t len, len2;

        /* First pass with a NULL destination only measures the output. */
        len = wcstombs(NULL, wstr, 0);
        if (len == (size_t)-1) {
            /* Position unknown: encode_error will search for it. */
            error_pos = (size_t)-1;
            goto encode_error;
        }

        bytes = PyBytes_FromStringAndSize(NULL, len);
        if (bytes == NULL) {
            PyMem_Free(wstr);
            return NULL;
        }

        /* Second pass converts into the bytes buffer; len+1 leaves room
           for the terminating null byte. */
        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
        if (len2 == (size_t)-1 || len2 > len) {
            error_pos = (size_t)-1;
            goto encode_error;
        }
        PyMem_Free(wstr);
    }
    return bytes;

encode_error:
    /* Read errno first, before any later call has a chance to change it. */
    errmsg = strerror(errno);
    assert(errmsg != NULL);

    if (error_pos == (size_t)-1)
        error_pos = wcstombs_errorpos(wstr);

    /* wstr is still owned here on every path that jumps to this label;
       bytes may or may not have been created yet. */
    PyMem_Free(wstr);
    Py_XDECREF(bytes);

    if (errmsg != NULL) {
        size_t errlen;
        /* wstr is reused to hold the decoded error message. */
        wstr = Py_DecodeLocale(errmsg, &errlen);
        if (wstr != NULL) {
            reason = PyUnicode_FromWideChar(wstr, errlen);
            PyMem_RawFree(wstr);
        } else
            errmsg = NULL;
    }
    /* Fall back to a fixed message when the system message is unusable. */
    if (errmsg == NULL)
        reason = PyUnicode_FromString(
            "wcstombs() encountered an unencodable "
            "wide character");
    if (reason == NULL)
        return NULL;

    /* Build UnicodeEncodeError("locale", unicode, pos, pos+1, reason). */
    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
                                "locale", unicode,
                                (Py_ssize_t)error_pos,
                                (Py_ssize_t)(error_pos+1),
                                reason);
    Py_DECREF(reason);
    if (exc != NULL) {
        /* Hand the exception to the strict error handler so it becomes the
           current error; its return value is not used. */
        PyCodec_StrictErrors(exc);
        Py_XDECREF(exc);
    }
    return NULL;
}
   3519 
   3520 PyObject *
   3521 PyUnicode_EncodeFSDefault(PyObject *unicode)
   3522 {
   3523 #if defined(__APPLE__)
   3524     return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
   3525 #else
   3526     PyInterpreterState *interp = PyThreadState_GET()->interp;
   3527     /* Bootstrap check: if the filesystem codec is implemented in Python, we
   3528        cannot use it to encode and decode filenames before it is loaded. Load
   3529        the Python codec requires to encode at least its own filename. Use the C
   3530        version of the locale codec until the codec registry is initialized and
   3531        the Python codec is loaded.
   3532 
   3533        Py_FileSystemDefaultEncoding is shared between all interpreters, we
   3534        cannot only rely on it: check also interp->fscodec_initialized for
   3535        subinterpreters. */
   3536     if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
   3537         return PyUnicode_AsEncodedString(unicode,
   3538                                          Py_FileSystemDefaultEncoding,
   3539                                          Py_FileSystemDefaultEncodeErrors);
   3540     }
   3541     else {
   3542         return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
   3543     }
   3544 #endif
   3545 }
   3546 
   3547 PyObject *
   3548 PyUnicode_AsEncodedString(PyObject *unicode,
   3549                           const char *encoding,
   3550                           const char *errors)
   3551 {
   3552     PyObject *v;
   3553     char buflower[11];   /* strlen("iso_8859_1\0") == 11, longest shortcut */
   3554 
   3555     if (!PyUnicode_Check(unicode)) {
   3556         PyErr_BadArgument();
   3557         return NULL;
   3558     }
   3559 
   3560     if (encoding == NULL) {
   3561         return _PyUnicode_AsUTF8String(unicode, errors);
   3562     }
   3563 
   3564     /* Shortcuts for common default encodings */
   3565     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
   3566         char *lower = buflower;
   3567 
   3568         /* Fast paths */
   3569         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
   3570             lower += 3;
   3571             if (*lower == '_') {
   3572                 /* Match "utf8" and "utf_8" */
   3573                 lower++;
   3574             }
   3575 
   3576             if (lower[0] == '8' && lower[1] == 0) {
   3577                 return _PyUnicode_AsUTF8String(unicode, errors);
   3578             }
   3579             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
   3580                 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
   3581             }
   3582             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
   3583                 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
   3584             }
   3585         }
   3586         else {
   3587             if (strcmp(lower, "ascii") == 0
   3588                 || strcmp(lower, "us_ascii") == 0) {
   3589                 return _PyUnicode_AsASCIIString(unicode, errors);
   3590             }
   3591 #ifdef MS_WINDOWS
   3592             else if (strcmp(lower, "mbcs") == 0) {
   3593                 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
   3594             }
   3595 #endif
   3596             else if (strcmp(lower, "latin1") == 0 ||
   3597                      strcmp(lower, "latin_1") == 0 ||
   3598                      strcmp(lower, "iso_8859_1") == 0 ||
   3599                      strcmp(lower, "iso8859_1") == 0) {
   3600                 return _PyUnicode_AsLatin1String(unicode, errors);
   3601             }
   3602         }
   3603     }
   3604 
   3605     /* Encode via the codec registry */
   3606     v = _PyCodec_EncodeText(unicode, encoding, errors);
   3607     if (v == NULL)
   3608         return NULL;
   3609 
   3610     /* The normal path */
   3611     if (PyBytes_Check(v))
   3612         return v;
   3613 
   3614     /* If the codec returns a buffer, raise a warning and convert to bytes */
   3615     if (PyByteArray_Check(v)) {
   3616         int error;
   3617         PyObject *b;
   3618 
   3619         error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
   3620             "encoder %s returned bytearray instead of bytes; "
   3621             "use codecs.encode() to encode to arbitrary types",
   3622             encoding);
   3623         if (error) {
   3624             Py_DECREF(v);
   3625             return NULL;
   3626         }
   3627 
   3628         b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
   3629         Py_DECREF(v);
   3630         return b;
   3631     }
   3632 
   3633     PyErr_Format(PyExc_TypeError,
   3634                  "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
   3635                  "use codecs.encode() to encode to arbitrary types",
   3636                  encoding,
   3637                  Py_TYPE(v)->tp_name);
   3638     Py_DECREF(v);
   3639     return NULL;
   3640 }
   3641 
   3642 PyObject *
   3643 PyUnicode_AsEncodedUnicode(PyObject *unicode,
   3644                            const char *encoding,
   3645                            const char *errors)
   3646 {
   3647     PyObject *v;
   3648 
   3649     if (!PyUnicode_Check(unicode)) {
   3650         PyErr_BadArgument();
   3651         goto onError;
   3652     }
   3653 
   3654     if (PyErr_WarnEx(PyExc_DeprecationWarning,
   3655                      "PyUnicode_AsEncodedUnicode() is deprecated; "
   3656                      "use PyCodec_Encode() to encode from str to str", 1) < 0)
   3657         return NULL;
   3658 
   3659     if (encoding == NULL)
   3660         encoding = PyUnicode_GetDefaultEncoding();
   3661 
   3662     /* Encode via the codec registry */
   3663     v = PyCodec_Encode(unicode, encoding, errors);
   3664     if (v == NULL)
   3665         goto onError;
   3666     if (!PyUnicode_Check(v)) {
   3667         PyErr_Format(PyExc_TypeError,
   3668                      "'%.400s' encoder returned '%.400s' instead of 'str'; "
   3669                      "use codecs.encode() to encode to arbitrary types",
   3670                      encoding,
   3671                      Py_TYPE(v)->tp_name);
   3672         Py_DECREF(v);
   3673         goto onError;
   3674     }
   3675     return v;
   3676 
   3677   onError:
   3678     return NULL;
   3679 }
   3680 
   3681 static size_t
   3682 mbstowcs_errorpos(const char *str, size_t len)
   3683 {
   3684 #ifdef HAVE_MBRTOWC
   3685     const char *start = str;
   3686     mbstate_t mbs;
   3687     size_t converted;
   3688     wchar_t ch;
   3689 
   3690     memset(&mbs, 0, sizeof mbs);
   3691     while (len)
   3692     {
   3693         converted = mbrtowc(&ch, str, len, &mbs);
   3694         if (converted == 0)
   3695             /* Reached end of string */
   3696             break;
   3697         if (converted == (size_t)-1 || converted == (size_t)-2) {
   3698             /* Conversion error or incomplete character */
   3699             return str - start;
   3700         }
   3701         else {
   3702             str += converted;
   3703             len -= converted;
   3704         }
   3705     }
   3706     /* failed to find the undecodable byte sequence */
   3707     return 0;
   3708 #endif
   3709     return 0;
   3710 }
   3711 
/* Decode len bytes at str to a str object using the C library's locale
   conversion.

   errors must name the 'strict' or 'surrogateescape' handler (validated by
   locale_error_handler()).  str must be null-terminated at str[len] and
   contain no embedded null byte, otherwise ValueError is raised.  Returns
   a new reference, or NULL with an exception set; in strict mode an
   undecodable sequence is reported through a UnicodeDecodeError whose
   codec name is "locale". */
PyObject*
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
                              const char *errors)
{
    /* Stack buffer used for short inputs to avoid a heap allocation. */
    wchar_t smallbuf[256];
    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
    wchar_t *wstr;
    size_t wlen, wlen2;
    PyObject *unicode;
    int surrogateescape;
    size_t error_pos;
    char *errmsg;
    PyObject *reason = NULL;   /* initialize to prevent gcc warning */
    PyObject *exc;

    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

    /* The C conversion functions stop at the first null byte, so the input
       must be exactly len bytes followed by a terminator. */
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
        return NULL;
    }

    if (surrogateescape) {
        /* "surrogateescape" error handler */
        wstr = Py_DecodeLocale(str, &wlen);
        if (wstr == NULL) {
            /* wlen == (size_t)-1 is treated as out-of-memory; otherwise the
               failure is reported from errno. */
            if (wlen == (size_t)-1)
                PyErr_NoMemory();
            else
                PyErr_SetFromErrno(PyExc_OSError);
            return NULL;
        }

        unicode = PyUnicode_FromWideChar(wstr, wlen);
        PyMem_RawFree(wstr);
    }
    else {
        /* strict mode */
#ifndef HAVE_BROKEN_MBSTOWCS
        /* A NULL destination only measures the number of wide characters. */
        wlen = mbstowcs(NULL, str, 0);
#else
        /* Measuring is not possible here; use the byte length instead. */
        wlen = len;
#endif
        if (wlen == (size_t)-1)
            goto decode_error;
        /* +1 accounts for the terminating null wide character. */
        if (wlen+1 <= smallbuf_len) {
            wstr = smallbuf;
        }
        else {
            wstr = PyMem_New(wchar_t, wlen+1);
            if (!wstr)
                return PyErr_NoMemory();
        }

        wlen2 = mbstowcs(wstr, str, wlen+1);
        if (wlen2 == (size_t)-1) {
            /* Only free wstr when it was heap-allocated. */
            if (wstr != smallbuf)
                PyMem_Free(wstr);
            goto decode_error;
        }
#ifdef HAVE_BROKEN_MBSTOWCS
        assert(wlen2 == wlen);
#endif
        unicode = PyUnicode_FromWideChar(wstr, wlen2);
        if (wstr != smallbuf)
            PyMem_Free(wstr);
    }
    return unicode;

decode_error:
    reason = NULL;
    /* Capture the system error message for the failed conversion. */
    errmsg = strerror(errno);
    assert(errmsg != NULL);

    error_pos = mbstowcs_errorpos(str, len);
    if (errmsg != NULL) {
        size_t errlen;
        /* wstr is reused to hold the decoded error message. */
        wstr = Py_DecodeLocale(errmsg, &errlen);
        if (wstr != NULL) {
            reason = PyUnicode_FromWideChar(wstr, errlen);
            PyMem_RawFree(wstr);
        }
    }
    /* Fall back to a fixed message when no reason could be built. */
    if (reason == NULL)
        reason = PyUnicode_FromString(
            "mbstowcs() encountered an invalid multibyte sequence");
    if (reason == NULL)
        return NULL;

    /* Build UnicodeDecodeError("locale", bytes, pos, pos+1, reason). */
    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
                                "locale", str, len,
                                (Py_ssize_t)error_pos,
                                (Py_ssize_t)(error_pos+1),
                                reason);
    Py_DECREF(reason);
    if (exc != NULL) {
        /* Hand the exception to the strict error handler so it becomes the
           current error; its return value is not used. */
        PyCodec_StrictErrors(exc);
        Py_XDECREF(exc);
    }
    return NULL;
}
   3814 
   3815 PyObject*
   3816 PyUnicode_DecodeLocale(const char *str, const char *errors)
   3817 {
   3818     Py_ssize_t size = (Py_ssize_t)strlen(str);
   3819     return PyUnicode_DecodeLocaleAndSize(str, size, errors);
   3820 }
   3821 
   3822 
   3823 PyObject*
   3824 PyUnicode_DecodeFSDefault(const char *s) {
   3825     Py_ssize_t size = (Py_ssize_t)strlen(s);
   3826     return PyUnicode_DecodeFSDefaultAndSize(s, size);
   3827 }
   3828 
   3829 PyObject*
   3830 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
   3831 {
   3832 #if defined(__APPLE__)
   3833     return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
   3834 #else
   3835     PyInterpreterState *interp = PyThreadState_GET()->interp;
   3836     /* Bootstrap check: if the filesystem codec is implemented in Python, we
   3837        cannot use it to encode and decode filenames before it is loaded. Load
   3838        the Python codec requires to encode at least its own filename. Use the C
   3839        version of the locale codec until the codec registry is initialized and
   3840        the Python codec is loaded.
   3841 
   3842        Py_FileSystemDefaultEncoding is shared between all interpreters, we
   3843        cannot only rely on it: check also interp->fscodec_initialized for
   3844        subinterpreters. */
   3845     if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
   3846         return PyUnicode_Decode(s, size,
   3847                                 Py_FileSystemDefaultEncoding,
   3848                                 Py_FileSystemDefaultEncodeErrors);
   3849     }
   3850     else {
   3851         return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
   3852     }
   3853 #endif
   3854 }
   3855 
   3856 
   3857 int
   3858 PyUnicode_FSConverter(PyObject* arg, void* addr)
   3859 {
   3860     PyObject *path = NULL;
   3861     PyObject *output = NULL;
   3862     Py_ssize_t size;
   3863     void *data;
   3864     if (arg == NULL) {
   3865         Py_DECREF(*(PyObject**)addr);
   3866         *(PyObject**)addr = NULL;
   3867         return 1;
   3868     }
   3869     path = PyOS_FSPath(arg);
   3870     if (path == NULL) {
   3871         return 0;
   3872     }
   3873     if (PyBytes_Check(path)) {
   3874         output = path;
   3875     }
   3876     else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
   3877         output = PyUnicode_EncodeFSDefault(path);
   3878         Py_DECREF(path);
   3879         if (!output) {
   3880             return 0;
   3881         }
   3882         assert(PyBytes_Check(output));
   3883     }
   3884 
   3885     size = PyBytes_GET_SIZE(output);
   3886     data = PyBytes_AS_STRING(output);
   3887     if ((size_t)size != strlen(data)) {
   3888         PyErr_SetString(PyExc_ValueError, "embedded null byte");
   3889         Py_DECREF(output);
   3890         return 0;
   3891     }
   3892     *(PyObject**)addr = output;
   3893     return Py_CLEANUP_SUPPORTED;
   3894 }
   3895 
   3896 
   3897 int
   3898 PyUnicode_FSDecoder(PyObject* arg, void* addr)
   3899 {
   3900     int is_buffer = 0;
   3901     PyObject *path = NULL;
   3902     PyObject *output = NULL;
   3903     if (arg == NULL) {
   3904         Py_DECREF(*(PyObject**)addr);
   3905         return 1;
   3906     }
   3907 
   3908     is_buffer = PyObject_CheckBuffer(arg);
   3909     if (!is_buffer) {
   3910         path = PyOS_FSPath(arg);
   3911         if (path == NULL) {
   3912             return 0;
   3913         }
   3914     }
   3915     else {
   3916         path = arg;
   3917         Py_INCREF(arg);
   3918     }
   3919 
   3920     if (PyUnicode_Check(path)) {
   3921         if (PyUnicode_READY(path) == -1) {
   3922             Py_DECREF(path);
   3923             return 0;
   3924         }
   3925         output = path;
   3926     }
   3927     else if (PyBytes_Check(path) || is_buffer) {
   3928         PyObject *path_bytes = NULL;
   3929 
   3930         if (!PyBytes_Check(path) &&
   3931             PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
   3932             "path should be string, bytes, or os.PathLike, not %.200s",
   3933             Py_TYPE(arg)->tp_name)) {
   3934                 Py_DECREF(path);
   3935             return 0;
   3936         }
   3937         path_bytes = PyBytes_FromObject(path);
   3938         Py_DECREF(path);
   3939         if (!path_bytes) {
   3940             return 0;
   3941         }
   3942         output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
   3943                                                   PyBytes_GET_SIZE(path_bytes));
   3944         Py_DECREF(path_bytes);
   3945         if (!output) {
   3946             return 0;
   3947         }
   3948     }
   3949     else {
   3950         PyErr_Format(PyExc_TypeError,
   3951                      "path should be string, bytes, or os.PathLike, not %.200s",
   3952                      Py_TYPE(arg)->tp_name);
   3953         Py_DECREF(path);
   3954         return 0;
   3955     }
   3956     if (PyUnicode_READY(output) == -1) {
   3957         Py_DECREF(output);
   3958         return 0;
   3959     }
   3960     if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
   3961                  PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
   3962         PyErr_SetString(PyExc_ValueError, "embedded null character");
   3963         Py_DECREF(output);
   3964         return 0;
   3965     }
   3966     *(PyObject**)addr = output;
   3967     return Py_CLEANUP_SUPPORTED;
   3968 }
   3969 
   3970 
   3971 char*
   3972 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
   3973 {
   3974     PyObject *bytes;
   3975 
   3976     if (!PyUnicode_Check(unicode)) {
   3977         PyErr_BadArgument();
   3978         return NULL;
   3979     }
   3980     if (PyUnicode_READY(unicode) == -1)
   3981         return NULL;
   3982 
   3983     if (PyUnicode_UTF8(unicode) == NULL) {
   3984         assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
   3985         bytes = _PyUnicode_AsUTF8String(unicode, NULL);
   3986         if (bytes == NULL)
   3987             return NULL;
   3988         _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
   3989         if (_PyUnicode_UTF8(unicode) == NULL) {
   3990             PyErr_NoMemory();
   3991             Py_DECREF(bytes);
   3992             return NULL;
   3993         }
   3994         _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
   3995         memcpy(_PyUnicode_UTF8(unicode),
   3996                   PyBytes_AS_STRING(bytes),
   3997                   _PyUnicode_UTF8_LENGTH(unicode) + 1);
   3998         Py_DECREF(bytes);
   3999     }
   4000 
   4001     if (psize)
   4002         *psize = PyUnicode_UTF8_LENGTH(unicode);
   4003     return PyUnicode_UTF8(unicode);
   4004 }
   4005 
   4006 char*
   4007 PyUnicode_AsUTF8(PyObject *unicode)
   4008 {
   4009     return PyUnicode_AsUTF8AndSize(unicode, NULL);
   4010 }
   4011 
/* Return the wchar_t representation stored in the object's wstr slot,
   building and storing it first when the slot is still NULL.

   If size is not NULL, the length of the representation (as given by
   PyUnicode_WSTR_LENGTH()) is written to *size.  Returns NULL with an
   exception set when unicode is not a str or when memory runs out. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* 16-bit wchar_t: code points above 0xFFFF need two units, so
               count them first to size the buffer. */
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* NOTE(review): unlike the branch below, this allocation size is
               not checked against PY_SSIZE_T_MAX -- confirm it cannot
               overflow for the supported string lengths. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* Second pass: copy, splitting large code points into pairs. */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                /* Debug-only sanity check on the computed length. */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* Guard the size computation below against overflow. */
            if ((size_t)_PyUnicode_LENGTH(unicode) >
                    PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
                PyErr_NoMemory();
                return NULL;
            }
            /* One wchar_t per character plus the terminator. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* NOTE(review): presumably compact ASCII objects have no
               separate wstr_length field to set -- confirm against the
               object layout. */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                /* Widen each byte to a wchar_t. */
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* Widen each 16-bit unit to a 32-bit wchar_t. */
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
   4128 
   4129 Py_UNICODE *
   4130 PyUnicode_AsUnicode(PyObject *unicode)
   4131 {
   4132     return PyUnicode_AsUnicodeAndSize(unicode, NULL);
   4133 }
   4134 
   4135 
   4136 Py_ssize_t
   4137 PyUnicode_GetSize(PyObject *unicode)
   4138 {
   4139     if (!PyUnicode_Check(unicode)) {
   4140         PyErr_BadArgument();
   4141         goto onError;
   4142     }
   4143     return PyUnicode_GET_SIZE(unicode);
   4144 
   4145   onError:
   4146     return -1;
   4147 }
   4148 
   4149 Py_ssize_t
   4150 PyUnicode_GetLength(PyObject *unicode)
   4151 {
   4152     if (!PyUnicode_Check(unicode)) {
   4153         PyErr_BadArgument();
   4154         return -1;
   4155     }
   4156     if (PyUnicode_READY(unicode) == -1)
   4157         return -1;
   4158     return PyUnicode_GET_LENGTH(unicode);
   4159 }
   4160 
   4161 Py_UCS4
   4162 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
   4163 {
   4164     void *data;
   4165     int kind;
   4166 
   4167     if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
   4168         PyErr_BadArgument();
   4169         return (Py_UCS4)-1;
   4170     }
   4171     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
   4172         PyErr_SetString(PyExc_IndexError, "string index out of range");
   4173         return (Py_UCS4)-1;
   4174     }
   4175     data = PyUnicode_DATA(unicode);
   4176     kind = PyUnicode_KIND(unicode);
   4177     return PyUnicode_READ(kind, data, index);
   4178 }
   4179 
   4180 int
   4181 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
   4182 {
   4183     if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
   4184         PyErr_BadArgument();
   4185         return -1;
   4186     }
   4187     assert(PyUnicode_IS_READY(unicode));
   4188     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
   4189         PyErr_SetString(PyExc_IndexError, "string index out of range");
   4190         return -1;
   4191     }
   4192     if (unicode_check_modifiable(unicode))
   4193         return -1;
   4194     if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
   4195         PyErr_SetString(PyExc_ValueError, "character out of range");
   4196         return -1;
   4197     }
   4198     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
   4199                     index, ch);
   4200     return 0;
   4201 }
   4202 
   4203 const char *
   4204 PyUnicode_GetDefaultEncoding(void)
   4205 {
   4206     return "utf-8";
   4207 }
   4208 
   4209 /* create or adjust a UnicodeDecodeError */
   4210 static void
   4211 make_decode_exception(PyObject **exceptionObject,
   4212                       const char *encoding,
   4213                       const char *input, Py_ssize_t length,
   4214                       Py_ssize_t startpos, Py_ssize_t endpos,
   4215                       const char *reason)
   4216 {
   4217     if (*exceptionObject == NULL) {
   4218         *exceptionObject = PyUnicodeDecodeError_Create(
   4219             encoding, input, length, startpos, endpos, reason);
   4220     }
   4221     else {
   4222         if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
   4223             goto onError;
   4224         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
   4225             goto onError;
   4226         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
   4227             goto onError;
   4228     }
   4229     return;
   4230 
   4231 onError:
   4232     Py_CLEAR(*exceptionObject);
   4233 }
   4234 
   4235 #ifdef MS_WINDOWS
   4236 /* error handling callback helper:
   4237    build arguments, call the callback and check the arguments,
   4238    if no exception occurred, copy the replacement to the output
   4239    and adjust various state variables.
   4240    return 0 on success, -1 on error
   4241 */
   4242 
   4243 static int
   4244 unicode_decode_call_errorhandler_wchar(
   4245     const char *errors, PyObject **errorHandler,
   4246     const char *encoding, const char *reason,
   4247     const char **input, const char **inend, Py_ssize_t *startinpos,
   4248     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
   4249     PyObject **output, Py_ssize_t *outpos)
   4250 {
   4251     static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
   4252 
   4253     PyObject *restuple = NULL;
   4254     PyObject *repunicode = NULL;
   4255     Py_ssize_t outsize;
   4256     Py_ssize_t insize;
   4257     Py_ssize_t requiredsize;
   4258     Py_ssize_t newpos;
   4259     PyObject *inputobj = NULL;
   4260     wchar_t *repwstr;
   4261     Py_ssize_t repwlen;
   4262 
   4263     assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
   4264     outsize = _PyUnicode_WSTR_LENGTH(*output);
   4265 
   4266     if (*errorHandler == NULL) {
   4267         *errorHandler = PyCodec_LookupError(errors);
   4268         if (*errorHandler == NULL)
   4269             goto onError;
   4270     }
   4271 
   4272     make_decode_exception(exceptionObject,
   4273         encoding,
   4274         *input, *inend - *input,
   4275         *startinpos, *endinpos,
   4276         reason);
   4277     if (*exceptionObject == NULL)
   4278         goto onError;
   4279 
   4280     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
   4281     if (restuple == NULL)
   4282         goto onError;
   4283     if (!PyTuple_Check(restuple)) {
   4284         PyErr_SetString(PyExc_TypeError, &argparse[4]);
   4285         goto onError;
   4286     }
   4287     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
   4288         goto onError;
   4289 
   4290     /* Copy back the bytes variables, which might have been modified by the
   4291        callback */
   4292     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
   4293     if (!inputobj)
   4294         goto onError;
   4295     if (!PyBytes_Check(inputobj)) {
   4296         PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
   4297     }
   4298     *input = PyBytes_AS_STRING(inputobj);
   4299     insize = PyBytes_GET_SIZE(inputobj);
   4300     *inend = *input + insize;
   4301     /* we can DECREF safely, as the exception has another reference,
   4302        so the object won't go away. */
   4303     Py_DECREF(inputobj);
   4304 
   4305     if (newpos<0)
   4306         newpos = insize+newpos;
   4307     if (newpos<0 || newpos>insize) {
   4308         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
   4309         goto onError;
   4310     }
   4311 
   4312     repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
   4313     if (repwstr == NULL)
   4314         goto onError;
   4315     /* need more space? (at least enough for what we
   4316        have+the replacement+the rest of the string (starting
   4317        at the new input position), so we won't have to check space
   4318        when there are no errors in the rest of the string) */
   4319     requiredsize = *outpos;
   4320     if (requiredsize > PY_SSIZE_T_MAX - repwlen)
   4321         goto overflow;
   4322     requiredsize += repwlen;
   4323     if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
   4324         goto overflow;
   4325     requiredsize += insize - newpos;
   4326     if (requiredsize > outsize) {
   4327         if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
   4328             requiredsize = 2*outsize;
   4329         if (unicode_resize(output, requiredsize) < 0)
   4330             goto onError;
   4331     }
   4332     wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
   4333     *outpos += repwlen;
   4334     *endinpos = newpos;
   4335     *inptr = *input + newpos;
   4336 
   4337     /* we made it! */
   4338     Py_XDECREF(restuple);
   4339     return 0;
   4340 
   4341   overflow:
   4342     PyErr_SetString(PyExc_OverflowError,
   4343                     "decoded result is too long for a Python string");
   4344 
   4345   onError:
   4346     Py_XDECREF(restuple);
   4347     return -1;
   4348 }
   4349 #endif   /* MS_WINDOWS */
   4350 
   4351 static int
   4352 unicode_decode_call_errorhandler_writer(
   4353     const char *errors, PyObject **errorHandler,
   4354     const char *encoding, const char *reason,
   4355     const char **input, const char **inend, Py_ssize_t *startinpos,
   4356     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
   4357     _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
   4358 {
   4359     static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
   4360 
   4361     PyObject *restuple = NULL;
   4362     PyObject *repunicode = NULL;
   4363     Py_ssize_t insize;
   4364     Py_ssize_t newpos;
   4365     Py_ssize_t replen;
   4366     PyObject *inputobj = NULL;
   4367 
   4368     if (*errorHandler == NULL) {
   4369         *errorHandler = PyCodec_LookupError(errors);
   4370         if (*errorHandler == NULL)
   4371             goto onError;
   4372     }
   4373 
   4374     make_decode_exception(exceptionObject,
   4375         encoding,
   4376         *input, *inend - *input,
   4377         *startinpos, *endinpos,
   4378         reason);
   4379     if (*exceptionObject == NULL)
   4380         goto onError;
   4381 
   4382     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
   4383     if (restuple == NULL)
   4384         goto onError;
   4385     if (!PyTuple_Check(restuple)) {
   4386         PyErr_SetString(PyExc_TypeError, &argparse[4]);
   4387         goto onError;
   4388     }
   4389     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
   4390         goto onError;
   4391 
   4392     /* Copy back the bytes variables, which might have been modified by the
   4393        callback */
   4394     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
   4395     if (!inputobj)
   4396         goto onError;
   4397     if (!PyBytes_Check(inputobj)) {
   4398         PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
   4399     }
   4400     *input = PyBytes_AS_STRING(inputobj);
   4401     insize = PyBytes_GET_SIZE(inputobj);
   4402     *inend = *input + insize;
   4403     /* we can DECREF safely, as the exception has another reference,
   4404        so the object won't go away. */
   4405     Py_DECREF(inputobj);
   4406 
   4407     if (newpos<0)
   4408         newpos = insize+newpos;
   4409     if (newpos<0 || newpos>insize) {
   4410         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
   4411         goto onError;
   4412     }
   4413 
   4414     if (PyUnicode_READY(repunicode) < 0)
   4415         goto onError;
   4416     replen = PyUnicode_GET_LENGTH(repunicode);
   4417     if (replen > 1) {
   4418         writer->min_length += replen - 1;
   4419         writer->overallocate = 1;
   4420         if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
   4421                             PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
   4422             goto onError;
   4423     }
   4424     if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
   4425         goto onError;
   4426 
   4427     *endinpos = newpos;
   4428     *inptr = *input + newpos;
   4429 
   4430     /* we made it! */
   4431     Py_XDECREF(restuple);
   4432     return 0;
   4433 
   4434   onError:
   4435     Py_XDECREF(restuple);
   4436     return -1;
   4437 }
   4438 
   4439 /* --- UTF-7 Codec -------------------------------------------------------- */
   4440 
   4441 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
   4442 
/* Three simple macros defining base-64. */

/* NOTE: all of the macros below are function-like macros that expand their
   argument more than once (TO_BASE64 excepted), so only pass expressions
   without side effects. */

/* Is c a base-64 character? */
/* True for 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/'.  The padding character
   '=' is not accepted. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */
/* Yields 0-25 for 'A'-'Z', 26-51 for 'a'-'z', 52-61 for '0'-'9', 62 for '+'
   and 63 for anything else; the caller must have checked IS_BASE64(c), as
   any non-base-64 input also maps to 63. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */
/* Higher bits of n are masked off, so n does not need to be reduced first. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
   4473 
   4474 /* The UTF-7 encoder treats ASCII characters differently according to
   4475  * whether they are Set D, Set O, Whitespace, or special (i.e. none of
   4476  * the above).  See RFC2152.  This array identifies these different
   4477  * sets:
   4478  * 0 : "Set D"
   4479  *     alphanumeric and '(),-./:?
   4480  * 1 : "Set O"
   4481  *     !"#$%&*;<=>@[]^_`{|}
   4482  * 2 : "whitespace"
   4483  *     ht nl cr sp
   4484  * 3 : special (must be base64 encoded)
   4485  *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
   4486  */
   4487 
   4488 static
   4489 char utf7_category[128] = {
   4490 /* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
   4491     3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
   4492 /* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
   4493     3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
   4494 /* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
   4495     2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
   4496 /*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
   4497     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
   4498 /*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
   4499     1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   4500 /*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
   4501     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
   4502 /*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
   4503     1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   4504 /*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
   4505     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
   4506 };
   4507 
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : the code point to test; the (c) < 128 check guards the index
 *            into utf7_category, and (c) > 0 means NUL is never direct
 * directO  : non-zero if "Set O" characters (category 1) are emitted as is
 * directWS : non-zero if whitespace (category 2) is emitted as is
 *
 * "Set D" characters (category 0) are always direct.  Note that c is
 * expanded several times, so it must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
   4519 
   4520 PyObject *
   4521 PyUnicode_DecodeUTF7(const char *s,
   4522                      Py_ssize_t size,
   4523                      const char *errors)
   4524 {
   4525     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
   4526 }
   4527 
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size  : the UTF-7 encoded input bytes
 * errors   : error handler name, passed on to the error callback helper
 * consumed : if non-NULL, receives the number of input bytes actually
 *            decoded; an unfinished shift sequence at the end of the input
 *            is then not an error but is left unconsumed.  If NULL, an
 *            inconsistent shift state at the end of input is reported
 *            through the error handler.
 *
 * Returns a new unicode object, or NULL with an exception set. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                    /* non-zero while inside a '+...' base-64 section */
    Py_ssize_t shiftOutStart;           /* writer position when the current shift began */
    unsigned int base64bits = 0;        /* number of valid bits held in base64buffer */
    unsigned long base64buffer = 0;     /* accumulated base-64 bits not yet emitted */
    Py_UCS4 surrogate = 0;              /* pending high surrogate, or 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Re-entry point used by the end-of-string error handling below when
           the error handler left input to be processed (s < e). */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* No low surrogate followed: emit the lone high
                               surrogate as is, then handle outCh below. */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is written out only when the
                   terminating byte is one that decodes directly; in every
                   case it is discarded afterwards. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the shift sequence starts: used both for error
               reporting and for backing off in stateful mode. */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            /* Byte >= 128 outside a shift sequence. */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Report the range [startinpos, endinpos) to the error handler; it
           may update starts, e and s. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* The handler may have repositioned s before the end of input;
               if so, resume decoding inside the loop body. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* Unfinished shift sequence: report only the input up to its
               leading '+' as consumed and drop the output produced since. */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Build the result directly from the first shiftOutStart
                   characters of the writer buffer instead of finishing the
                   writer.
                   NOTE(review): presumably because writer.maxchar may stem
                   only from the characters being discarded -- confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
   4726 
   4727 
/* Encode the unicode object `str` as UTF-7 and return a new bytes object,
   or NULL with an exception set.

   base64SetO       : non-zero to base-64 encode "Set O" characters instead
                      of emitting them directly
   base64WhiteSpace : non-zero to base-64 encode whitespace instead of
                      emitting it directly
   errors           : not referenced by this function

   Characters that are not emitted directly are written as UTF-16 code
   units (a surrogate pair for code points >= 0x10000) packed into base-64
   inside a '+' ... shift sequence. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    int kind;
    void *data;
    Py_ssize_t len;
    PyObject *v;
    int inShift = 0;                    /* non-zero while inside a '+...' base-64 section */
    Py_ssize_t i;
    unsigned int base64bits = 0;        /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;     /* bits waiting to be emitted as base-64 */
    char * out;
    char * start;

    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    if (len == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    /* It might be possible to tighten this worst case */
    /* The output buffer is allocated at 8 bytes per input character and
       trimmed to the real size at the end; guard the multiplication. */
    if (len > PY_SSIZE_T_MAX / 8)
        return PyErr_NoMemory();
    v = PyBytes_FromStringAndSize(NULL, len * 8);
    if (v == NULL)
        return NULL;

    start = out = PyBytes_AS_STRING(v);
    for (i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (inShift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (base64bits) { /* output remaining bits */
                    /* Left-align the leftover bits in a final 6-bit group. */
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
            }
            else {
                goto encode_char;
            }
        }
        else { /* not in a shift sequence */
            if (ch == '+') {
                /* A literal '+' is written as "+-". */
                *out++ = '+';
                        *out++ = '-';
            }
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                *out++ = (char) ch;
            }
            else {
                /* Open a shift sequence and encode this character in it. */
                *out++ = '+';
                inShift = 1;
                goto encode_char;
            }
        }
        continue;
encode_char:
        /* Append ch to the bit buffer as 16-bit UTF-16 code unit(s) and
           flush every complete 6-bit group. */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }
    /* End of input: flush leftover bits and close an open shift sequence. */
    if (base64bits)
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
    if (inShift)
        *out++ = '-';
    /* Trim the over-allocated buffer to the number of bytes written. */
    if (_PyBytes_Resize(&v, out - start) < 0)
        return NULL;
    return v;
}
   4829 PyObject *
   4830 PyUnicode_EncodeUTF7(const Py_UNICODE *s,
   4831                      Py_ssize_t size,
   4832                      int base64SetO,
   4833                      int base64WhiteSpace,
   4834                      const char *errors)
   4835 {
   4836     PyObject *result;
   4837     PyObject *tmp = PyUnicode_FromUnicode(s, size);
   4838     if (tmp == NULL)
   4839         return NULL;
   4840     result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
   4841                                    base64WhiteSpace, errors);
   4842     Py_DECREF(tmp);
   4843     return result;
   4844 }
   4845 
   4846 #undef IS_BASE64
   4847 #undef FROM_BASE64
   4848 #undef TO_BASE64
   4849 #undef DECODE_DIRECT
   4850 #undef ENCODE_DIRECT
   4851 
   4852 /* --- UTF-8 Codec -------------------------------------------------------- */
   4853 
   4854 PyObject *
   4855 PyUnicode_DecodeUTF8(const char *s,
   4856                      Py_ssize_t size,
   4857                      const char *errors)
   4858 {
   4859     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
   4860 }
   4861 
   4862 #include "stringlib/asciilib.h"
   4863 #include "stringlib/codecs.h"
   4864 #include "stringlib/undef.h"
   4865 
   4866 #include "stringlib/ucs1lib.h"
   4867 #include "stringlib/codecs.h"
   4868 #include "stringlib/undef.h"
   4869 
   4870 #include "stringlib/ucs2lib.h"
   4871 #include "stringlib/codecs.h"
   4872 #include "stringlib/undef.h"
   4873 
   4874 #include "stringlib/ucs4lib.h"
   4875 #include "stringlib/codecs.h"
   4876 #include "stringlib/undef.h"
   4877 
/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char. */
/* Every byte of the mask has only its top bit (0x80) set, so
   (value & ASCII_CHAR_MASK) is non-zero exactly when at least one byte of
   `value` is >= 0x80.  The width follows SIZEOF_LONG; any size other than
   4 or 8 is rejected at compile time. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
   4887 
/* Copy the leading run of ASCII bytes (values < 0x80) from [start, end)
   into dest and return its length.

   Scanning stops at the first byte with the top bit set or at `end`.
   Exactly the returned number of bytes are written to dest; no bounds
   check on dest is performed here. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    /* Upper bound for the word-at-a-time loops below, derived from `end`
       via _Py_ALIGN_DOWN with SIZEOF_LONG. */
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

    /*
     * Issue #17237: m68k is a bit different from most architectures in
     * that objects do not use "natural alignment" - for example, int and
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
     * won't work; also, tests have shown that skipping the "optimised
     * version" will even speed up m68k.
     */
#if !defined(__m68k__)
#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Source and destination are both long-aligned here, so whole words
           can be tested and copied in one step. */
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* Finish byte by byte: the tail after aligned_end, or the ASCII
           bytes leading up to the non-ASCII byte inside the last word. */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
#endif
    /* Generic path: only locate the end of the ASCII run here, then copy it
       with a single memcpy below. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help allocation */
            const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
   4951 
/* Decode `size` bytes of UTF-8 data starting at `s`.

   errors   : error handler name; "ignore", "replace" and "surrogateescape"
              style handlers (as classified by get_error_handler) are
              handled inline, everything else goes through the generic
              error handler callback
   consumed : if non-NULL, receives the number of input bytes decoded; an
              incomplete sequence at the end of the input is then left
              unconsumed instead of being treated as an error

   Returns a new unicode object, or NULL with an exception set. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    _PyUnicodeWriter writer;
    const char *starts = s;
    const char *end = s + size;

    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *errmsg = "";
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    /* Resolved lazily from `errors` on the first decoding error. */
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed)
            *consumed = 1;
        return get_latin1_char((unsigned char)s[0]);
    }

    /* Reserve room for `size` characters with maximum character 127. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    /* Copy the leading ASCII run in one go, then continue with the general
       decoder from the first non-ASCII byte. */
    writer.pos = ascii_decode(s, end, writer.data);
    s += writer.pos;
    while (s < end) {
        Py_UCS4 ch;
        int kind = writer.kind;

        /* Pick the stringlib decoder matching the writer's current storage
           kind.  Judging by the switch below, the return value is either a
           status code (0-4) or a decoded character that still has to be
           written by the caller. */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer.buffer))
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
            else
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
        }

        switch (ch) {
        case 0:
            /* Either all input was decoded, or the input ends in the middle
               of a sequence; in stateful mode the latter is not an error. */
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = end - starts;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
        case 3:
        case 4:
            /* The code value determines how many bytes are reported as
               invalid: ch - 1. */
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + ch - 1;
            break;
        default:
            /* A decoded character: write it and go on decoding. */
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = get_error_handler(errors);

        switch (error_handler) {
        case _Py_ERROR_IGNORE:
            /* Skip the invalid bytes without producing output. */
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_REPLACE:
            /* Emit U+FFFD once for the whole invalid range. */
            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
                goto onError;
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_SURROGATEESCAPE:
        {
            Py_ssize_t i;

            /* Each invalid byte b becomes the code point 0xdc00 + b, which
               needs at least 2-byte storage. */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            for (i=startinpos; i<endinpos; i++) {
                ch = (Py_UCS4)(unsigned char)(starts[i]);
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
                                ch + 0xdc00);
                writer.pos++;
            }
            s += (endinpos - startinpos);
            break;
        }

        default:
            /* Any other handler: go through the generic callback, which may
               update starts, end and s. */
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "utf-8", errmsg,
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
        }
    }

End:
    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

onError:
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
   5085 
   5086 #if defined(__APPLE__) || defined(__ANDROID__)
   5087 
   5088 /* Simplified UTF-8 decoder using surrogateescape error handler,
   5089    used to decode the command line arguments on Mac OS X and Android.
   5090 
   5091    Return a pointer to a newly allocated wide character string (use
   5092    PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
   5093 
   5094 wchar_t*
   5095 _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
   5096 {
   5097     const char *e;
   5098     wchar_t *unicode;
   5099     Py_ssize_t outpos;
   5100 
   5101     /* Note: size will always be longer than the resulting Unicode
   5102        character count */
   5103     if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
   5104         return NULL;
   5105     unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
   5106     if (!unicode)
   5107         return NULL;
   5108 
   5109     /* Unpack UTF-8 encoded data */
   5110     e = s + size;
   5111     outpos = 0;
   5112     while (s < e) {
   5113         Py_UCS4 ch;
   5114 #if SIZEOF_WCHAR_T == 4
   5115         ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
   5116 #else
   5117         ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
   5118 #endif
   5119         if (ch > 0xFF) {
   5120 #if SIZEOF_WCHAR_T == 4
   5121             assert(0);
   5122 #else
   5123             assert(ch > 0xFFFF && ch <= MAX_UNICODE);
   5124             /*  compute and append the two surrogates: */
   5125             unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
   5126             unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
   5127 #endif
   5128         }
   5129         else {
   5130             if (!ch && s == e)
   5131                 break;
   5132             /* surrogateescape */
   5133             unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
   5134         }
   5135     }
   5136     unicode[outpos] = L'\0';
   5137     return unicode;
   5138 }
   5139 
   5140 #endif /* __APPLE__ or __ANDROID__ */
   5141 
   5142 /* Primary internal function which creates utf8 encoded bytes objects.
   5143 
   5144    Allocation strategy:  if the string is short, convert into a stack buffer
   5145    and allocate exactly as much space needed at the end.  Else allocate the
   5146    maximum possible needed (4 result bytes per Unicode character), and return
   5147    the excess memory at the end.
   5148 */
   5149 PyObject *
   5150 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
   5151 {
   5152     enum PyUnicode_Kind kind;
   5153     void *data;
   5154     Py_ssize_t size;
   5155 
   5156     if (!PyUnicode_Check(unicode)) {
   5157         PyErr_BadArgument();
   5158         return NULL;
   5159     }
   5160 
   5161     if (PyUnicode_READY(unicode) == -1)
   5162         return NULL;
   5163 
   5164     if (PyUnicode_UTF8(unicode))
   5165         return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
   5166                                          PyUnicode_UTF8_LENGTH(unicode));
   5167 
   5168     kind = PyUnicode_KIND(unicode);
   5169     data = PyUnicode_DATA(unicode);
   5170     size = PyUnicode_GET_LENGTH(unicode);
   5171 
   5172     switch (kind) {
   5173     default:
   5174         assert(0);
   5175     case PyUnicode_1BYTE_KIND:
   5176         /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
   5177         assert(!PyUnicode_IS_ASCII(unicode));
   5178         return ucs1lib_utf8_encoder(unicode, data, size, errors);
   5179     case PyUnicode_2BYTE_KIND:
   5180         return ucs2lib_utf8_encoder(unicode, data, size, errors);
   5181     case PyUnicode_4BYTE_KIND:
   5182         return ucs4lib_utf8_encoder(unicode, data, size, errors);
   5183     }
   5184 }
   5185 
   5186 PyObject *
   5187 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
   5188                      Py_ssize_t size,
   5189                      const char *errors)
   5190 {
   5191     PyObject *v, *unicode;
   5192 
   5193     unicode = PyUnicode_FromUnicode(s, size);
   5194     if (unicode == NULL)
   5195         return NULL;
   5196     v = _PyUnicode_AsUTF8String(unicode, errors);
   5197     Py_DECREF(unicode);
   5198     return v;
   5199 }
   5200 
   5201 PyObject *
   5202 PyUnicode_AsUTF8String(PyObject *unicode)
   5203 {
   5204     return _PyUnicode_AsUTF8String(unicode, NULL);
   5205 }
   5206 
   5207 /* --- UTF-32 Codec ------------------------------------------------------- */
   5208 
   5209 PyObject *
   5210 PyUnicode_DecodeUTF32(const char *s,
   5211                       Py_ssize_t size,
   5212                       const char *errors,
   5213                       int *byteorder)
   5214 {
   5215     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
   5216 }
   5217 
   5218 PyObject *
   5219 PyUnicode_DecodeUTF32Stateful(const char *s,
   5220                               Py_ssize_t size,
   5221                               const char *errors,
   5222                               int *byteorder,
   5223                               Py_ssize_t *consumed)
   5224 {
   5225     const char *starts = s;
   5226     Py_ssize_t startinpos;
   5227     Py_ssize_t endinpos;
   5228     _PyUnicodeWriter writer;
   5229     const unsigned char *q, *e;
   5230     int le, bo = 0;       /* assume native ordering by default */
   5231     const char *encoding;
   5232     const char *errmsg = "";
   5233     PyObject *errorHandler = NULL;
   5234     PyObject *exc = NULL;
   5235 
   5236     q = (unsigned char *)s;
   5237     e = q + size;
   5238 
   5239     if (byteorder)
   5240         bo = *byteorder;
   5241 
   5242     /* Check for BOM marks (U+FEFF) in the input and adjust current
   5243        byte order setting accordingly. In native mode, the leading BOM
   5244        mark is skipped, in all other modes, it is copied to the output
   5245        stream as-is (giving a ZWNBSP character). */
   5246     if (bo == 0 && size >= 4) {
   5247         Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
   5248         if (bom == 0x0000FEFF) {
   5249             bo = -1;
   5250             q += 4;
   5251         }
   5252         else if (bom == 0xFFFE0000) {
   5253             bo = 1;
   5254             q += 4;
   5255         }
   5256         if (byteorder)
   5257             *byteorder = bo;
   5258     }
   5259 
   5260     if (q == e) {
   5261         if (consumed)
   5262             *consumed = size;
   5263         _Py_RETURN_UNICODE_EMPTY();
   5264     }
   5265 
   5266 #ifdef WORDS_BIGENDIAN
   5267     le = bo < 0;
   5268 #else
   5269     le = bo <= 0;
   5270 #endif
   5271     encoding = le ? "utf-32-le" : "utf-32-be";
   5272 
   5273     _PyUnicodeWriter_Init(&writer);
   5274     writer.min_length = (e - q + 3) / 4;
   5275     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
   5276         goto onError;
   5277 
   5278     while (1) {
   5279         Py_UCS4 ch = 0;
   5280         Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
   5281 
   5282         if (e - q >= 4) {
   5283             enum PyUnicode_Kind kind = writer.kind;
   5284             void *data = writer.data;
   5285             const unsigned char *last = e - 4;
   5286             Py_ssize_t pos = writer.pos;
   5287             if (le) {
   5288                 do {
   5289                     ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
   5290                     if (ch > maxch)
   5291                         break;
   5292                     if (kind != PyUnicode_1BYTE_KIND &&
   5293                         Py_UNICODE_IS_SURROGATE(ch))
   5294                         break;
   5295                     PyUnicode_WRITE(kind, data, pos++, ch);
   5296                     q += 4;
   5297                 } while (q <= last);
   5298             }
   5299             else {
   5300                 do {
   5301                     ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
   5302                     if (ch > maxch)
   5303                         break;
   5304                     if (kind != PyUnicode_1BYTE_KIND &&
   5305                         Py_UNICODE_IS_SURROGATE(ch))
   5306                         break;
   5307                     PyUnicode_WRITE(kind, data, pos++, ch);
   5308                     q += 4;
   5309                 } while (q <= last);
   5310             }
   5311             writer.pos = pos;
   5312         }
   5313 
   5314         if (Py_UNICODE_IS_SURROGATE(ch)) {
   5315             errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
   5316             startinpos = ((const char *)q) - starts;
   5317             endinpos = startinpos + 4;
   5318         }
   5319         else if (ch <= maxch) {
   5320             if (q == e || consumed)
   5321                 break;
   5322             /* remaining bytes at the end? (size should be divisible by 4) */
   5323             errmsg = "truncated data";
   5324             startinpos = ((const char *)q) - starts;
   5325             endinpos = ((const char *)e) - starts;
   5326         }
   5327         else {
   5328             if (ch < 0x110000) {
   5329                 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
   5330                     goto onError;
   5331                 q += 4;
   5332                 continue;
   5333             }
   5334             errmsg = "code point not in range(0x110000)";
   5335             startinpos = ((const char *)q) - starts;
   5336             endinpos = startinpos + 4;
   5337         }
   5338 
   5339         /* The remaining input chars are ignored if the callback
   5340            chooses to skip the input */
   5341         if (unicode_decode_call_errorhandler_writer(
   5342                 errors, &errorHandler,
   5343                 encoding, errmsg,
   5344                 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
   5345                 &writer))
   5346             goto onError;
   5347     }
   5348 
   5349     if (consumed)
   5350         *consumed = (const char *)q-starts;
   5351 
   5352     Py_XDECREF(errorHandler);
   5353     Py_XDECREF(exc);
   5354     return _PyUnicodeWriter_Finish(&writer);
   5355 
   5356   onError:
   5357     _PyUnicodeWriter_Dealloc(&writer);
   5358     Py_XDECREF(errorHandler);
   5359     Py_XDECREF(exc);
   5360     return NULL;
   5361 }
   5362 
   5363 PyObject *
   5364 _PyUnicode_EncodeUTF32(PyObject *str,
   5365                        const char *errors,
   5366                        int byteorder)
   5367 {
   5368     enum PyUnicode_Kind kind;
   5369     const void *data;
   5370     Py_ssize_t len;
   5371     PyObject *v;
   5372     uint32_t *out;
   5373 #if PY_LITTLE_ENDIAN
   5374     int native_ordering = byteorder <= 0;
   5375 #else
   5376     int native_ordering = byteorder >= 0;
   5377 #endif
   5378     const char *encoding;
   5379     Py_ssize_t nsize, pos;
   5380     PyObject *errorHandler = NULL;
   5381     PyObject *exc = NULL;
   5382     PyObject *rep = NULL;
   5383 
   5384     if (!PyUnicode_Check(str)) {
   5385         PyErr_BadArgument();
   5386         return NULL;
   5387     }
   5388     if (PyUnicode_READY(str) == -1)
   5389         return NULL;
   5390     kind = PyUnicode_KIND(str);
   5391     data = PyUnicode_DATA(str);
   5392     len = PyUnicode_GET_LENGTH(str);
   5393 
   5394     if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
   5395         return PyErr_NoMemory();
   5396     nsize = len + (byteorder == 0);
   5397     v = PyBytes_FromStringAndSize(NULL, nsize * 4);
   5398     if (v == NULL)
   5399         return NULL;
   5400 
   5401     /* output buffer is 4-bytes aligned */
   5402     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
   5403     out = (uint32_t *)PyBytes_AS_STRING(v);
   5404     if (byteorder == 0)
   5405         *out++ = 0xFEFF;
   5406     if (len == 0)
   5407         goto done;
   5408 
   5409     if (byteorder == -1)
   5410         encoding = "utf-32-le";
   5411     else if (byteorder == 1)
   5412         encoding = "utf-32-be";
   5413     else
   5414         encoding = "utf-32";
   5415 
   5416     if (kind == PyUnicode_1BYTE_KIND) {
   5417         ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
   5418         goto done;
   5419     }
   5420 
   5421     pos = 0;
   5422     while (pos < len) {
   5423         Py_ssize_t repsize, moreunits;
   5424 
   5425         if (kind == PyUnicode_2BYTE_KIND) {
   5426             pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
   5427                                         &out, native_ordering);
   5428         }
   5429         else {
   5430             assert(kind == PyUnicode_4BYTE_KIND);
   5431             pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
   5432                                         &out, native_ordering);
   5433         }
   5434         if (pos == len)
   5435             break;
   5436 
   5437         rep = unicode_encode_call_errorhandler(
   5438                 errors, &errorHandler,
   5439                 encoding, "surrogates not allowed",
   5440                 str, &exc, pos, pos + 1, &pos);
   5441         if (!rep)
   5442             goto error;
   5443 
   5444         if (PyBytes_Check(rep)) {
   5445             repsize = PyBytes_GET_SIZE(rep);
   5446             if (repsize & 3) {
   5447                 raise_encode_exception(&exc, encoding,
   5448                                        str, pos - 1, pos,
   5449                                        "surrogates not allowed");
   5450                 goto error;
   5451             }
   5452             moreunits = repsize / 4;
   5453         }
   5454         else {
   5455             assert(PyUnicode_Check(rep));
   5456             if (PyUnicode_READY(rep) < 0)
   5457                 goto error;
   5458             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
   5459             if (!PyUnicode_IS_ASCII(rep)) {
   5460                 raise_encode_exception(&exc, encoding,
   5461                                        str, pos - 1, pos,
   5462                                        "surrogates not allowed");
   5463                 goto error;
   5464             }
   5465         }
   5466 
   5467         /* four bytes are reserved for each surrogate */
   5468         if (moreunits > 1) {
   5469             Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
   5470             Py_ssize_t morebytes = 4 * (moreunits - 1);
   5471             if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
   5472                 /* integer overflow */
   5473                 PyErr_NoMemory();
   5474                 goto error;
   5475             }
   5476             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
   5477                 goto error;
   5478             out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
   5479         }
   5480 
   5481         if (PyBytes_Check(rep)) {
   5482             memcpy(out, PyBytes_AS_STRING(rep), repsize);
   5483             out += moreunits;
   5484         } else /* rep is unicode */ {
   5485             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
   5486             ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
   5487                                  &out, native_ordering);
   5488         }
   5489 
   5490         Py_CLEAR(rep);
   5491     }
   5492 
   5493     /* Cut back to size actually needed. This is necessary for, for example,
   5494        encoding of a string containing isolated surrogates and the 'ignore'
   5495        handler is used. */
   5496     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
   5497     if (nsize != PyBytes_GET_SIZE(v))
   5498       _PyBytes_Resize(&v, nsize);
   5499     Py_XDECREF(errorHandler);
   5500     Py_XDECREF(exc);
   5501   done:
   5502     return v;
   5503   error:
   5504     Py_XDECREF(rep);
   5505     Py_XDECREF(errorHandler);
   5506     Py_XDECREF(exc);
   5507     Py_XDECREF(v);
   5508     return NULL;
   5509 }
   5510 
   5511 PyObject *
   5512 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
   5513                       Py_ssize_t size,
   5514                       const char *errors,
   5515                       int byteorder)
   5516 {
   5517     PyObject *result;
   5518     PyObject *tmp = PyUnicode_FromUnicode(s, size);
   5519     if (tmp == NULL)
   5520         return NULL;
   5521     result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
   5522     Py_DECREF(tmp);
   5523     return result;
   5524 }
   5525 
   5526 PyObject *
   5527 PyUnicode_AsUTF32String(PyObject *unicode)
   5528 {
   5529     return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
   5530 }
   5531 
   5532 /* --- UTF-16 Codec ------------------------------------------------------- */
   5533 
   5534 PyObject *
   5535 PyUnicode_DecodeUTF16(const char *s,
   5536                       Py_ssize_t size,
   5537                       const char *errors,
   5538                       int *byteorder)
   5539 {
   5540     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
   5541 }
   5542 
   5543 PyObject *
   5544 PyUnicode_DecodeUTF16Stateful(const char *s,
   5545                               Py_ssize_t size,
   5546                               const char *errors,
   5547                               int *byteorder,
   5548                               Py_ssize_t *consumed)
   5549 {
   5550     const char *starts = s;
   5551     Py_ssize_t startinpos;
   5552     Py_ssize_t endinpos;
   5553     _PyUnicodeWriter writer;
   5554     const unsigned char *q, *e;
   5555     int bo = 0;       /* assume native ordering by default */
   5556     int native_ordering;
   5557     const char *errmsg = "";
   5558     PyObject *errorHandler = NULL;
   5559     PyObject *exc = NULL;
   5560     const char *encoding;
   5561 
   5562     q = (unsigned char *)s;
   5563     e = q + size;
   5564 
   5565     if (byteorder)
   5566         bo = *byteorder;
   5567 
   5568     /* Check for BOM marks (U+FEFF) in the input and adjust current
   5569        byte order setting accordingly. In native mode, the leading BOM
   5570        mark is skipped, in all other modes, it is copied to the output
   5571        stream as-is (giving a ZWNBSP character). */
   5572     if (bo == 0 && size >= 2) {
   5573         const Py_UCS4 bom = (q[1] << 8) | q[0];
   5574         if (bom == 0xFEFF) {
   5575             q += 2;
   5576             bo = -1;
   5577         }
   5578         else if (bom == 0xFFFE) {
   5579             q += 2;
   5580             bo = 1;
   5581         }
   5582         if (byteorder)
   5583             *byteorder = bo;
   5584     }
   5585 
   5586     if (q == e) {
   5587         if (consumed)
   5588             *consumed = size;
   5589         _Py_RETURN_UNICODE_EMPTY();
   5590     }
   5591 
   5592 #if PY_LITTLE_ENDIAN
   5593     native_ordering = bo <= 0;
   5594     encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
   5595 #else
   5596     native_ordering = bo >= 0;
   5597     encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
   5598 #endif
   5599 
   5600     /* Note: size will always be longer than the resulting Unicode
   5601        character count */
   5602     _PyUnicodeWriter_Init(&writer);
   5603     writer.min_length = (e - q + 1) / 2;
   5604     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
   5605         goto onError;
   5606 
   5607     while (1) {
   5608         Py_UCS4 ch = 0;
   5609         if (e - q >= 2) {
   5610             int kind = writer.kind;
   5611             if (kind == PyUnicode_1BYTE_KIND) {
   5612                 if (PyUnicode_IS_ASCII(writer.buffer))
   5613                     ch = asciilib_utf16_decode(&q, e,
   5614                             (Py_UCS1*)writer.data, &writer.pos,
   5615                             native_ordering);
   5616                 else
   5617                     ch = ucs1lib_utf16_decode(&q, e,
   5618                             (Py_UCS1*)writer.data, &writer.pos,
   5619                             native_ordering);
   5620             } else if (kind == PyUnicode_2BYTE_KIND) {
   5621                 ch = ucs2lib_utf16_decode(&q, e,
   5622                         (Py_UCS2*)writer.data, &writer.pos,
   5623                         native_ordering);
   5624             } else {
   5625                 assert(kind == PyUnicode_4BYTE_KIND);
   5626                 ch = ucs4lib_utf16_decode(&q, e,
   5627                         (Py_UCS4*)writer.data, &writer.pos,
   5628                         native_ordering);
   5629             }
   5630         }
   5631 
   5632         switch (ch)
   5633         {
   5634         case 0:
   5635             /* remaining byte at the end? (size should be even) */
   5636             if (q == e || consumed)
   5637                 goto End;
   5638             errmsg = "truncated data";
   5639             startinpos = ((const char *)q) - starts;
   5640             endinpos = ((const char *)e) - starts;
   5641             break;
   5642             /* The remaining input chars are ignored if the callback
   5643                chooses to skip the input */
   5644         case 1:
   5645             q -= 2;
   5646             if (consumed)
   5647                 goto End;
   5648             errmsg = "unexpected end of data";
   5649             startinpos = ((const char *)q) - starts;
   5650             endinpos = ((const char *)e) - starts;
   5651             break;
   5652         case 2:
   5653             errmsg = "illegal encoding";
   5654             startinpos = ((const char *)q) - 2 - starts;
   5655             endinpos = startinpos + 2;
   5656             break;
   5657         case 3:
   5658             errmsg = "illegal UTF-16 surrogate";
   5659             startinpos = ((const char *)q) - 4 - starts;
   5660             endinpos = startinpos + 2;
   5661             break;
   5662         default:
   5663             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
   5664                 goto onError;
   5665             continue;
   5666         }
   5667 
   5668         if (unicode_decode_call_errorhandler_writer(
   5669                 errors,
   5670                 &errorHandler,
   5671                 encoding, errmsg,
   5672                 &starts,
   5673                 (const char **)&e,
   5674                 &startinpos,
   5675                 &endinpos,
   5676                 &exc,
   5677                 (const char **)&q,
   5678                 &writer))
   5679             goto onError;
   5680     }
   5681 
   5682 End:
   5683     if (consumed)
   5684         *consumed = (const char *)q-starts;
   5685 
   5686     Py_XDECREF(errorHandler);
   5687     Py_XDECREF(exc);
   5688     return _PyUnicodeWriter_Finish(&writer);
   5689 
   5690   onError:
   5691     _PyUnicodeWriter_Dealloc(&writer);
   5692     Py_XDECREF(errorHandler);
   5693     Py_XDECREF(exc);
   5694     return NULL;
   5695 }
   5696 
   5697 PyObject *
   5698 _PyUnicode_EncodeUTF16(PyObject *str,
   5699                        const char *errors,
   5700                        int byteorder)
   5701 {
   5702     enum PyUnicode_Kind kind;
   5703     const void *data;
   5704     Py_ssize_t len;
   5705     PyObject *v;
   5706     unsigned short *out;
   5707     Py_ssize_t pairs;
   5708 #if PY_BIG_ENDIAN
   5709     int native_ordering = byteorder >= 0;
   5710 #else
   5711     int native_ordering = byteorder <= 0;
   5712 #endif
   5713     const char *encoding;
   5714     Py_ssize_t nsize, pos;
   5715     PyObject *errorHandler = NULL;
   5716     PyObject *exc = NULL;
   5717     PyObject *rep = NULL;
   5718 
   5719     if (!PyUnicode_Check(str)) {
   5720         PyErr_BadArgument();
   5721         return NULL;
   5722     }
   5723     if (PyUnicode_READY(str) == -1)
   5724         return NULL;
   5725     kind = PyUnicode_KIND(str);
   5726     data = PyUnicode_DATA(str);
   5727     len = PyUnicode_GET_LENGTH(str);
   5728 
   5729     pairs = 0;
   5730     if (kind == PyUnicode_4BYTE_KIND) {
   5731         const Py_UCS4 *in = (const Py_UCS4 *)data;
   5732         const Py_UCS4 *end = in + len;
   5733         while (in < end) {
   5734             if (*in++ >= 0x10000) {
   5735                 pairs++;
   5736             }
   5737         }
   5738     }
   5739     if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
   5740         return PyErr_NoMemory();
   5741     }
   5742     nsize = len + pairs + (byteorder == 0);
   5743     v = PyBytes_FromStringAndSize(NULL, nsize * 2);
   5744     if (v == NULL) {
   5745         return NULL;
   5746     }
   5747 
   5748     /* output buffer is 2-bytes aligned */
   5749     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
   5750     out = (unsigned short *)PyBytes_AS_STRING(v);
   5751     if (byteorder == 0) {
   5752         *out++ = 0xFEFF;
   5753     }
   5754     if (len == 0) {
   5755         goto done;
   5756     }
   5757 
   5758     if (kind == PyUnicode_1BYTE_KIND) {
   5759         ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
   5760         goto done;
   5761     }
   5762 
   5763     if (byteorder < 0) {
   5764         encoding = "utf-16-le";
   5765     }
   5766     else if (byteorder > 0) {
   5767         encoding = "utf-16-be";
   5768     }
   5769     else {
   5770         encoding = "utf-16";
   5771     }
   5772 
   5773     pos = 0;
   5774     while (pos < len) {
   5775         Py_ssize_t repsize, moreunits;
   5776 
   5777         if (kind == PyUnicode_2BYTE_KIND) {
   5778             pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
   5779                                         &out, native_ordering);
   5780         }
   5781         else {
   5782             assert(kind == PyUnicode_4BYTE_KIND);
   5783             pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
   5784                                         &out, native_ordering);
   5785         }
   5786         if (pos == len)
   5787             break;
   5788 
   5789         rep = unicode_encode_call_errorhandler(
   5790                 errors, &errorHandler,
   5791                 encoding, "surrogates not allowed",
   5792                 str, &exc, pos, pos + 1, &pos);
   5793         if (!rep)
   5794             goto error;
   5795 
   5796         if (PyBytes_Check(rep)) {
   5797             repsize = PyBytes_GET_SIZE(rep);
   5798             if (repsize & 1) {
   5799                 raise_encode_exception(&exc, encoding,
   5800                                        str, pos - 1, pos,
   5801                                        "surrogates not allowed");
   5802                 goto error;
   5803             }
   5804             moreunits = repsize / 2;
   5805         }
   5806         else {
   5807             assert(PyUnicode_Check(rep));
   5808             if (PyUnicode_READY(rep) < 0)
   5809                 goto error;
   5810             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
   5811             if (!PyUnicode_IS_ASCII(rep)) {
   5812                 raise_encode_exception(&exc, encoding,
   5813                                        str, pos - 1, pos,
   5814                                        "surrogates not allowed");
   5815                 goto error;
   5816             }
   5817         }
   5818 
   5819         /* two bytes are reserved for each surrogate */
   5820         if (moreunits > 1) {
   5821             Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
   5822             Py_ssize_t morebytes = 2 * (moreunits - 1);
   5823             if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
   5824                 /* integer overflow */
   5825                 PyErr_NoMemory();
   5826                 goto error;
   5827             }
   5828             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
   5829                 goto error;
   5830             out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
   5831         }
   5832 
   5833         if (PyBytes_Check(rep)) {
   5834             memcpy(out, PyBytes_AS_STRING(rep), repsize);
   5835             out += moreunits;
   5836         } else /* rep is unicode */ {
   5837             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
   5838             ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
   5839                                  &out, native_ordering);
   5840         }
   5841 
   5842         Py_CLEAR(rep);
   5843     }
   5844 
   5845     /* Cut back to size actually needed. This is necessary for, for example,
   5846     encoding of a string containing isolated surrogates and the 'ignore' handler
   5847     is used. */
   5848     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
   5849     if (nsize != PyBytes_GET_SIZE(v))
   5850       _PyBytes_Resize(&v, nsize);
   5851     Py_XDECREF(errorHandler);
   5852     Py_XDECREF(exc);
   5853   done:
   5854     return v;
   5855   error:
   5856     Py_XDECREF(rep);
   5857     Py_XDECREF(errorHandler);
   5858     Py_XDECREF(exc);
   5859     Py_XDECREF(v);
   5860     return NULL;
   5861 #undef STORECHAR
   5862 }
   5863 
   5864 PyObject *
   5865 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
   5866                       Py_ssize_t size,
   5867                       const char *errors,
   5868                       int byteorder)
   5869 {
   5870     PyObject *result;
   5871     PyObject *tmp = PyUnicode_FromUnicode(s, size);
   5872     if (tmp == NULL)
   5873         return NULL;
   5874     result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
   5875     Py_DECREF(tmp);
   5876     return result;
   5877 }
   5878 
   5879 PyObject *
   5880 PyUnicode_AsUTF16String(PyObject *unicode)
   5881 {
   5882     return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
   5883 }
   5884 
   5885 /* --- Unicode Escape Codec ----------------------------------------------- */
   5886 
   5887 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
   5888 
   5889 PyObject *
   5890 _PyUnicode_DecodeUnicodeEscape(const char *s,
   5891                                Py_ssize_t size,
   5892                                const char *errors,
   5893                                const char **first_invalid_escape)
   5894 {
   5895     const char *starts = s;
   5896     _PyUnicodeWriter writer;
   5897     const char *end;
   5898     PyObject *errorHandler = NULL;
   5899     PyObject *exc = NULL;
   5900 
   5901     // so we can remember if we've seen an invalid escape char or not
   5902     *first_invalid_escape = NULL;
   5903 
   5904     if (size == 0) {
   5905         _Py_RETURN_UNICODE_EMPTY();
   5906     }
   5907     /* Escaped strings will always be longer than the resulting
   5908        Unicode string, so we start with size here and then reduce the
   5909        length after conversion to the true value.
   5910        (but if the error callback returns a long replacement string
   5911        we'll have to allocate more space) */
   5912     _PyUnicodeWriter_Init(&writer);
   5913     writer.min_length = size;
   5914     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
   5915         goto onError;
   5916     }
   5917 
   5918     end = s + size;
   5919     while (s < end) {
   5920         unsigned char c = (unsigned char) *s++;
   5921         Py_UCS4 ch;
   5922         int count;
   5923         Py_ssize_t startinpos;
   5924         Py_ssize_t endinpos;
   5925         const char *message;
   5926 
   5927 #define WRITE_ASCII_CHAR(ch)                                                  \
   5928             do {                                                              \
   5929                 assert(ch <= 127);                                            \
   5930                 assert(writer.pos < writer.size);                             \
   5931                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
   5932             } while(0)
   5933 
   5934 #define WRITE_CHAR(ch)                                                        \
   5935             do {                                                              \
   5936                 if (ch <= writer.maxchar) {                                   \
   5937                     assert(writer.pos < writer.size);                         \
   5938                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
   5939                 }                                                             \
   5940                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
   5941                     goto onError;                                             \
   5942                 }                                                             \
   5943             } while(0)
   5944 
   5945         /* Non-escape characters are interpreted as Unicode ordinals */
   5946         if (c != '\\') {
   5947             WRITE_CHAR(c);
   5948             continue;
   5949         }
   5950 
   5951         startinpos = s - starts - 1;
   5952         /* \ - Escapes */
   5953         if (s >= end) {
   5954             message = "\\ at end of string";
   5955             goto error;
   5956         }
   5957         c = (unsigned char) *s++;
   5958 
   5959         assert(writer.pos < writer.size);
   5960         switch (c) {
   5961 
   5962             /* \x escapes */
   5963         case '\n': continue;
   5964         case '\\': WRITE_ASCII_CHAR('\\'); continue;
   5965         case '\'': WRITE_ASCII_CHAR('\''); continue;
   5966         case '\"': WRITE_ASCII_CHAR('\"'); continue;
   5967         case 'b': WRITE_ASCII_CHAR('\b'); continue;
   5968         /* FF */
   5969         case 'f': WRITE_ASCII_CHAR('\014'); continue;
   5970         case 't': WRITE_ASCII_CHAR('\t'); continue;
   5971         case 'n': WRITE_ASCII_CHAR('\n'); continue;
   5972         case 'r': WRITE_ASCII_CHAR('\r'); continue;
   5973         /* VT */
   5974         case 'v': WRITE_ASCII_CHAR('\013'); continue;
   5975         /* BEL, not classic C */
   5976         case 'a': WRITE_ASCII_CHAR('\007'); continue;
   5977 
   5978             /* \OOO (octal) escapes */
   5979         case '0': case '1': case '2': case '3':
   5980         case '4': case '5': case '6': case '7':
   5981             ch = c - '0';
   5982             if (s < end && '0' <= *s && *s <= '7') {
   5983                 ch = (ch<<3) + *s++ - '0';
   5984                 if (s < end && '0' <= *s && *s <= '7') {
   5985                     ch = (ch<<3) + *s++ - '0';
   5986                 }
   5987             }
   5988             WRITE_CHAR(ch);
   5989             continue;
   5990 
   5991             /* hex escapes */
   5992             /* \xXX */
   5993         case 'x':
   5994             count = 2;
   5995             message = "truncated \\xXX escape";
   5996             goto hexescape;
   5997 
   5998             /* \uXXXX */
   5999         case 'u':
   6000             count = 4;
   6001             message = "truncated \\uXXXX escape";
   6002             goto hexescape;
   6003 
   6004             /* \UXXXXXXXX */
   6005         case 'U':
   6006             count = 8;
   6007             message = "truncated \\UXXXXXXXX escape";
   6008         hexescape:
   6009             for (ch = 0; count && s < end; ++s, --count) {
   6010                 c = (unsigned char)*s;
   6011                 ch <<= 4;
   6012                 if (c >= '0' && c <= '9') {
   6013                     ch += c - '0';
   6014                 }
   6015                 else if (c >= 'a' && c <= 'f') {
   6016                     ch += c - ('a' - 10);
   6017                 }
   6018                 else if (c >= 'A' && c <= 'F') {
   6019                     ch += c - ('A' - 10);
   6020                 }
   6021                 else {
   6022                     break;
   6023                 }
   6024             }
   6025             if (count) {
   6026                 goto error;
   6027             }
   6028 
   6029             /* when we get here, ch is a 32-bit unicode character */
   6030             if (ch > MAX_UNICODE) {
   6031                 message = "illegal Unicode character";
   6032                 goto error;
   6033             }
   6034 
   6035             WRITE_CHAR(ch);
   6036             continue;
   6037 
   6038             /* \N{name} */
   6039         case 'N':
   6040             if (ucnhash_CAPI == NULL) {
   6041                 /* load the unicode data module */
   6042                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
   6043                                                 PyUnicodeData_CAPSULE_NAME, 1);
   6044                 if (ucnhash_CAPI == NULL) {
   6045                     PyErr_SetString(
   6046                         PyExc_UnicodeError,
   6047                         "\\N escapes not supported (can't load unicodedata module)"
   6048                         );
   6049                     goto onError;
   6050                 }
   6051             }
   6052 
   6053             message = "malformed \\N character escape";
   6054             if (*s == '{') {
   6055                 const char *start = ++s;
   6056                 size_t namelen;
   6057                 /* look for the closing brace */
   6058                 while (s < end && *s != '}')
   6059                     s++;
   6060                 namelen = s - start;
   6061                 if (namelen && s < end) {
   6062                     /* found a name.  look it up in the unicode database */
   6063                     s++;
   6064                     ch = 0xffffffff; /* in case 'getcode' messes up */
   6065                     if (namelen <= INT_MAX &&
   6066                         ucnhash_CAPI->getcode(NULL, start, (int)namelen,
   6067                                               &ch, 0)) {
   6068                         assert(ch <= MAX_UNICODE);
   6069                         WRITE_CHAR(ch);
   6070                         continue;
   6071                     }
   6072                     message = "unknown Unicode character name";
   6073                 }
   6074             }
   6075             goto error;
   6076 
   6077         default:
   6078             if (*first_invalid_escape == NULL) {
   6079                 *first_invalid_escape = s-1; /* Back up one char, since we've
   6080                                                 already incremented s. */
   6081             }
   6082             WRITE_ASCII_CHAR('\\');
   6083             WRITE_CHAR(c);
   6084             continue;
   6085         }
   6086 
   6087       error:
   6088         endinpos = s-starts;
   6089         writer.min_length = end - s + writer.pos;
   6090         if (unicode_decode_call_errorhandler_writer(
   6091                 errors, &errorHandler,
   6092                 "unicodeescape", message,
   6093                 &starts, &end, &startinpos, &endinpos, &exc, &s,
   6094                 &writer)) {
   6095             goto onError;
   6096         }
   6097         if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
   6098             goto onError;
   6099         }
   6100 
   6101 #undef WRITE_ASCII_CHAR
   6102 #undef WRITE_CHAR
   6103     }
   6104 
   6105     Py_XDECREF(errorHandler);
   6106     Py_XDECREF(exc);
   6107     return _PyUnicodeWriter_Finish(&writer);
   6108 
   6109   onError:
   6110     _PyUnicodeWriter_Dealloc(&writer);
   6111     Py_XDECREF(errorHandler);
   6112     Py_XDECREF(exc);
   6113     return NULL;
   6114 }
   6115 
   6116 PyObject *
   6117 PyUnicode_DecodeUnicodeEscape(const char *s,
   6118                               Py_ssize_t size,
   6119                               const char *errors)
   6120 {
   6121     const char *first_invalid_escape;
   6122     PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
   6123                                                       &first_invalid_escape);
   6124     if (result == NULL)
   6125         return NULL;
   6126     if (first_invalid_escape != NULL) {
   6127         if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
   6128                              "invalid escape sequence '\\%c'",
   6129                              *first_invalid_escape) < 0) {
   6130             Py_DECREF(result);
   6131             return NULL;
   6132         }
   6133     }
   6134     return result;
   6135 }
   6136 
   6137 /* Return a Unicode-Escape string version of the Unicode object. */
   6138 
   6139 PyObject *
   6140 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
   6141 {
   6142     Py_ssize_t i, len;
   6143     PyObject *repr;
   6144     char *p;
   6145     enum PyUnicode_Kind kind;
   6146     void *data;
   6147     Py_ssize_t expandsize;
   6148 
   6149     /* Initial allocation is based on the longest-possible character
   6150        escape.
   6151 
   6152        For UCS1 strings it's '\xxx', 4 bytes per source character.
   6153        For UCS2 strings it's '\uxxxx', 6 bytes per source character.
   6154        For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
   6155     */
   6156 
   6157     if (!PyUnicode_Check(unicode)) {
   6158         PyErr_BadArgument();
   6159         return NULL;
   6160     }
   6161     if (PyUnicode_READY(unicode) == -1) {
   6162         return NULL;
   6163     }
   6164 
   6165     len = PyUnicode_GET_LENGTH(unicode);
   6166     if (len == 0) {
   6167         return PyBytes_FromStringAndSize(NULL, 0);
   6168     }
   6169 
   6170     kind = PyUnicode_KIND(unicode);
   6171     data = PyUnicode_DATA(unicode);
   6172     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
   6173        bytes, and 1 byte characters 4. */
   6174     expandsize = kind * 2 + 2;
   6175     if (len > PY_SSIZE_T_MAX / expandsize) {
   6176         return PyErr_NoMemory();
   6177     }
   6178     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
   6179     if (repr == NULL) {
   6180         return NULL;
   6181     }
   6182 
   6183     p = PyBytes_AS_STRING(repr);
   6184     for (i = 0; i < len; i++) {
   6185         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   6186 
   6187         /* U+0000-U+00ff range */
   6188         if (ch < 0x100) {
   6189             if (ch >= ' ' && ch < 127) {
   6190                 if (ch != '\\') {
   6191                     /* Copy printable US ASCII as-is */
   6192                     *p++ = (char) ch;
   6193                 }
   6194                 /* Escape backslashes */
   6195                 else {
   6196                     *p++ = '\\';
   6197                     *p++ = '\\';
   6198                 }
   6199             }
   6200 
   6201             /* Map special whitespace to '\t', \n', '\r' */
   6202             else if (ch == '\t') {
   6203                 *p++ = '\\';
   6204                 *p++ = 't';
   6205             }
   6206             else if (ch == '\n') {
   6207                 *p++ = '\\';
   6208                 *p++ = 'n';
   6209             }
   6210             else if (ch == '\r') {
   6211                 *p++ = '\\';
   6212                 *p++ = 'r';
   6213             }
   6214 
   6215             /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
   6216             else {
   6217                 *p++ = '\\';
   6218                 *p++ = 'x';
   6219                 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
   6220                 *p++ = Py_hexdigits[ch & 0x000F];
   6221             }
   6222         }
   6223         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
   6224         else if (ch < 0x10000) {
   6225             *p++ = '\\';
   6226             *p++ = 'u';
   6227             *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
   6228             *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
   6229             *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
   6230             *p++ = Py_hexdigits[ch & 0x000F];
   6231         }
   6232         /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
   6233         else {
   6234 
   6235             /* Make sure that the first two digits are zero */
   6236             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
   6237             *p++ = '\\';
   6238             *p++ = 'U';
   6239             *p++ = '0';
   6240             *p++ = '0';
   6241             *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
   6242             *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
   6243             *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
   6244             *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
   6245             *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
   6246             *p++ = Py_hexdigits[ch & 0x0000000F];
   6247         }
   6248     }
   6249 
   6250     assert(p - PyBytes_AS_STRING(repr) > 0);
   6251     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
   6252         return NULL;
   6253     }
   6254     return repr;
   6255 }
   6256 
   6257 PyObject *
   6258 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
   6259                               Py_ssize_t size)
   6260 {
   6261     PyObject *result;
   6262     PyObject *tmp = PyUnicode_FromUnicode(s, size);
   6263     if (tmp == NULL) {
   6264         return NULL;
   6265     }
   6266 
   6267     result = PyUnicode_AsUnicodeEscapeString(tmp);
   6268     Py_DECREF(tmp);
   6269     return result;
   6270 }
   6271 
   6272 /* --- Raw Unicode Escape Codec ------------------------------------------- */
   6273 
   6274 PyObject *
   6275 PyUnicode_DecodeRawUnicodeEscape(const char *s,
   6276                                  Py_ssize_t size,
   6277                                  const char *errors)
   6278 {
   6279     const char *starts = s;
   6280     _PyUnicodeWriter writer;
   6281     const char *end;
   6282     PyObject *errorHandler = NULL;
   6283     PyObject *exc = NULL;
   6284 
   6285     if (size == 0) {
   6286         _Py_RETURN_UNICODE_EMPTY();
   6287     }
   6288 
   6289     /* Escaped strings will always be longer than the resulting
   6290        Unicode string, so we start with size here and then reduce the
   6291        length after conversion to the true value. (But decoding error
   6292        handler might have to resize the string) */
   6293     _PyUnicodeWriter_Init(&writer);
   6294      writer.min_length = size;
   6295     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
   6296         goto onError;
   6297     }
   6298 
   6299     end = s + size;
   6300     while (s < end) {
   6301         unsigned char c = (unsigned char) *s++;
   6302         Py_UCS4 ch;
   6303         int count;
   6304         Py_ssize_t startinpos;
   6305         Py_ssize_t endinpos;
   6306         const char *message;
   6307 
   6308 #define WRITE_CHAR(ch)                                                        \
   6309             do {                                                              \
   6310                 if (ch <= writer.maxchar) {                                   \
   6311                     assert(writer.pos < writer.size);                         \
   6312                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
   6313                 }                                                             \
   6314                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
   6315                     goto onError;                                             \
   6316                 }                                                             \
   6317             } while(0)
   6318 
   6319         /* Non-escape characters are interpreted as Unicode ordinals */
   6320         if (c != '\\' || s >= end) {
   6321             WRITE_CHAR(c);
   6322             continue;
   6323         }
   6324 
   6325         c = (unsigned char) *s++;
   6326         if (c == 'u') {
   6327             count = 4;
   6328             message = "truncated \\uXXXX escape";
   6329         }
   6330         else if (c == 'U') {
   6331             count = 8;
   6332             message = "truncated \\UXXXXXXXX escape";
   6333         }
   6334         else {
   6335             assert(writer.pos < writer.size);
   6336             PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
   6337             WRITE_CHAR(c);
   6338             continue;
   6339         }
   6340         startinpos = s - starts - 2;
   6341 
   6342         /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
   6343         for (ch = 0; count && s < end; ++s, --count) {
   6344             c = (unsigned char)*s;
   6345             ch <<= 4;
   6346             if (c >= '0' && c <= '9') {
   6347                 ch += c - '0';
   6348             }
   6349             else if (c >= 'a' && c <= 'f') {
   6350                 ch += c - ('a' - 10);
   6351             }
   6352             else if (c >= 'A' && c <= 'F') {
   6353                 ch += c - ('A' - 10);
   6354             }
   6355             else {
   6356                 break;
   6357             }
   6358         }
   6359         if (!count) {
   6360             if (ch <= MAX_UNICODE) {
   6361                 WRITE_CHAR(ch);
   6362                 continue;
   6363             }
   6364             message = "\\Uxxxxxxxx out of range";
   6365         }
   6366 
   6367         endinpos = s-starts;
   6368         writer.min_length = end - s + writer.pos;
   6369         if (unicode_decode_call_errorhandler_writer(
   6370                 errors, &errorHandler,
   6371                 "rawunicodeescape", message,
   6372                 &starts, &end, &startinpos, &endinpos, &exc, &s,
   6373                 &writer)) {
   6374             goto onError;
   6375         }
   6376         if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
   6377             goto onError;
   6378         }
   6379 
   6380 #undef WRITE_CHAR
   6381     }
   6382     Py_XDECREF(errorHandler);
   6383     Py_XDECREF(exc);
   6384     return _PyUnicodeWriter_Finish(&writer);
   6385 
   6386   onError:
   6387     _PyUnicodeWriter_Dealloc(&writer);
   6388     Py_XDECREF(errorHandler);
   6389     Py_XDECREF(exc);
   6390     return NULL;
   6391 
   6392 }
   6393 
   6394 
   6395 PyObject *
   6396 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
   6397 {
   6398     PyObject *repr;
   6399     char *p;
   6400     Py_ssize_t expandsize, pos;
   6401     int kind;
   6402     void *data;
   6403     Py_ssize_t len;
   6404 
   6405     if (!PyUnicode_Check(unicode)) {
   6406         PyErr_BadArgument();
   6407         return NULL;
   6408     }
   6409     if (PyUnicode_READY(unicode) == -1) {
   6410         return NULL;
   6411     }
   6412     kind = PyUnicode_KIND(unicode);
   6413     data = PyUnicode_DATA(unicode);
   6414     len = PyUnicode_GET_LENGTH(unicode);
   6415     if (kind == PyUnicode_1BYTE_KIND) {
   6416         return PyBytes_FromStringAndSize(data, len);
   6417     }
   6418 
   6419     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
   6420        bytes, and 1 byte characters 4. */
   6421     expandsize = kind * 2 + 2;
   6422 
   6423     if (len > PY_SSIZE_T_MAX / expandsize) {
   6424         return PyErr_NoMemory();
   6425     }
   6426     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
   6427     if (repr == NULL) {
   6428         return NULL;
   6429     }
   6430     if (len == 0) {
   6431         return repr;
   6432     }
   6433 
   6434     p = PyBytes_AS_STRING(repr);
   6435     for (pos = 0; pos < len; pos++) {
   6436         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
   6437 
   6438         /* U+0000-U+00ff range: Copy 8-bit characters as-is */
   6439         if (ch < 0x100) {
   6440             *p++ = (char) ch;
   6441         }
   6442         /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
   6443         else if (ch < 0x10000) {
   6444             *p++ = '\\';
   6445             *p++ = 'u';
   6446             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
   6447             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
   6448             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
   6449             *p++ = Py_hexdigits[ch & 15];
   6450         }
   6451         /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
   6452         else {
   6453             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
   6454             *p++ = '\\';
   6455             *p++ = 'U';
   6456             *p++ = '0';
   6457             *p++ = '0';
   6458             *p++ = Py_hexdigits[(ch >> 20) & 0xf];
   6459             *p++ = Py_hexdigits[(ch >> 16) & 0xf];
   6460             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
   6461             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
   6462             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
   6463             *p++ = Py_hexdigits[ch & 15];
   6464         }
   6465     }
   6466 
   6467     assert(p > PyBytes_AS_STRING(repr));
   6468     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
   6469         return NULL;
   6470     }
   6471     return repr;
   6472 }
   6473 
   6474 PyObject *
   6475 PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
   6476                                  Py_ssize_t size)
   6477 {
   6478     PyObject *result;
   6479     PyObject *tmp = PyUnicode_FromUnicode(s, size);
   6480     if (tmp == NULL)
   6481         return NULL;
   6482     result = PyUnicode_AsRawUnicodeEscapeString(tmp);
   6483     Py_DECREF(tmp);
   6484     return result;
   6485 }
   6486 
   6487 /* --- Unicode Internal Codec ------------------------------------------- */
   6488 
   6489 PyObject *
   6490 _PyUnicode_DecodeUnicodeInternal(const char *s,
   6491                                  Py_ssize_t size,
   6492                                  const char *errors)
   6493 {
   6494     const char *starts = s;
   6495     Py_ssize_t startinpos;
   6496     Py_ssize_t endinpos;
   6497     _PyUnicodeWriter writer;
   6498     const char *end;
   6499     const char *reason;
   6500     PyObject *errorHandler = NULL;
   6501     PyObject *exc = NULL;
   6502 
   6503     if (PyErr_WarnEx(PyExc_DeprecationWarning,
   6504                      "unicode_internal codec has been deprecated",
   6505                      1))
   6506         return NULL;
   6507 
   6508     if (size == 0)
   6509         _Py_RETURN_UNICODE_EMPTY();
   6510 
   6511     _PyUnicodeWriter_Init(&writer);
   6512     if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
   6513         PyErr_NoMemory();
   6514         goto onError;
   6515     }
   6516     writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
   6517 
   6518     end = s + size;
   6519     while (s < end) {
   6520         Py_UNICODE uch;
   6521         Py_UCS4 ch;
   6522         if (end - s < Py_UNICODE_SIZE) {
   6523             endinpos = end-starts;
   6524             reason = "truncated input";
   6525             goto error;
   6526         }
   6527         /* We copy the raw representation one byte at a time because the
   6528            pointer may be unaligned (see test_codeccallbacks). */
   6529         ((char *) &uch)[0] = s[0];
   6530         ((char *) &uch)[1] = s[1];
   6531 #ifdef Py_UNICODE_WIDE
   6532         ((char *) &uch)[2] = s[2];
   6533         ((char *) &uch)[3] = s[3];
   6534 #endif
   6535         ch = uch;
   6536 #ifdef Py_UNICODE_WIDE
   6537         /* We have to sanity check the raw data, otherwise doom looms for
   6538            some malformed UCS-4 data. */
   6539         if (ch > 0x10ffff) {
   6540             endinpos = s - starts + Py_UNICODE_SIZE;
   6541             reason = "illegal code point (> 0x10FFFF)";
   6542             goto error;
   6543         }
   6544 #endif
   6545         s += Py_UNICODE_SIZE;
   6546 #ifndef Py_UNICODE_WIDE
   6547         if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
   6548         {
   6549             Py_UNICODE uch2;
   6550             ((char *) &uch2)[0] = s[0];
   6551             ((char *) &uch2)[1] = s[1];
   6552             if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
   6553             {
   6554                 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
   6555                 s += Py_UNICODE_SIZE;
   6556             }
   6557         }
   6558 #endif
   6559 
   6560         if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
   6561             goto onError;
   6562         continue;
   6563 
   6564   error:
   6565         startinpos = s - starts;
   6566         if (unicode_decode_call_errorhandler_writer(
   6567                 errors, &errorHandler,
   6568                 "unicode_internal", reason,
   6569                 &starts, &end, &startinpos, &endinpos, &exc, &s,
   6570                 &writer))
   6571             goto onError;
   6572     }
   6573 
   6574     Py_XDECREF(errorHandler);
   6575     Py_XDECREF(exc);
   6576     return _PyUnicodeWriter_Finish(&writer);
   6577 
   6578   onError:
   6579     _PyUnicodeWriter_Dealloc(&writer);
   6580     Py_XDECREF(errorHandler);
   6581     Py_XDECREF(exc);
   6582     return NULL;
   6583 }
   6584 
   6585 /* --- Latin-1 Codec ------------------------------------------------------ */
   6586 
   6587 PyObject *
   6588 PyUnicode_DecodeLatin1(const char *s,
   6589                        Py_ssize_t size,
   6590                        const char *errors)
   6591 {
   6592     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
   6593     return _PyUnicode_FromUCS1((unsigned char*)s, size);
   6594 }
   6595 
   6596 /* create or adjust a UnicodeEncodeError */
   6597 static void
   6598 make_encode_exception(PyObject **exceptionObject,
   6599                       const char *encoding,
   6600                       PyObject *unicode,
   6601                       Py_ssize_t startpos, Py_ssize_t endpos,
   6602                       const char *reason)
   6603 {
   6604     if (*exceptionObject == NULL) {
   6605         *exceptionObject = PyObject_CallFunction(
   6606             PyExc_UnicodeEncodeError, "sOnns",
   6607             encoding, unicode, startpos, endpos, reason);
   6608     }
   6609     else {
   6610         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
   6611             goto onError;
   6612         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
   6613             goto onError;
   6614         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
   6615             goto onError;
   6616         return;
   6617       onError:
   6618         Py_CLEAR(*exceptionObject);
   6619     }
   6620 }
   6621 
   6622 /* raises a UnicodeEncodeError */
   6623 static void
   6624 raise_encode_exception(PyObject **exceptionObject,
   6625                        const char *encoding,
   6626                        PyObject *unicode,
   6627                        Py_ssize_t startpos, Py_ssize_t endpos,
   6628                        const char *reason)
   6629 {
   6630     make_encode_exception(exceptionObject,
   6631                           encoding, unicode, startpos, endpos, reason);
   6632     if (*exceptionObject != NULL)
   6633         PyCodec_StrictErrors(*exceptionObject);
   6634 }
   6635 
   6636 /* error handling callback helper:
   6637    build arguments, call the callback and check the arguments,
   6638    put the result into newpos and return the replacement string, which
   6639    has to be freed by the caller */
   6640 static PyObject *
   6641 unicode_encode_call_errorhandler(const char *errors,
   6642                                  PyObject **errorHandler,
   6643                                  const char *encoding, const char *reason,
   6644                                  PyObject *unicode, PyObject **exceptionObject,
   6645                                  Py_ssize_t startpos, Py_ssize_t endpos,
   6646                                  Py_ssize_t *newpos)
   6647 {
   6648     static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
   6649     Py_ssize_t len;
   6650     PyObject *restuple;
   6651     PyObject *resunicode;
   6652 
   6653     if (*errorHandler == NULL) {
   6654         *errorHandler = PyCodec_LookupError(errors);
   6655         if (*errorHandler == NULL)
   6656             return NULL;
   6657     }
   6658 
   6659     if (PyUnicode_READY(unicode) == -1)
   6660         return NULL;
   6661     len = PyUnicode_GET_LENGTH(unicode);
   6662 
   6663     make_encode_exception(exceptionObject,
   6664                           encoding, unicode, startpos, endpos, reason);
   6665     if (*exceptionObject == NULL)
   6666         return NULL;
   6667 
   6668     restuple = PyObject_CallFunctionObjArgs(
   6669         *errorHandler, *exceptionObject, NULL);
   6670     if (restuple == NULL)
   6671         return NULL;
   6672     if (!PyTuple_Check(restuple)) {
   6673         PyErr_SetString(PyExc_TypeError, &argparse[3]);
   6674         Py_DECREF(restuple);
   6675         return NULL;
   6676     }
   6677     if (!PyArg_ParseTuple(restuple, argparse,
   6678                           &resunicode, newpos)) {
   6679         Py_DECREF(restuple);
   6680         return NULL;
   6681     }
   6682     if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
   6683         PyErr_SetString(PyExc_TypeError, &argparse[3]);
   6684         Py_DECREF(restuple);
   6685         return NULL;
   6686     }
   6687     if (*newpos<0)
   6688         *newpos = len + *newpos;
   6689     if (*newpos<0 || *newpos>len) {
   6690         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
   6691         Py_DECREF(restuple);
   6692         return NULL;
   6693     }
   6694     Py_INCREF(resunicode);
   6695     Py_DECREF(restuple);
   6696     return resunicode;
   6697 }
   6698 
   6699 static PyObject *
   6700 unicode_encode_ucs1(PyObject *unicode,
   6701                     const char *errors,
   6702                     const Py_UCS4 limit)
   6703 {
   6704     /* input state */
   6705     Py_ssize_t pos=0, size;
   6706     int kind;
   6707     void *data;
   6708     /* pointer into the output */
   6709     char *str;
   6710     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
   6711     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
   6712     PyObject *error_handler_obj = NULL;
   6713     PyObject *exc = NULL;
   6714     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
   6715     PyObject *rep = NULL;
   6716     /* output object */
   6717     _PyBytesWriter writer;
   6718 
   6719     if (PyUnicode_READY(unicode) == -1)
   6720         return NULL;
   6721     size = PyUnicode_GET_LENGTH(unicode);
   6722     kind = PyUnicode_KIND(unicode);
   6723     data = PyUnicode_DATA(unicode);
   6724     /* allocate enough for a simple encoding without
   6725        replacements, if we need more, we'll resize */
   6726     if (size == 0)
   6727         return PyBytes_FromStringAndSize(NULL, 0);
   6728 
   6729     _PyBytesWriter_Init(&writer);
   6730     str = _PyBytesWriter_Alloc(&writer, size);
   6731     if (str == NULL)
   6732         return NULL;
   6733 
   6734     while (pos < size) {
   6735         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
   6736 
   6737         /* can we encode this? */
   6738         if (ch < limit) {
   6739             /* no overflow check, because we know that the space is enough */
   6740             *str++ = (char)ch;
   6741             ++pos;
   6742         }
   6743         else {
   6744             Py_ssize_t newpos, i;
   6745             /* startpos for collecting unencodable chars */
   6746             Py_ssize_t collstart = pos;
   6747             Py_ssize_t collend = collstart + 1;
   6748             /* find all unecodable characters */
   6749 
   6750             while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
   6751                 ++collend;
   6752 
   6753             /* Only overallocate the buffer if it's not the last write */
   6754             writer.overallocate = (collend < size);
   6755 
   6756             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
   6757             if (error_handler == _Py_ERROR_UNKNOWN)
   6758                 error_handler = get_error_handler(errors);
   6759 
   6760             switch (error_handler) {
   6761             case _Py_ERROR_STRICT:
   6762                 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
   6763                 goto onError;
   6764 
   6765             case _Py_ERROR_REPLACE:
   6766                 memset(str, '?', collend - collstart);
   6767                 str += (collend - collstart);
   6768                 /* fall through ignore error handler */
   6769             case _Py_ERROR_IGNORE:
   6770                 pos = collend;
   6771                 break;
   6772 
   6773             case _Py_ERROR_BACKSLASHREPLACE:
   6774                 /* subtract preallocated bytes */
   6775                 writer.min_size -= (collend - collstart);
   6776                 str = backslashreplace(&writer, str,
   6777                                        unicode, collstart, collend);
   6778                 if (str == NULL)
   6779                     goto onError;
   6780                 pos = collend;
   6781                 break;
   6782 
   6783             case _Py_ERROR_XMLCHARREFREPLACE:
   6784                 /* subtract preallocated bytes */
   6785                 writer.min_size -= (collend - collstart);
   6786                 str = xmlcharrefreplace(&writer, str,
   6787                                         unicode, collstart, collend);
   6788                 if (str == NULL)
   6789                     goto onError;
   6790                 pos = collend;
   6791                 break;
   6792 
   6793             case _Py_ERROR_SURROGATEESCAPE:
   6794                 for (i = collstart; i < collend; ++i) {
   6795                     ch = PyUnicode_READ(kind, data, i);
   6796                     if (ch < 0xdc80 || 0xdcff < ch) {
   6797                         /* Not a UTF-8b surrogate */
   6798                         break;
   6799                     }
   6800                     *str++ = (char)(ch - 0xdc00);
   6801                     ++pos;
   6802                 }
   6803                 if (i >= collend)
   6804                     break;
   6805                 collstart = pos;
   6806                 assert(collstart != collend);
   6807                 /* fallback to general error handling */
   6808 
   6809             default:
   6810                 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
   6811                                                        encoding, reason, unicode, &exc,
   6812                                                        collstart, collend, &newpos);
   6813                 if (rep == NULL)
   6814                     goto onError;
   6815 
   6816                 /* subtract preallocated bytes */
   6817                 writer.min_size -= 1;
   6818 
   6819                 if (PyBytes_Check(rep)) {
   6820                     /* Directly copy bytes result to output. */
   6821                     str = _PyBytesWriter_WriteBytes(&writer, str,
   6822                                                     PyBytes_AS_STRING(rep),
   6823                                                     PyBytes_GET_SIZE(rep));
   6824                     if (str == NULL)
   6825                         goto onError;
   6826                 }
   6827                 else {
   6828                     assert(PyUnicode_Check(rep));
   6829 
   6830                     if (PyUnicode_READY(rep) < 0)
   6831                         goto onError;
   6832 
   6833                     if (PyUnicode_IS_ASCII(rep)) {
   6834                         /* Fast path: all characters are smaller than limit */
   6835                         assert(limit >= 128);
   6836                         assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
   6837                         str = _PyBytesWriter_WriteBytes(&writer, str,
   6838                                                         PyUnicode_DATA(rep),
   6839                                                         PyUnicode_GET_LENGTH(rep));
   6840                     }
   6841                     else {
   6842                         Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
   6843 
   6844                         str = _PyBytesWriter_Prepare(&writer, str, repsize);
   6845                         if (str == NULL)
   6846                             goto onError;
   6847 
   6848                         /* check if there is anything unencodable in the
   6849                            replacement and copy it to the output */
   6850                         for (i = 0; repsize-->0; ++i, ++str) {
   6851                             ch = PyUnicode_READ_CHAR(rep, i);
   6852                             if (ch >= limit) {
   6853                                 raise_encode_exception(&exc, encoding, unicode,
   6854                                                        pos, pos+1, reason);
   6855                                 goto onError;
   6856                             }
   6857                             *str = (char)ch;
   6858                         }
   6859                     }
   6860                 }
   6861                 pos = newpos;
   6862                 Py_CLEAR(rep);
   6863             }
   6864 
   6865             /* If overallocation was disabled, ensure that it was the last
   6866                write. Otherwise, we missed an optimization */
   6867             assert(writer.overallocate || pos == size);
   6868         }
   6869     }
   6870 
   6871     Py_XDECREF(error_handler_obj);
   6872     Py_XDECREF(exc);
   6873     return _PyBytesWriter_Finish(&writer, str);
   6874 
   6875   onError:
   6876     Py_XDECREF(rep);
   6877     _PyBytesWriter_Dealloc(&writer);
   6878     Py_XDECREF(error_handler_obj);
   6879     Py_XDECREF(exc);
   6880     return NULL;
   6881 }
   6882 
   6883 /* Deprecated */
   6884 PyObject *
   6885 PyUnicode_EncodeLatin1(const Py_UNICODE *p,
   6886                        Py_ssize_t size,
   6887                        const char *errors)
   6888 {
   6889     PyObject *result;
   6890     PyObject *unicode = PyUnicode_FromUnicode(p, size);
   6891     if (unicode == NULL)
   6892         return NULL;
   6893     result = unicode_encode_ucs1(unicode, errors, 256);
   6894     Py_DECREF(unicode);
   6895     return result;
   6896 }
   6897 
   6898 PyObject *
   6899 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
   6900 {
   6901     if (!PyUnicode_Check(unicode)) {
   6902         PyErr_BadArgument();
   6903         return NULL;
   6904     }
   6905     if (PyUnicode_READY(unicode) == -1)
   6906         return NULL;
   6907     /* Fast path: if it is a one-byte string, construct
   6908        bytes object directly. */
   6909     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
   6910         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
   6911                                          PyUnicode_GET_LENGTH(unicode));
   6912     /* Non-Latin-1 characters present. Defer to above function to
   6913        raise the exception. */
   6914     return unicode_encode_ucs1(unicode, errors, 256);
   6915 }
   6916 
   6917 PyObject*
   6918 PyUnicode_AsLatin1String(PyObject *unicode)
   6919 {
   6920     return _PyUnicode_AsLatin1String(unicode, NULL);
   6921 }
   6922 
   6923 /* --- 7-bit ASCII Codec -------------------------------------------------- */
   6924 
   6925 PyObject *
   6926 PyUnicode_DecodeASCII(const char *s,
   6927                       Py_ssize_t size,
   6928                       const char *errors)
   6929 {
   6930     const char *starts = s;
   6931     _PyUnicodeWriter writer;
   6932     int kind;
   6933     void *data;
   6934     Py_ssize_t startinpos;
   6935     Py_ssize_t endinpos;
   6936     Py_ssize_t outpos;
   6937     const char *e;
   6938     PyObject *error_handler_obj = NULL;
   6939     PyObject *exc = NULL;
   6940     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
   6941 
   6942     if (size == 0)
   6943         _Py_RETURN_UNICODE_EMPTY();
   6944 
   6945     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
   6946     if (size == 1 && (unsigned char)s[0] < 128)
   6947         return get_latin1_char((unsigned char)s[0]);
   6948 
   6949     _PyUnicodeWriter_Init(&writer);
   6950     writer.min_length = size;
   6951     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
   6952         return NULL;
   6953 
   6954     e = s + size;
   6955     data = writer.data;
   6956     outpos = ascii_decode(s, e, (Py_UCS1 *)data);
   6957     writer.pos = outpos;
   6958     if (writer.pos == size)
   6959         return _PyUnicodeWriter_Finish(&writer);
   6960 
   6961     s += writer.pos;
   6962     kind = writer.kind;
   6963     while (s < e) {
   6964         unsigned char c = (unsigned char)*s;
   6965         if (c < 128) {
   6966             PyUnicode_WRITE(kind, data, writer.pos, c);
   6967             writer.pos++;
   6968             ++s;
   6969             continue;
   6970         }
   6971 
   6972         /* byte outsize range 0x00..0x7f: call the error handler */
   6973 
   6974         if (error_handler == _Py_ERROR_UNKNOWN)
   6975             error_handler = get_error_handler(errors);
   6976 
   6977         switch (error_handler)
   6978         {
   6979         case _Py_ERROR_REPLACE:
   6980         case _Py_ERROR_SURROGATEESCAPE:
   6981             /* Fast-path: the error handler only writes one character,
   6982                but we may switch to UCS2 at the first write */
   6983             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
   6984                 goto onError;
   6985             kind = writer.kind;
   6986             data = writer.data;
   6987 
   6988             if (error_handler == _Py_ERROR_REPLACE)
   6989                 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
   6990             else
   6991                 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
   6992             writer.pos++;
   6993             ++s;
   6994             break;
   6995 
   6996         case _Py_ERROR_IGNORE:
   6997             ++s;
   6998             break;
   6999 
   7000         default:
   7001             startinpos = s-starts;
   7002             endinpos = startinpos + 1;
   7003             if (unicode_decode_call_errorhandler_writer(
   7004                     errors, &error_handler_obj,
   7005                     "ascii", "ordinal not in range(128)",
   7006                     &starts, &e, &startinpos, &endinpos, &exc, &s,
   7007                     &writer))
   7008                 goto onError;
   7009             kind = writer.kind;
   7010             data = writer.data;
   7011         }
   7012     }
   7013     Py_XDECREF(error_handler_obj);
   7014     Py_XDECREF(exc);
   7015     return _PyUnicodeWriter_Finish(&writer);
   7016 
   7017   onError:
   7018     _PyUnicodeWriter_Dealloc(&writer);
   7019     Py_XDECREF(error_handler_obj);
   7020     Py_XDECREF(exc);
   7021     return NULL;
   7022 }
   7023 
   7024 /* Deprecated */
   7025 PyObject *
   7026 PyUnicode_EncodeASCII(const Py_UNICODE *p,
   7027                       Py_ssize_t size,
   7028                       const char *errors)
   7029 {
   7030     PyObject *result;
   7031     PyObject *unicode = PyUnicode_FromUnicode(p, size);
   7032     if (unicode == NULL)
   7033         return NULL;
   7034     result = unicode_encode_ucs1(unicode, errors, 128);
   7035     Py_DECREF(unicode);
   7036     return result;
   7037 }
   7038 
   7039 PyObject *
   7040 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
   7041 {
   7042     if (!PyUnicode_Check(unicode)) {
   7043         PyErr_BadArgument();
   7044         return NULL;
   7045     }
   7046     if (PyUnicode_READY(unicode) == -1)
   7047         return NULL;
   7048     /* Fast path: if it is an ASCII-only string, construct bytes object
   7049        directly. Else defer to above function to raise the exception. */
   7050     if (PyUnicode_IS_ASCII(unicode))
   7051         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
   7052                                          PyUnicode_GET_LENGTH(unicode));
   7053     return unicode_encode_ucs1(unicode, errors, 128);
   7054 }
   7055 
   7056 PyObject *
   7057 PyUnicode_AsASCIIString(PyObject *unicode)
   7058 {
   7059     return _PyUnicode_AsASCIIString(unicode, NULL);
   7060 }
   7061 
   7062 #ifdef MS_WINDOWS
   7063 
   7064 /* --- MBCS codecs for Windows -------------------------------------------- */
   7065 
/* NEED_RETRY is defined when int is narrower than size_t.  The stateful
   code page decoder in this file tests it to split inputs larger than
   INT_MAX into INT_MAX-sized chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* Provide WC_ERR_INVALID_CHARS when it is not already defined. */
#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
   7073 
   7074 static const char*
   7075 code_page_name(UINT code_page, PyObject **obj)
   7076 {
   7077     *obj = NULL;
   7078     if (code_page == CP_ACP)
   7079         return "mbcs";
   7080     if (code_page == CP_UTF7)
   7081         return "CP_UTF7";
   7082     if (code_page == CP_UTF8)
   7083         return "CP_UTF8";
   7084 
   7085     *obj = PyBytes_FromFormat("cp%u", code_page);
   7086     if (*obj == NULL)
   7087         return NULL;
   7088     return PyBytes_AS_STRING(*obj);
   7089 }
   7090 
   7091 static DWORD
   7092 decode_code_page_flags(UINT code_page)
   7093 {
   7094     if (code_page == CP_UTF7) {
   7095         /* The CP_UTF7 decoder only supports flags=0 */
   7096         return 0;
   7097     }
   7098     else
   7099         return MB_ERR_INVALID_CHARS;
   7100 }
   7101 
/*
 * Decode a byte string from a Windows code page into a unicode object in
 * strict mode.
 *
 * Returns the consumed size on success, returns -2 on a decode error, or
 * raises an OSError and returns -1 on any other error.
 */
   7109 static int
   7110 decode_code_page_strict(UINT code_page,
   7111                         PyObject **v,
   7112                         const char *in,
   7113                         int insize)
   7114 {
   7115     const DWORD flags = decode_code_page_flags(code_page);
   7116     wchar_t *out;
   7117     DWORD outsize;
   7118 
   7119     /* First get the size of the result */
   7120     assert(insize > 0);
   7121     outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
   7122     if (outsize <= 0)
   7123         goto error;
   7124 
   7125     if (*v == NULL) {
   7126         /* Create unicode object */
   7127         /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
   7128         *v = (PyObject*)_PyUnicode_New(outsize);
   7129         if (*v == NULL)
   7130             return -1;
   7131         out = PyUnicode_AS_UNICODE(*v);
   7132     }
   7133     else {
   7134         /* Extend unicode object */
   7135         Py_ssize_t n = PyUnicode_GET_SIZE(*v);
   7136         if (unicode_resize(v, n + outsize) < 0)
   7137             return -1;
   7138         out = PyUnicode_AS_UNICODE(*v) + n;
   7139     }
   7140 
   7141     /* Do the conversion */
   7142     outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
   7143     if (outsize <= 0)
   7144         goto error;
   7145     return insize;
   7146 
   7147 error:
   7148     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
   7149         return -2;
   7150     PyErr_SetFromWindowsErr(0);
   7151     return -1;
   7152 }
   7153 
/*
 * Decode a byte string from a code page into a unicode object using an
 * error handler.
 *
 * Returns the consumed size on success, or raises an OSError or
 * UnicodeDecodeError exception and returns -1 on error.
 */
   7161 static int
   7162 decode_code_page_errors(UINT code_page,
   7163                         PyObject **v,
   7164                         const char *in, const int size,
   7165                         const char *errors, int final)
   7166 {
   7167     const char *startin = in;
   7168     const char *endin = in + size;
   7169     const DWORD flags = decode_code_page_flags(code_page);
   7170     /* Ideally, we should get reason from FormatMessage. This is the Windows
   7171        2000 English version of the message. */
   7172     const char *reason = "No mapping for the Unicode character exists "
   7173                          "in the target code page.";
   7174     /* each step cannot decode more than 1 character, but a character can be
   7175        represented as a surrogate pair */
   7176     wchar_t buffer[2], *startout, *out;
   7177     int insize;
   7178     Py_ssize_t outsize;
   7179     PyObject *errorHandler = NULL;
   7180     PyObject *exc = NULL;
   7181     PyObject *encoding_obj = NULL;
   7182     const char *encoding;
   7183     DWORD err;
   7184     int ret = -1;
   7185 
   7186     assert(size > 0);
   7187 
   7188     encoding = code_page_name(code_page, &encoding_obj);
   7189     if (encoding == NULL)
   7190         return -1;
   7191 
   7192     if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
   7193         /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
   7194            UnicodeDecodeError. */
   7195         make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
   7196         if (exc != NULL) {
   7197             PyCodec_StrictErrors(exc);
   7198             Py_CLEAR(exc);
   7199         }
   7200         goto error;
   7201     }
   7202 
   7203     if (*v == NULL) {
   7204         /* Create unicode object */
   7205         if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
   7206             PyErr_NoMemory();
   7207             goto error;
   7208         }
   7209         /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
   7210         *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
   7211         if (*v == NULL)
   7212             goto error;
   7213         startout = PyUnicode_AS_UNICODE(*v);
   7214     }
   7215     else {
   7216         /* Extend unicode object */
   7217         Py_ssize_t n = PyUnicode_GET_SIZE(*v);
   7218         if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
   7219             PyErr_NoMemory();
   7220             goto error;
   7221         }
   7222         if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
   7223             goto error;
   7224         startout = PyUnicode_AS_UNICODE(*v) + n;
   7225     }
   7226 
   7227     /* Decode the byte string character per character */
   7228     out = startout;
   7229     while (in < endin)
   7230     {
   7231         /* Decode a character */
   7232         insize = 1;
   7233         do
   7234         {
   7235             outsize = MultiByteToWideChar(code_page, flags,
   7236                                           in, insize,
   7237                                           buffer, Py_ARRAY_LENGTH(buffer));
   7238             if (outsize > 0)
   7239                 break;
   7240             err = GetLastError();
   7241             if (err != ERROR_NO_UNICODE_TRANSLATION
   7242                 && err != ERROR_INSUFFICIENT_BUFFER)
   7243             {
   7244                 PyErr_SetFromWindowsErr(0);
   7245                 goto error;
   7246             }
   7247             insize++;
   7248         }
   7249         /* 4=maximum length of a UTF-8 sequence */
   7250         while (insize <= 4 && (in + insize) <= endin);
   7251 
   7252         if (outsize <= 0) {
   7253             Py_ssize_t startinpos, endinpos, outpos;
   7254 
   7255             /* last character in partial decode? */
   7256             if (in + insize >= endin && !final)
   7257                 break;
   7258 
   7259             startinpos = in - startin;
   7260             endinpos = startinpos + 1;
   7261             outpos = out - PyUnicode_AS_UNICODE(*v);
   7262             if (unicode_decode_call_errorhandler_wchar(
   7263                     errors, &errorHandler,
   7264                     encoding, reason,
   7265                     &startin, &endin, &startinpos, &endinpos, &exc, &in,
   7266                     v, &outpos))
   7267             {
   7268                 goto error;
   7269             }
   7270             out = PyUnicode_AS_UNICODE(*v) + outpos;
   7271         }
   7272         else {
   7273             in += insize;
   7274             memcpy(out, buffer, outsize * sizeof(wchar_t));
   7275             out += outsize;
   7276         }
   7277     }
   7278 
   7279     /* write a NUL character at the end */
   7280     *out = 0;
   7281 
   7282     /* Extend unicode object */
   7283     outsize = out - startout;
   7284     assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
   7285     if (unicode_resize(v, outsize) < 0)
   7286         goto error;
   7287     /* (in - startin) <= size and size is an int */
   7288     ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
   7289 
   7290 error:
   7291     Py_XDECREF(encoding_obj);
   7292     Py_XDECREF(errorHandler);
   7293     Py_XDECREF(exc);
   7294     return ret;
   7295 }
   7296 
   7297 static PyObject *
   7298 decode_code_page_stateful(int code_page,
   7299                           const char *s, Py_ssize_t size,
   7300                           const char *errors, Py_ssize_t *consumed)
   7301 {
   7302     PyObject *v = NULL;
   7303     int chunk_size, final, converted, done;
   7304 
   7305     if (code_page < 0) {
   7306         PyErr_SetString(PyExc_ValueError, "invalid code page number");
   7307         return NULL;
   7308     }
   7309 
   7310     if (consumed)
   7311         *consumed = 0;
   7312 
   7313     do
   7314     {
   7315 #ifdef NEED_RETRY
   7316         if (size > INT_MAX) {
   7317             chunk_size = INT_MAX;
   7318             final = 0;
   7319             done = 0;
   7320         }
   7321         else
   7322 #endif
   7323         {
   7324             chunk_size = (int)size;
   7325             final = (consumed == NULL);
   7326             done = 1;
   7327         }
   7328 
   7329         if (chunk_size == 0 && done) {
   7330             if (v != NULL)
   7331                 break;
   7332             _Py_RETURN_UNICODE_EMPTY();
   7333         }
   7334 
   7335         converted = decode_code_page_strict(code_page, &v,
   7336                                             s, chunk_size);
   7337         if (converted == -2)
   7338             converted = decode_code_page_errors(code_page, &v,
   7339                                                 s, chunk_size,
   7340                                                 errors, final);
   7341         assert(converted != 0 || done);
   7342 
   7343         if (converted < 0) {
   7344             Py_XDECREF(v);
   7345             return NULL;
   7346         }
   7347 
   7348         if (consumed)
   7349             *consumed += converted;
   7350 
   7351         s += converted;
   7352         size -= converted;
   7353     } while (!done);
   7354 
   7355     return unicode_result(v);
   7356 }
   7357 
   7358 PyObject *
   7359 PyUnicode_DecodeCodePageStateful(int code_page,
   7360                                  const char *s,
   7361                                  Py_ssize_t size,
   7362                                  const char *errors,
   7363                                  Py_ssize_t *consumed)
   7364 {
   7365     return decode_code_page_stateful(code_page, s, size, errors, consumed);
   7366 }
   7367 
   7368 PyObject *
   7369 PyUnicode_DecodeMBCSStateful(const char *s,
   7370                              Py_ssize_t size,
   7371                              const char *errors,
   7372                              Py_ssize_t *consumed)
   7373 {
   7374     return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
   7375 }
   7376 
   7377 PyObject *
   7378 PyUnicode_DecodeMBCS(const char *s,
   7379                      Py_ssize_t size,
   7380                      const char *errors)
   7381 {
   7382     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
   7383 }
   7384 
   7385 static DWORD
   7386 encode_code_page_flags(UINT code_page, const char *errors)
   7387 {
   7388     if (code_page == CP_UTF8) {
   7389         return WC_ERR_INVALID_CHARS;
   7390     }
   7391     else if (code_page == CP_UTF7) {
   7392         /* CP_UTF7 only supports flags=0 */
   7393         return 0;
   7394     }
   7395     else {
   7396         if (errors != NULL && strcmp(errors, "replace") == 0)
   7397             return 0;
   7398         else
   7399             return WC_NO_BEST_FIT_CHARS;
   7400     }
   7401 }
   7402 
/*
 * Encode a Unicode string to a Windows code page into a byte string in strict
 * mode.
 *
 * Returns 0 on success, returns -2 on an encode error, or raises an OSError
 * and returns -1 on any other error.
 */
   7410 static int
   7411 encode_code_page_strict(UINT code_page, PyObject **outbytes,
   7412                         PyObject *unicode, Py_ssize_t offset, int len,
   7413                         const char* errors)
   7414 {
   7415     BOOL usedDefaultChar = FALSE;
   7416     BOOL *pusedDefaultChar = &usedDefaultChar;
   7417     int outsize;
   7418     wchar_t *p;
   7419     Py_ssize_t size;
   7420     const DWORD flags = encode_code_page_flags(code_page, NULL);
   7421     char *out;
   7422     /* Create a substring so that we can get the UTF-16 representation
   7423        of just the slice under consideration. */
   7424     PyObject *substring;
   7425 
   7426     assert(len > 0);
   7427 
   7428     if (code_page != CP_UTF8 && code_page != CP_UTF7)
   7429         pusedDefaultChar = &usedDefaultChar;
   7430     else
   7431         pusedDefaultChar = NULL;
   7432 
   7433     substring = PyUnicode_Substring(unicode, offset, offset+len);
   7434     if (substring == NULL)
   7435         return -1;
   7436     p = PyUnicode_AsUnicodeAndSize(substring, &size);
   7437     if (p == NULL) {
   7438         Py_DECREF(substring);
   7439         return -1;
   7440     }
   7441     assert(size <= INT_MAX);
   7442 
   7443     /* First get the size of the result */
   7444     outsize = WideCharToMultiByte(code_page, flags,
   7445                                   p, (int)size,
   7446                                   NULL, 0,
   7447                                   NULL, pusedDefaultChar);
   7448     if (outsize <= 0)
   7449         goto error;
   7450     /* If we used a default char, then we failed! */
   7451     if (pusedDefaultChar && *pusedDefaultChar) {
   7452         Py_DECREF(substring);
   7453         return -2;
   7454     }
   7455 
   7456     if (*outbytes == NULL) {
   7457         /* Create string object */
   7458         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
   7459         if (*outbytes == NULL) {
   7460             Py_DECREF(substring);
   7461             return -1;
   7462         }
   7463         out = PyBytes_AS_STRING(*outbytes);
   7464     }
   7465     else {
   7466         /* Extend string object */
   7467         const Py_ssize_t n = PyBytes_Size(*outbytes);
   7468         if (outsize > PY_SSIZE_T_MAX - n) {
   7469             PyErr_NoMemory();
   7470             Py_DECREF(substring);
   7471             return -1;
   7472         }
   7473         if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
   7474             Py_DECREF(substring);
   7475             return -1;
   7476         }
   7477         out = PyBytes_AS_STRING(*outbytes) + n;
   7478     }
   7479 
   7480     /* Do the conversion */
   7481     outsize = WideCharToMultiByte(code_page, flags,
   7482                                   p, (int)size,
   7483                                   out, outsize,
   7484                                   NULL, pusedDefaultChar);
   7485     Py_CLEAR(substring);
   7486     if (outsize <= 0)
   7487         goto error;
   7488     if (pusedDefaultChar && *pusedDefaultChar)
   7489         return -2;
   7490     return 0;
   7491 
   7492 error:
   7493     Py_XDECREF(substring);
   7494     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
   7495         return -2;
   7496     PyErr_SetFromWindowsErr(0);
   7497     return -1;
   7498 }
   7499 
   7500 /*
   7501  * Encode a Unicode string to a Windows code page into a byte string using an
   7502  * error handler.
   7503  *
   7504  * Returns consumed characters if succeed, or raise an OSError and returns
   7505  * -1 on other error.
   7506  */
   7507 static int
   7508 encode_code_page_errors(UINT code_page, PyObject **outbytes,
   7509                         PyObject *unicode, Py_ssize_t unicode_offset,
   7510                         Py_ssize_t insize, const char* errors)
   7511 {
   7512     const DWORD flags = encode_code_page_flags(code_page, errors);
   7513     Py_ssize_t pos = unicode_offset;
   7514     Py_ssize_t endin = unicode_offset + insize;
   7515     /* Ideally, we should get reason from FormatMessage. This is the Windows
   7516        2000 English version of the message. */
   7517     const char *reason = "invalid character";
   7518     /* 4=maximum length of a UTF-8 sequence */
   7519     char buffer[4];
   7520     BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
   7521     Py_ssize_t outsize;
   7522     char *out;
   7523     PyObject *errorHandler = NULL;
   7524     PyObject *exc = NULL;
   7525     PyObject *encoding_obj = NULL;
   7526     const char *encoding;
   7527     Py_ssize_t newpos, newoutsize;
   7528     PyObject *rep;
   7529     int ret = -1;
   7530 
   7531     assert(insize > 0);
   7532 
   7533     encoding = code_page_name(code_page, &encoding_obj);
   7534     if (encoding == NULL)
   7535         return -1;
   7536 
   7537     if (errors == NULL || strcmp(errors, "strict") == 0) {
   7538         /* The last error was ERROR_NO_UNICODE_TRANSLATION,
   7539            then we raise a UnicodeEncodeError. */
   7540         make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
   7541         if (exc != NULL) {
   7542             PyCodec_StrictErrors(exc);
   7543             Py_DECREF(exc);
   7544         }
   7545         Py_XDECREF(encoding_obj);
   7546         return -1;
   7547     }
   7548 
   7549     if (code_page != CP_UTF8 && code_page != CP_UTF7)
   7550         pusedDefaultChar = &usedDefaultChar;
   7551     else
   7552         pusedDefaultChar = NULL;
   7553 
   7554     if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
   7555         PyErr_NoMemory();
   7556         goto error;
   7557     }
   7558     outsize = insize * Py_ARRAY_LENGTH(buffer);
   7559 
   7560     if (*outbytes == NULL) {
   7561         /* Create string object */
   7562         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
   7563         if (*outbytes == NULL)
   7564             goto error;
   7565         out = PyBytes_AS_STRING(*outbytes);
   7566     }
   7567     else {
   7568         /* Extend string object */
   7569         Py_ssize_t n = PyBytes_Size(*outbytes);
   7570         if (n > PY_SSIZE_T_MAX - outsize) {
   7571             PyErr_NoMemory();
   7572             goto error;
   7573         }
   7574         if (_PyBytes_Resize(outbytes, n + outsize) < 0)
   7575             goto error;
   7576         out = PyBytes_AS_STRING(*outbytes) + n;
   7577     }
   7578 
   7579     /* Encode the string character per character */
   7580     while (pos < endin)
   7581     {
   7582         Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
   7583         wchar_t chars[2];
   7584         int charsize;
   7585         if (ch < 0x10000) {
   7586             chars[0] = (wchar_t)ch;
   7587             charsize = 1;
   7588         }
   7589         else {
   7590             chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
   7591             chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
   7592             charsize = 2;
   7593         }
   7594 
   7595         outsize = WideCharToMultiByte(code_page, flags,
   7596                                       chars, charsize,
   7597                                       buffer, Py_ARRAY_LENGTH(buffer),
   7598                                       NULL, pusedDefaultChar);
   7599         if (outsize > 0) {
   7600             if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
   7601             {
   7602                 pos++;
   7603                 memcpy(out, buffer, outsize);
   7604                 out += outsize;
   7605                 continue;
   7606             }
   7607         }
   7608         else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
   7609             PyErr_SetFromWindowsErr(0);
   7610             goto error;
   7611         }
   7612 
   7613         rep = unicode_encode_call_errorhandler(
   7614                   errors, &errorHandler, encoding, reason,
   7615                   unicode, &exc,
   7616                   pos, pos + 1, &newpos);
   7617         if (rep == NULL)
   7618             goto error;
   7619         pos = newpos;
   7620 
   7621         if (PyBytes_Check(rep)) {
   7622             outsize = PyBytes_GET_SIZE(rep);
   7623             if (outsize != 1) {
   7624                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
   7625                 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
   7626                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
   7627                     Py_DECREF(rep);
   7628                     goto error;
   7629                 }
   7630                 out = PyBytes_AS_STRING(*outbytes) + offset;
   7631             }
   7632             memcpy(out, PyBytes_AS_STRING(rep), outsize);
   7633             out += outsize;
   7634         }
   7635         else {
   7636             Py_ssize_t i;
   7637             enum PyUnicode_Kind kind;
   7638             void *data;
   7639 
   7640             if (PyUnicode_READY(rep) == -1) {
   7641                 Py_DECREF(rep);
   7642                 goto error;
   7643             }
   7644 
   7645             outsize = PyUnicode_GET_LENGTH(rep);
   7646             if (outsize != 1) {
   7647                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
   7648                 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
   7649                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
   7650                     Py_DECREF(rep);
   7651                     goto error;
   7652                 }
   7653                 out = PyBytes_AS_STRING(*outbytes) + offset;
   7654             }
   7655             kind = PyUnicode_KIND(rep);
   7656             data = PyUnicode_DATA(rep);
   7657             for (i=0; i < outsize; i++) {
   7658                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   7659                 if (ch > 127) {
   7660                     raise_encode_exception(&exc,
   7661                         encoding, unicode,
   7662                         pos, pos + 1,
   7663                         "unable to encode error handler result to ASCII");
   7664                     Py_DECREF(rep);
   7665                     goto error;
   7666                 }
   7667                 *out = (unsigned char)ch;
   7668                 out++;
   7669             }
   7670         }
   7671         Py_DECREF(rep);
   7672     }
   7673     /* write a NUL byte */
   7674     *out = 0;
   7675     outsize = out - PyBytes_AS_STRING(*outbytes);
   7676     assert(outsize <= PyBytes_GET_SIZE(*outbytes));
   7677     if (_PyBytes_Resize(outbytes, outsize) < 0)
   7678         goto error;
   7679     ret = 0;
   7680 
   7681 error:
   7682     Py_XDECREF(encoding_obj);
   7683     Py_XDECREF(errorHandler);
   7684     Py_XDECREF(exc);
   7685     return ret;
   7686 }
   7687 
   7688 static PyObject *
   7689 encode_code_page(int code_page,
   7690                  PyObject *unicode,
   7691                  const char *errors)
   7692 {
   7693     Py_ssize_t len;
   7694     PyObject *outbytes = NULL;
   7695     Py_ssize_t offset;
   7696     int chunk_len, ret, done;
   7697 
   7698     if (!PyUnicode_Check(unicode)) {
   7699         PyErr_BadArgument();
   7700         return NULL;
   7701     }
   7702 
   7703     if (PyUnicode_READY(unicode) == -1)
   7704         return NULL;
   7705     len = PyUnicode_GET_LENGTH(unicode);
   7706 
   7707     if (code_page < 0) {
   7708         PyErr_SetString(PyExc_ValueError, "invalid code page number");
   7709         return NULL;
   7710     }
   7711 
   7712     if (len == 0)
   7713         return PyBytes_FromStringAndSize(NULL, 0);
   7714 
   7715     offset = 0;
   7716     do
   7717     {
   7718 #ifdef NEED_RETRY
   7719         /* UTF-16 encoding may double the size, so use only INT_MAX/2
   7720            chunks. */
   7721         if (len > INT_MAX/2) {
   7722             chunk_len = INT_MAX/2;
   7723             done = 0;
   7724         }
   7725         else
   7726 #endif
   7727         {
   7728             chunk_len = (int)len;
   7729             done = 1;
   7730         }
   7731 
   7732         ret = encode_code_page_strict(code_page, &outbytes,
   7733                                       unicode, offset, chunk_len,
   7734                                       errors);
   7735         if (ret == -2)
   7736             ret = encode_code_page_errors(code_page, &outbytes,
   7737                                           unicode, offset,
   7738                                           chunk_len, errors);
   7739         if (ret < 0) {
   7740             Py_XDECREF(outbytes);
   7741             return NULL;
   7742         }
   7743 
   7744         offset += chunk_len;
   7745         len -= chunk_len;
   7746     } while (!done);
   7747 
   7748     return outbytes;
   7749 }
   7750 
   7751 PyObject *
   7752 PyUnicode_EncodeMBCS(const Py_UNICODE *p,
   7753                      Py_ssize_t size,
   7754                      const char *errors)
   7755 {
   7756     PyObject *unicode, *res;
   7757     unicode = PyUnicode_FromUnicode(p, size);
   7758     if (unicode == NULL)
   7759         return NULL;
   7760     res = encode_code_page(CP_ACP, unicode, errors);
   7761     Py_DECREF(unicode);
   7762     return res;
   7763 }
   7764 
   7765 PyObject *
   7766 PyUnicode_EncodeCodePage(int code_page,
   7767                          PyObject *unicode,
   7768                          const char *errors)
   7769 {
   7770     return encode_code_page(code_page, unicode, errors);
   7771 }
   7772 
   7773 PyObject *
   7774 PyUnicode_AsMBCSString(PyObject *unicode)
   7775 {
   7776     return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
   7777 }
   7778 
   7779 #undef NEED_RETRY
   7780 
   7781 #endif /* MS_WINDOWS */
   7782 
   7783 /* --- Character Mapping Codec -------------------------------------------- */
   7784 
   7785 static int
   7786 charmap_decode_string(const char *s,
   7787                       Py_ssize_t size,
   7788                       PyObject *mapping,
   7789                       const char *errors,
   7790                       _PyUnicodeWriter *writer)
   7791 {
   7792     const char *starts = s;
   7793     const char *e;
   7794     Py_ssize_t startinpos, endinpos;
   7795     PyObject *errorHandler = NULL, *exc = NULL;
   7796     Py_ssize_t maplen;
   7797     enum PyUnicode_Kind mapkind;
   7798     void *mapdata;
   7799     Py_UCS4 x;
   7800     unsigned char ch;
   7801 
   7802     if (PyUnicode_READY(mapping) == -1)
   7803         return -1;
   7804 
   7805     maplen = PyUnicode_GET_LENGTH(mapping);
   7806     mapdata = PyUnicode_DATA(mapping);
   7807     mapkind = PyUnicode_KIND(mapping);
   7808 
   7809     e = s + size;
   7810 
   7811     if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
   7812         /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
   7813          * is disabled in encoding aliases, latin1 is preferred because
   7814          * its implementation is faster. */
   7815         Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
   7816         Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
   7817         Py_UCS4 maxchar = writer->maxchar;
   7818 
   7819         assert (writer->kind == PyUnicode_1BYTE_KIND);
   7820         while (s < e) {
   7821             ch = *s;
   7822             x = mapdata_ucs1[ch];
   7823             if (x > maxchar) {
   7824                 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
   7825                     goto onError;
   7826                 maxchar = writer->maxchar;
   7827                 outdata = (Py_UCS1 *)writer->data;
   7828             }
   7829             outdata[writer->pos] = x;
   7830             writer->pos++;
   7831             ++s;
   7832         }
   7833         return 0;
   7834     }
   7835 
   7836     while (s < e) {
   7837         if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
   7838             enum PyUnicode_Kind outkind = writer->kind;
   7839             Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
   7840             if (outkind == PyUnicode_1BYTE_KIND) {
   7841                 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
   7842                 Py_UCS4 maxchar = writer->maxchar;
   7843                 while (s < e) {
   7844                     ch = *s;
   7845                     x = mapdata_ucs2[ch];
   7846                     if (x > maxchar)
   7847                         goto Error;
   7848                     outdata[writer->pos] = x;
   7849                     writer->pos++;
   7850                     ++s;
   7851                 }
   7852                 break;
   7853             }
   7854             else if (outkind == PyUnicode_2BYTE_KIND) {
   7855                 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
   7856                 while (s < e) {
   7857                     ch = *s;
   7858                     x = mapdata_ucs2[ch];
   7859                     if (x == 0xFFFE)
   7860                         goto Error;
   7861                     outdata[writer->pos] = x;
   7862                     writer->pos++;
   7863                     ++s;
   7864                 }
   7865                 break;
   7866             }
   7867         }
   7868         ch = *s;
   7869 
   7870         if (ch < maplen)
   7871             x = PyUnicode_READ(mapkind, mapdata, ch);
   7872         else
   7873             x = 0xfffe; /* invalid value */
   7874 Error:
   7875         if (x == 0xfffe)
   7876         {
   7877             /* undefined mapping */
   7878             startinpos = s-starts;
   7879             endinpos = startinpos+1;
   7880             if (unicode_decode_call_errorhandler_writer(
   7881                     errors, &errorHandler,
   7882                     "charmap", "character maps to <undefined>",
   7883                     &starts, &e, &startinpos, &endinpos, &exc, &s,
   7884                     writer)) {
   7885                 goto onError;
   7886             }
   7887             continue;
   7888         }
   7889 
   7890         if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
   7891             goto onError;
   7892         ++s;
   7893     }
   7894     Py_XDECREF(errorHandler);
   7895     Py_XDECREF(exc);
   7896     return 0;
   7897 
   7898 onError:
   7899     Py_XDECREF(errorHandler);
   7900     Py_XDECREF(exc);
   7901     return -1;
   7902 }
   7903 
   7904 static int
   7905 charmap_decode_mapping(const char *s,
   7906                        Py_ssize_t size,
   7907                        PyObject *mapping,
   7908                        const char *errors,
   7909                        _PyUnicodeWriter *writer)
   7910 {
   7911     const char *starts = s;
   7912     const char *e;
   7913     Py_ssize_t startinpos, endinpos;
   7914     PyObject *errorHandler = NULL, *exc = NULL;
   7915     unsigned char ch;
   7916     PyObject *key, *item = NULL;
   7917 
   7918     e = s + size;
   7919 
   7920     while (s < e) {
   7921         ch = *s;
   7922 
   7923         /* Get mapping (char ordinal -> integer, Unicode char or None) */
   7924         key = PyLong_FromLong((long)ch);
   7925         if (key == NULL)
   7926             goto onError;
   7927 
   7928         item = PyObject_GetItem(mapping, key);
   7929         Py_DECREF(key);
   7930         if (item == NULL) {
   7931             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
   7932                 /* No mapping found means: mapping is undefined. */
   7933                 PyErr_Clear();
   7934                 goto Undefined;
   7935             } else
   7936                 goto onError;
   7937         }
   7938 
   7939         /* Apply mapping */
   7940         if (item == Py_None)
   7941             goto Undefined;
   7942         if (PyLong_Check(item)) {
   7943             long value = PyLong_AS_LONG(item);
   7944             if (value == 0xFFFE)
   7945                 goto Undefined;
   7946             if (value < 0 || value > MAX_UNICODE) {
   7947                 PyErr_Format(PyExc_TypeError,
   7948                              "character mapping must be in range(0x%lx)",
   7949                              (unsigned long)MAX_UNICODE + 1);
   7950                 goto onError;
   7951             }
   7952 
   7953             if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
   7954                 goto onError;
   7955         }
   7956         else if (PyUnicode_Check(item)) {
   7957             if (PyUnicode_READY(item) == -1)
   7958                 goto onError;
   7959             if (PyUnicode_GET_LENGTH(item) == 1) {
   7960                 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
   7961                 if (value == 0xFFFE)
   7962                     goto Undefined;
   7963                 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
   7964                     goto onError;
   7965             }
   7966             else {
   7967                 writer->overallocate = 1;
   7968                 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
   7969                     goto onError;
   7970             }
   7971         }
   7972         else {
   7973             /* wrong return value */
   7974             PyErr_SetString(PyExc_TypeError,
   7975                             "character mapping must return integer, None or str");
   7976             goto onError;
   7977         }
   7978         Py_CLEAR(item);
   7979         ++s;
   7980         continue;
   7981 
   7982 Undefined:
   7983         /* undefined mapping */
   7984         Py_CLEAR(item);
   7985         startinpos = s-starts;
   7986         endinpos = startinpos+1;
   7987         if (unicode_decode_call_errorhandler_writer(
   7988                 errors, &errorHandler,
   7989                 "charmap", "character maps to <undefined>",
   7990                 &starts, &e, &startinpos, &endinpos, &exc, &s,
   7991                 writer)) {
   7992             goto onError;
   7993         }
   7994     }
   7995     Py_XDECREF(errorHandler);
   7996     Py_XDECREF(exc);
   7997     return 0;
   7998 
   7999 onError:
   8000     Py_XDECREF(item);
   8001     Py_XDECREF(errorHandler);
   8002     Py_XDECREF(exc);
   8003     return -1;
   8004 }
   8005 
   8006 PyObject *
   8007 PyUnicode_DecodeCharmap(const char *s,
   8008                         Py_ssize_t size,
   8009                         PyObject *mapping,
   8010                         const char *errors)
   8011 {
   8012     _PyUnicodeWriter writer;
   8013 
   8014     /* Default to Latin-1 */
   8015     if (mapping == NULL)
   8016         return PyUnicode_DecodeLatin1(s, size, errors);
   8017 
   8018     if (size == 0)
   8019         _Py_RETURN_UNICODE_EMPTY();
   8020     _PyUnicodeWriter_Init(&writer);
   8021     writer.min_length = size;
   8022     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
   8023         goto onError;
   8024 
   8025     if (PyUnicode_CheckExact(mapping)) {
   8026         if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
   8027             goto onError;
   8028     }
   8029     else {
   8030         if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
   8031             goto onError;
   8032     }
   8033     return _PyUnicodeWriter_Finish(&writer);
   8034 
   8035   onError:
   8036     _PyUnicodeWriter_Dealloc(&writer);
   8037     return NULL;
   8038 }
   8039 
   8040 /* Charmap encoding: the lookup table */
   8041 
   8042 struct encoding_map {
   8043     PyObject_HEAD
   8044     unsigned char level1[32];
   8045     int count2, count3;
   8046     unsigned char level23[1];
   8047 };
   8048 
   8049 static PyObject*
   8050 encoding_map_size(PyObject *obj, PyObject* args)
   8051 {
   8052     struct encoding_map *map = (struct encoding_map*)obj;
   8053     return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
   8054                            128*map->count3);
   8055 }
   8056 
   8057 static PyMethodDef encoding_map_methods[] = {
   8058     {"size", encoding_map_size, METH_NOARGS,
   8059      PyDoc_STR("Return the size (in bytes) of this object") },
   8060     { 0 }
   8061 };
   8062 
   8063 static void
   8064 encoding_map_dealloc(PyObject* o)
   8065 {
   8066     PyObject_FREE(o);
   8067 }
   8068 
   8069 static PyTypeObject EncodingMapType = {
   8070     PyVarObject_HEAD_INIT(NULL, 0)
   8071     "EncodingMap",          /*tp_name*/
   8072     sizeof(struct encoding_map),   /*tp_basicsize*/
   8073     0,                      /*tp_itemsize*/
   8074     /* methods */
   8075     encoding_map_dealloc,   /*tp_dealloc*/
   8076     0,                      /*tp_print*/
   8077     0,                      /*tp_getattr*/
   8078     0,                      /*tp_setattr*/
   8079     0,                      /*tp_reserved*/
   8080     0,                      /*tp_repr*/
   8081     0,                      /*tp_as_number*/
   8082     0,                      /*tp_as_sequence*/
   8083     0,                      /*tp_as_mapping*/
   8084     0,                      /*tp_hash*/
   8085     0,                      /*tp_call*/
   8086     0,                      /*tp_str*/
   8087     0,                      /*tp_getattro*/
   8088     0,                      /*tp_setattro*/
   8089     0,                      /*tp_as_buffer*/
   8090     Py_TPFLAGS_DEFAULT,     /*tp_flags*/
   8091     0,                      /*tp_doc*/
   8092     0,                      /*tp_traverse*/
   8093     0,                      /*tp_clear*/
   8094     0,                      /*tp_richcompare*/
   8095     0,                      /*tp_weaklistoffset*/
   8096     0,                      /*tp_iter*/
   8097     0,                      /*tp_iternext*/
   8098     encoding_map_methods,   /*tp_methods*/
   8099     0,                      /*tp_members*/
   8100     0,                      /*tp_getset*/
   8101     0,                      /*tp_base*/
   8102     0,                      /*tp_dict*/
   8103     0,                      /*tp_descr_get*/
   8104     0,                      /*tp_descr_set*/
   8105     0,                      /*tp_dictoffset*/
   8106     0,                      /*tp_init*/
   8107     0,                      /*tp_alloc*/
   8108     0,                      /*tp_new*/
   8109     0,                      /*tp_free*/
   8110     0,                      /*tp_is_gc*/
   8111 };
   8112 
   8113 PyObject*
   8114 PyUnicode_BuildEncodingMap(PyObject* string)
   8115 {
   8116     PyObject *result;
   8117     struct encoding_map *mresult;
   8118     int i;
   8119     int need_dict = 0;
   8120     unsigned char level1[32];
   8121     unsigned char level2[512];
   8122     unsigned char *mlevel1, *mlevel2, *mlevel3;
   8123     int count2 = 0, count3 = 0;
   8124     int kind;
   8125     void *data;
   8126     Py_ssize_t length;
   8127     Py_UCS4 ch;
   8128 
   8129     if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
   8130         PyErr_BadArgument();
   8131         return NULL;
   8132     }
   8133     kind = PyUnicode_KIND(string);
   8134     data = PyUnicode_DATA(string);
   8135     length = PyUnicode_GET_LENGTH(string);
   8136     length = Py_MIN(length, 256);
   8137     memset(level1, 0xFF, sizeof level1);
   8138     memset(level2, 0xFF, sizeof level2);
   8139 
   8140     /* If there isn't a one-to-one mapping of NULL to \0,
   8141        or if there are non-BMP characters, we need to use
   8142        a mapping dictionary. */
   8143     if (PyUnicode_READ(kind, data, 0) != 0)
   8144         need_dict = 1;
   8145     for (i = 1; i < length; i++) {
   8146         int l1, l2;
   8147         ch = PyUnicode_READ(kind, data, i);
   8148         if (ch == 0 || ch > 0xFFFF) {
   8149             need_dict = 1;
   8150             break;
   8151         }
   8152         if (ch == 0xFFFE)
   8153             /* unmapped character */
   8154             continue;
   8155         l1 = ch >> 11;
   8156         l2 = ch >> 7;
   8157         if (level1[l1] == 0xFF)
   8158             level1[l1] = count2++;
   8159         if (level2[l2] == 0xFF)
   8160             level2[l2] = count3++;
   8161     }
   8162 
   8163     if (count2 >= 0xFF || count3 >= 0xFF)
   8164         need_dict = 1;
   8165 
   8166     if (need_dict) {
   8167         PyObject *result = PyDict_New();
   8168         PyObject *key, *value;
   8169         if (!result)
   8170             return NULL;
   8171         for (i = 0; i < length; i++) {
   8172             key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
   8173             value = PyLong_FromLong(i);
   8174             if (!key || !value)
   8175                 goto failed1;
   8176             if (PyDict_SetItem(result, key, value) == -1)
   8177                 goto failed1;
   8178             Py_DECREF(key);
   8179             Py_DECREF(value);
   8180         }
   8181         return result;
   8182       failed1:
   8183         Py_XDECREF(key);
   8184         Py_XDECREF(value);
   8185         Py_DECREF(result);
   8186         return NULL;
   8187     }
   8188 
   8189     /* Create a three-level trie */
   8190     result = PyObject_MALLOC(sizeof(struct encoding_map) +
   8191                              16*count2 + 128*count3 - 1);
   8192     if (!result)
   8193         return PyErr_NoMemory();
   8194     PyObject_Init(result, &EncodingMapType);
   8195     mresult = (struct encoding_map*)result;
   8196     mresult->count2 = count2;
   8197     mresult->count3 = count3;
   8198     mlevel1 = mresult->level1;
   8199     mlevel2 = mresult->level23;
   8200     mlevel3 = mresult->level23 + 16*count2;
   8201     memcpy(mlevel1, level1, 32);
   8202     memset(mlevel2, 0xFF, 16*count2);
   8203     memset(mlevel3, 0, 128*count3);
   8204     count3 = 0;
   8205     for (i = 1; i < length; i++) {
   8206         int o1, o2, o3, i2, i3;
   8207         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   8208         if (ch == 0xFFFE)
   8209             /* unmapped character */
   8210             continue;
   8211         o1 = ch>>11;
   8212         o2 = (ch>>7) & 0xF;
   8213         i2 = 16*mlevel1[o1] + o2;
   8214         if (mlevel2[i2] == 0xFF)
   8215             mlevel2[i2] = count3++;
   8216         o3 = ch & 0x7F;
   8217         i3 = 128*mlevel2[i2] + o3;
   8218         mlevel3[i3] = i;
   8219     }
   8220     return result;
   8221 }
   8222 
   8223 static int
   8224 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
   8225 {
   8226     struct encoding_map *map = (struct encoding_map*)mapping;
   8227     int l1 = c>>11;
   8228     int l2 = (c>>7) & 0xF;
   8229     int l3 = c & 0x7F;
   8230     int i;
   8231 
   8232     if (c > 0xFFFF)
   8233         return -1;
   8234     if (c == 0)
   8235         return 0;
   8236     /* level 1*/
   8237     i = map->level1[l1];
   8238     if (i == 0xFF) {
   8239         return -1;
   8240     }
   8241     /* level 2*/
   8242     i = map->level23[16*i+l2];
   8243     if (i == 0xFF) {
   8244         return -1;
   8245     }
   8246     /* level 3 */
   8247     i = map->level23[16*map->count2 + 128*i + l3];
   8248     if (i == 0) {
   8249         return -1;
   8250     }
   8251     return i;
   8252 }
   8253 
   8254 /* Lookup the character ch in the mapping. If the character
   8255    can't be found, Py_None is returned (or NULL, if another
   8256    error occurred). */
   8257 static PyObject *
   8258 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
   8259 {
   8260     PyObject *w = PyLong_FromLong((long)c);
   8261     PyObject *x;
   8262 
   8263     if (w == NULL)
   8264         return NULL;
   8265     x = PyObject_GetItem(mapping, w);
   8266     Py_DECREF(w);
   8267     if (x == NULL) {
   8268         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
   8269             /* No mapping found means: mapping is undefined. */
   8270             PyErr_Clear();
   8271             x = Py_None;
   8272             Py_INCREF(x);
   8273             return x;
   8274         } else
   8275             return NULL;
   8276     }
   8277     else if (x == Py_None)
   8278         return x;
   8279     else if (PyLong_Check(x)) {
   8280         long value = PyLong_AS_LONG(x);
   8281         if (value < 0 || value > 255) {
   8282             PyErr_SetString(PyExc_TypeError,
   8283                             "character mapping must be in range(256)");
   8284             Py_DECREF(x);
   8285             return NULL;
   8286         }
   8287         return x;
   8288     }
   8289     else if (PyBytes_Check(x))
   8290         return x;
   8291     else {
   8292         /* wrong return value */
   8293         PyErr_Format(PyExc_TypeError,
   8294                      "character mapping must return integer, bytes or None, not %.400s",
   8295                      x->ob_type->tp_name);
   8296         Py_DECREF(x);
   8297         return NULL;
   8298     }
   8299 }
   8300 
   8301 static int
   8302 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
   8303 {
   8304     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
   8305     /* exponentially overallocate to minimize reallocations */
   8306     if (requiredsize < 2*outsize)
   8307         requiredsize = 2*outsize;
   8308     if (_PyBytes_Resize(outobj, requiredsize))
   8309         return -1;
   8310     return 0;
   8311 }
   8312 
   8313 typedef enum charmapencode_result {
   8314     enc_SUCCESS, enc_FAILED, enc_EXCEPTION
   8315 } charmapencode_result;
   8316 /* lookup the character, put the result in the output string and adjust
   8317    various state variables. Resize the output bytes object if not enough
   8318    space is available. Return a new reference to the object that
   8319    was put in the output buffer, or Py_None, if the mapping was undefined
   8320    (in which case no character was written) or NULL, if a
   8321    reallocation error occurred. The caller must decref the result */
   8322 static charmapencode_result
   8323 charmapencode_output(Py_UCS4 c, PyObject *mapping,
   8324                      PyObject **outobj, Py_ssize_t *outpos)
   8325 {
   8326     PyObject *rep;
   8327     char *outstart;
   8328     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
   8329 
   8330     if (Py_TYPE(mapping) == &EncodingMapType) {
   8331         int res = encoding_map_lookup(c, mapping);
   8332         Py_ssize_t requiredsize = *outpos+1;
   8333         if (res == -1)
   8334             return enc_FAILED;
   8335         if (outsize<requiredsize)
   8336             if (charmapencode_resize(outobj, outpos, requiredsize))
   8337                 return enc_EXCEPTION;
   8338         outstart = PyBytes_AS_STRING(*outobj);
   8339         outstart[(*outpos)++] = (char)res;
   8340         return enc_SUCCESS;
   8341     }
   8342 
   8343     rep = charmapencode_lookup(c, mapping);
   8344     if (rep==NULL)
   8345         return enc_EXCEPTION;
   8346     else if (rep==Py_None) {
   8347         Py_DECREF(rep);
   8348         return enc_FAILED;
   8349     } else {
   8350         if (PyLong_Check(rep)) {
   8351             Py_ssize_t requiredsize = *outpos+1;
   8352             if (outsize<requiredsize)
   8353                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
   8354                     Py_DECREF(rep);
   8355                     return enc_EXCEPTION;
   8356                 }
   8357             outstart = PyBytes_AS_STRING(*outobj);
   8358             outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
   8359         }
   8360         else {
   8361             const char *repchars = PyBytes_AS_STRING(rep);
   8362             Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
   8363             Py_ssize_t requiredsize = *outpos+repsize;
   8364             if (outsize<requiredsize)
   8365                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
   8366                     Py_DECREF(rep);
   8367                     return enc_EXCEPTION;
   8368                 }
   8369             outstart = PyBytes_AS_STRING(*outobj);
   8370             memcpy(outstart + *outpos, repchars, repsize);
   8371             *outpos += repsize;
   8372         }
   8373     }
   8374     Py_DECREF(rep);
   8375     return enc_SUCCESS;
   8376 }
   8377 
   8378 /* handle an error in PyUnicode_EncodeCharmap
   8379    Return 0 on success, -1 on error */
   8380 static int
   8381 charmap_encoding_error(
   8382     PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
   8383     PyObject **exceptionObject,
   8384     _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
   8385     PyObject **res, Py_ssize_t *respos)
   8386 {
   8387     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
   8388     Py_ssize_t size, repsize;
   8389     Py_ssize_t newpos;
   8390     enum PyUnicode_Kind kind;
   8391     void *data;
   8392     Py_ssize_t index;
   8393     /* startpos for collecting unencodable chars */
   8394     Py_ssize_t collstartpos = *inpos;
   8395     Py_ssize_t collendpos = *inpos+1;
   8396     Py_ssize_t collpos;
   8397     char *encoding = "charmap";
   8398     char *reason = "character maps to <undefined>";
   8399     charmapencode_result x;
   8400     Py_UCS4 ch;
   8401     int val;
   8402 
   8403     if (PyUnicode_READY(unicode) == -1)
   8404         return -1;
   8405     size = PyUnicode_GET_LENGTH(unicode);
   8406     /* find all unencodable characters */
   8407     while (collendpos < size) {
   8408         PyObject *rep;
   8409         if (Py_TYPE(mapping) == &EncodingMapType) {
   8410             ch = PyUnicode_READ_CHAR(unicode, collendpos);
   8411             val = encoding_map_lookup(ch, mapping);
   8412             if (val != -1)
   8413                 break;
   8414             ++collendpos;
   8415             continue;
   8416         }
   8417 
   8418         ch = PyUnicode_READ_CHAR(unicode, collendpos);
   8419         rep = charmapencode_lookup(ch, mapping);
   8420         if (rep==NULL)
   8421             return -1;
   8422         else if (rep!=Py_None) {
   8423             Py_DECREF(rep);
   8424             break;
   8425         }
   8426         Py_DECREF(rep);
   8427         ++collendpos;
   8428     }
   8429     /* cache callback name lookup
   8430      * (if not done yet, i.e. it's the first error) */
   8431     if (*error_handler == _Py_ERROR_UNKNOWN)
   8432         *error_handler = get_error_handler(errors);
   8433 
   8434     switch (*error_handler) {
   8435     case _Py_ERROR_STRICT:
   8436         raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
   8437         return -1;
   8438 
   8439     case _Py_ERROR_REPLACE:
   8440         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
   8441             x = charmapencode_output('?', mapping, res, respos);
   8442             if (x==enc_EXCEPTION) {
   8443                 return -1;
   8444             }
   8445             else if (x==enc_FAILED) {
   8446                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
   8447                 return -1;
   8448             }
   8449         }
   8450         /* fall through */
   8451     case _Py_ERROR_IGNORE:
   8452         *inpos = collendpos;
   8453         break;
   8454 
   8455     case _Py_ERROR_XMLCHARREFREPLACE:
   8456         /* generate replacement (temporarily (mis)uses p) */
   8457         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
   8458             char buffer[2+29+1+1];
   8459             char *cp;
   8460             sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
   8461             for (cp = buffer; *cp; ++cp) {
   8462                 x = charmapencode_output(*cp, mapping, res, respos);
   8463                 if (x==enc_EXCEPTION)
   8464                     return -1;
   8465                 else if (x==enc_FAILED) {
   8466                     raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
   8467                     return -1;
   8468                 }
   8469             }
   8470         }
   8471         *inpos = collendpos;
   8472         break;
   8473 
   8474     default:
   8475         repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
   8476                                                       encoding, reason, unicode, exceptionObject,
   8477                                                       collstartpos, collendpos, &newpos);
   8478         if (repunicode == NULL)
   8479             return -1;
   8480         if (PyBytes_Check(repunicode)) {
   8481             /* Directly copy bytes result to output. */
   8482             Py_ssize_t outsize = PyBytes_Size(*res);
   8483             Py_ssize_t requiredsize;
   8484             repsize = PyBytes_Size(repunicode);
   8485             requiredsize = *respos + repsize;
   8486             if (requiredsize > outsize)
   8487                 /* Make room for all additional bytes. */
   8488                 if (charmapencode_resize(res, respos, requiredsize)) {
   8489                     Py_DECREF(repunicode);
   8490                     return -1;
   8491                 }
   8492             memcpy(PyBytes_AsString(*res) + *respos,
   8493                    PyBytes_AsString(repunicode),  repsize);
   8494             *respos += repsize;
   8495             *inpos = newpos;
   8496             Py_DECREF(repunicode);
   8497             break;
   8498         }
   8499         /* generate replacement  */
   8500         if (PyUnicode_READY(repunicode) == -1) {
   8501             Py_DECREF(repunicode);
   8502             return -1;
   8503         }
   8504         repsize = PyUnicode_GET_LENGTH(repunicode);
   8505         data = PyUnicode_DATA(repunicode);
   8506         kind = PyUnicode_KIND(repunicode);
   8507         for (index = 0; index < repsize; index++) {
   8508             Py_UCS4 repch = PyUnicode_READ(kind, data, index);
   8509             x = charmapencode_output(repch, mapping, res, respos);
   8510             if (x==enc_EXCEPTION) {
   8511                 Py_DECREF(repunicode);
   8512                 return -1;
   8513             }
   8514             else if (x==enc_FAILED) {
   8515                 Py_DECREF(repunicode);
   8516                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
   8517                 return -1;
   8518             }
   8519         }
   8520         *inpos = newpos;
   8521         Py_DECREF(repunicode);
   8522     }
   8523     return 0;
   8524 }
   8525 
   8526 PyObject *
   8527 _PyUnicode_EncodeCharmap(PyObject *unicode,
   8528                          PyObject *mapping,
   8529                          const char *errors)
   8530 {
   8531     /* output object */
   8532     PyObject *res = NULL;
   8533     /* current input position */
   8534     Py_ssize_t inpos = 0;
   8535     Py_ssize_t size;
   8536     /* current output position */
   8537     Py_ssize_t respos = 0;
   8538     PyObject *error_handler_obj = NULL;
   8539     PyObject *exc = NULL;
   8540     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
   8541     void *data;
   8542     int kind;
   8543 
   8544     if (PyUnicode_READY(unicode) == -1)
   8545         return NULL;
   8546     size = PyUnicode_GET_LENGTH(unicode);
   8547     data = PyUnicode_DATA(unicode);
   8548     kind = PyUnicode_KIND(unicode);
   8549 
   8550     /* Default to Latin-1 */
   8551     if (mapping == NULL)
   8552         return unicode_encode_ucs1(unicode, errors, 256);
   8553 
   8554     /* allocate enough for a simple encoding without
   8555        replacements, if we need more, we'll resize */
   8556     res = PyBytes_FromStringAndSize(NULL, size);
   8557     if (res == NULL)
   8558         goto onError;
   8559     if (size == 0)
   8560         return res;
   8561 
   8562     while (inpos<size) {
   8563         Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
   8564         /* try to encode it */
   8565         charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
   8566         if (x==enc_EXCEPTION) /* error */
   8567             goto onError;
   8568         if (x==enc_FAILED) { /* unencodable character */
   8569             if (charmap_encoding_error(unicode, &inpos, mapping,
   8570                                        &exc,
   8571                                        &error_handler, &error_handler_obj, errors,
   8572                                        &res, &respos)) {
   8573                 goto onError;
   8574             }
   8575         }
   8576         else
   8577             /* done with this character => adjust input position */
   8578             ++inpos;
   8579     }
   8580 
   8581     /* Resize if we allocated to much */
   8582     if (respos<PyBytes_GET_SIZE(res))
   8583         if (_PyBytes_Resize(&res, respos) < 0)
   8584             goto onError;
   8585 
   8586     Py_XDECREF(exc);
   8587     Py_XDECREF(error_handler_obj);
   8588     return res;
   8589 
   8590   onError:
   8591     Py_XDECREF(res);
   8592     Py_XDECREF(exc);
   8593     Py_XDECREF(error_handler_obj);
   8594     return NULL;
   8595 }
   8596 
   8597 /* Deprecated */
   8598 PyObject *
   8599 PyUnicode_EncodeCharmap(const Py_UNICODE *p,
   8600                         Py_ssize_t size,
   8601                         PyObject *mapping,
   8602                         const char *errors)
   8603 {
   8604     PyObject *result;
   8605     PyObject *unicode = PyUnicode_FromUnicode(p, size);
   8606     if (unicode == NULL)
   8607         return NULL;
   8608     result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
   8609     Py_DECREF(unicode);
   8610     return result;
   8611 }
   8612 
   8613 PyObject *
   8614 PyUnicode_AsCharmapString(PyObject *unicode,
   8615                           PyObject *mapping)
   8616 {
   8617     if (!PyUnicode_Check(unicode) || mapping == NULL) {
   8618         PyErr_BadArgument();
   8619         return NULL;
   8620     }
   8621     return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
   8622 }
   8623 
   8624 /* create or adjust a UnicodeTranslateError */
   8625 static void
   8626 make_translate_exception(PyObject **exceptionObject,
   8627                          PyObject *unicode,
   8628                          Py_ssize_t startpos, Py_ssize_t endpos,
   8629                          const char *reason)
   8630 {
   8631     if (*exceptionObject == NULL) {
   8632         *exceptionObject = _PyUnicodeTranslateError_Create(
   8633             unicode, startpos, endpos, reason);
   8634     }
   8635     else {
   8636         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
   8637             goto onError;
   8638         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
   8639             goto onError;
   8640         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
   8641             goto onError;
   8642         return;
   8643       onError:
   8644         Py_CLEAR(*exceptionObject);
   8645     }
   8646 }
   8647 
   8648 /* error handling callback helper:
   8649    build arguments, call the callback and check the arguments,
   8650    put the result into newpos and return the replacement string, which
   8651    has to be freed by the caller */
   8652 static PyObject *
   8653 unicode_translate_call_errorhandler(const char *errors,
   8654                                     PyObject **errorHandler,
   8655                                     const char *reason,
   8656                                     PyObject *unicode, PyObject **exceptionObject,
   8657                                     Py_ssize_t startpos, Py_ssize_t endpos,
   8658                                     Py_ssize_t *newpos)
   8659 {
   8660     static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
   8661 
   8662     Py_ssize_t i_newpos;
   8663     PyObject *restuple;
   8664     PyObject *resunicode;
   8665 
   8666     if (*errorHandler == NULL) {
   8667         *errorHandler = PyCodec_LookupError(errors);
   8668         if (*errorHandler == NULL)
   8669             return NULL;
   8670     }
   8671 
   8672     make_translate_exception(exceptionObject,
   8673                              unicode, startpos, endpos, reason);
   8674     if (*exceptionObject == NULL)
   8675         return NULL;
   8676 
   8677     restuple = PyObject_CallFunctionObjArgs(
   8678         *errorHandler, *exceptionObject, NULL);
   8679     if (restuple == NULL)
   8680         return NULL;
   8681     if (!PyTuple_Check(restuple)) {
   8682         PyErr_SetString(PyExc_TypeError, &argparse[4]);
   8683         Py_DECREF(restuple);
   8684         return NULL;
   8685     }
   8686     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
   8687                           &resunicode, &i_newpos)) {
   8688         Py_DECREF(restuple);
   8689         return NULL;
   8690     }
   8691     if (i_newpos<0)
   8692         *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
   8693     else
   8694         *newpos = i_newpos;
   8695     if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
   8696         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
   8697         Py_DECREF(restuple);
   8698         return NULL;
   8699     }
   8700     Py_INCREF(resunicode);
   8701     Py_DECREF(restuple);
   8702     return resunicode;
   8703 }
   8704 
   8705 /* Lookup the character ch in the mapping and put the result in result,
   8706    which must be decrefed by the caller.
   8707    Return 0 on success, -1 on error */
   8708 static int
   8709 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
   8710 {
   8711     PyObject *w = PyLong_FromLong((long)c);
   8712     PyObject *x;
   8713 
   8714     if (w == NULL)
   8715         return -1;
   8716     x = PyObject_GetItem(mapping, w);
   8717     Py_DECREF(w);
   8718     if (x == NULL) {
   8719         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
   8720             /* No mapping found means: use 1:1 mapping. */
   8721             PyErr_Clear();
   8722             *result = NULL;
   8723             return 0;
   8724         } else
   8725             return -1;
   8726     }
   8727     else if (x == Py_None) {
   8728         *result = x;
   8729         return 0;
   8730     }
   8731     else if (PyLong_Check(x)) {
   8732         long value = PyLong_AS_LONG(x);
   8733         if (value < 0 || value > MAX_UNICODE) {
   8734             PyErr_Format(PyExc_ValueError,
   8735                          "character mapping must be in range(0x%x)",
   8736                          MAX_UNICODE+1);
   8737             Py_DECREF(x);
   8738             return -1;
   8739         }
   8740         *result = x;
   8741         return 0;
   8742     }
   8743     else if (PyUnicode_Check(x)) {
   8744         *result = x;
   8745         return 0;
   8746     }
   8747     else {
   8748         /* wrong return value */
   8749         PyErr_SetString(PyExc_TypeError,
   8750                         "character mapping must return integer, None or str");
   8751         Py_DECREF(x);
   8752         return -1;
   8753     }
   8754 }
   8755 
   8756 /* lookup the character, write the result into the writer.
   8757    Return 1 if the result was written into the writer, return 0 if the mapping
   8758    was undefined, raise an exception return -1 on error. */
   8759 static int
   8760 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
   8761                         _PyUnicodeWriter *writer)
   8762 {
   8763     PyObject *item;
   8764 
   8765     if (charmaptranslate_lookup(ch, mapping, &item))
   8766         return -1;
   8767 
   8768     if (item == NULL) {
   8769         /* not found => default to 1:1 mapping */
   8770         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
   8771             return -1;
   8772         }
   8773         return 1;
   8774     }
   8775 
   8776     if (item == Py_None) {
   8777         Py_DECREF(item);
   8778         return 0;
   8779     }
   8780 
   8781     if (PyLong_Check(item)) {
   8782         long ch = (Py_UCS4)PyLong_AS_LONG(item);
   8783         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
   8784            used it */
   8785         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
   8786             Py_DECREF(item);
   8787             return -1;
   8788         }
   8789         Py_DECREF(item);
   8790         return 1;
   8791     }
   8792 
   8793     if (!PyUnicode_Check(item)) {
   8794         Py_DECREF(item);
   8795         return -1;
   8796     }
   8797 
   8798     if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
   8799         Py_DECREF(item);
   8800         return -1;
   8801     }
   8802 
   8803     Py_DECREF(item);
   8804     return 1;
   8805 }
   8806 
   8807 static int
   8808 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
   8809                               Py_UCS1 *translate)
   8810 {
   8811     PyObject *item = NULL;
   8812     int ret = 0;
   8813 
   8814     if (charmaptranslate_lookup(ch, mapping, &item)) {
   8815         return -1;
   8816     }
   8817 
   8818     if (item == Py_None) {
   8819         /* deletion */
   8820         translate[ch] = 0xfe;
   8821     }
   8822     else if (item == NULL) {
   8823         /* not found => default to 1:1 mapping */
   8824         translate[ch] = ch;
   8825         return 1;
   8826     }
   8827     else if (PyLong_Check(item)) {
   8828         long replace = PyLong_AS_LONG(item);
   8829         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
   8830            used it */
   8831         if (127 < replace) {
   8832             /* invalid character or character outside ASCII:
   8833                skip the fast translate */
   8834             goto exit;
   8835         }
   8836         translate[ch] = (Py_UCS1)replace;
   8837     }
   8838     else if (PyUnicode_Check(item)) {
   8839         Py_UCS4 replace;
   8840 
   8841         if (PyUnicode_READY(item) == -1) {
   8842             Py_DECREF(item);
   8843             return -1;
   8844         }
   8845         if (PyUnicode_GET_LENGTH(item) != 1)
   8846             goto exit;
   8847 
   8848         replace = PyUnicode_READ_CHAR(item, 0);
   8849         if (replace > 127)
   8850             goto exit;
   8851         translate[ch] = (Py_UCS1)replace;
   8852     }
   8853     else {
   8854         /* not None, NULL, long or unicode */
   8855         goto exit;
   8856     }
   8857     ret = 1;
   8858 
   8859   exit:
   8860     Py_DECREF(item);
   8861     return ret;
   8862 }
   8863 
   8864 /* Fast path for ascii => ascii translation. Return 1 if the whole string
   8865    was translated into writer, return 0 if the input string was partially
   8866    translated into writer, raise an exception and return -1 on error. */
   8867 static int
   8868 unicode_fast_translate(PyObject *input, PyObject *mapping,
   8869                        _PyUnicodeWriter *writer, int ignore,
   8870                        Py_ssize_t *input_pos)
   8871 {
   8872     Py_UCS1 ascii_table[128], ch, ch2;
   8873     Py_ssize_t len;
   8874     Py_UCS1 *in, *end, *out;
   8875     int res = 0;
   8876 
   8877     len = PyUnicode_GET_LENGTH(input);
   8878 
   8879     memset(ascii_table, 0xff, 128);
   8880 
   8881     in = PyUnicode_1BYTE_DATA(input);
   8882     end = in + len;
   8883 
   8884     assert(PyUnicode_IS_ASCII(writer->buffer));
   8885     assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
   8886     out = PyUnicode_1BYTE_DATA(writer->buffer);
   8887 
   8888     for (; in < end; in++) {
   8889         ch = *in;
   8890         ch2 = ascii_table[ch];
   8891         if (ch2 == 0xff) {
   8892             int translate = unicode_fast_translate_lookup(mapping, ch,
   8893                                                           ascii_table);
   8894             if (translate < 0)
   8895                 return -1;
   8896             if (translate == 0)
   8897                 goto exit;
   8898             ch2 = ascii_table[ch];
   8899         }
   8900         if (ch2 == 0xfe) {
   8901             if (ignore)
   8902                 continue;
   8903             goto exit;
   8904         }
   8905         assert(ch2 < 128);
   8906         *out = ch2;
   8907         out++;
   8908     }
   8909     res = 1;
   8910 
   8911 exit:
   8912     writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
   8913     *input_pos = in - PyUnicode_1BYTE_DATA(input);
   8914     return res;
   8915 }
   8916 
   8917 static PyObject *
   8918 _PyUnicode_TranslateCharmap(PyObject *input,
   8919                             PyObject *mapping,
   8920                             const char *errors)
   8921 {
   8922     /* input object */
   8923     char *data;
   8924     Py_ssize_t size, i;
   8925     int kind;
   8926     /* output buffer */
   8927     _PyUnicodeWriter writer;
   8928     /* error handler */
   8929     char *reason = "character maps to <undefined>";
   8930     PyObject *errorHandler = NULL;
   8931     PyObject *exc = NULL;
   8932     int ignore;
   8933     int res;
   8934 
   8935     if (mapping == NULL) {
   8936         PyErr_BadArgument();
   8937         return NULL;
   8938     }
   8939 
   8940     if (PyUnicode_READY(input) == -1)
   8941         return NULL;
   8942     data = (char*)PyUnicode_DATA(input);
   8943     kind = PyUnicode_KIND(input);
   8944     size = PyUnicode_GET_LENGTH(input);
   8945 
   8946     if (size == 0)
   8947         return PyUnicode_FromObject(input);
   8948 
   8949     /* allocate enough for a simple 1:1 translation without
   8950        replacements, if we need more, we'll resize */
   8951     _PyUnicodeWriter_Init(&writer);
   8952     if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
   8953         goto onError;
   8954 
   8955     ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
   8956 
   8957     if (PyUnicode_READY(input) == -1)
   8958         return NULL;
   8959     if (PyUnicode_IS_ASCII(input)) {
   8960         res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
   8961         if (res < 0) {
   8962             _PyUnicodeWriter_Dealloc(&writer);
   8963             return NULL;
   8964         }
   8965         if (res == 1)
   8966             return _PyUnicodeWriter_Finish(&writer);
   8967     }
   8968     else {
   8969         i = 0;
   8970     }
   8971 
   8972     while (i<size) {
   8973         /* try to encode it */
   8974         int translate;
   8975         PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
   8976         Py_ssize_t newpos;
   8977         /* startpos for collecting untranslatable chars */
   8978         Py_ssize_t collstart;
   8979         Py_ssize_t collend;
   8980         Py_UCS4 ch;
   8981 
   8982         ch = PyUnicode_READ(kind, data, i);
   8983         translate = charmaptranslate_output(ch, mapping, &writer);
   8984         if (translate < 0)
   8985             goto onError;
   8986 
   8987         if (translate != 0) {
   8988             /* it worked => adjust input pointer */
   8989             ++i;
   8990             continue;
   8991         }
   8992 
   8993         /* untranslatable character */
   8994         collstart = i;
   8995         collend = i+1;
   8996 
   8997         /* find all untranslatable characters */
   8998         while (collend < size) {
   8999             PyObject *x;
   9000             ch = PyUnicode_READ(kind, data, collend);
   9001             if (charmaptranslate_lookup(ch, mapping, &x))
   9002                 goto onError;
   9003             Py_XDECREF(x);
   9004             if (x != Py_None)
   9005                 break;
   9006             ++collend;
   9007         }
   9008 
   9009         if (ignore) {
   9010             i = collend;
   9011         }
   9012         else {
   9013             repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
   9014                                                              reason, input, &exc,
   9015                                                              collstart, collend, &newpos);
   9016             if (repunicode == NULL)
   9017                 goto onError;
   9018             if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
   9019                 Py_DECREF(repunicode);
   9020                 goto onError;
   9021             }
   9022             Py_DECREF(repunicode);
   9023             i = newpos;
   9024         }
   9025     }
   9026     Py_XDECREF(exc);
   9027     Py_XDECREF(errorHandler);
   9028     return _PyUnicodeWriter_Finish(&writer);
   9029 
   9030   onError:
   9031     _PyUnicodeWriter_Dealloc(&writer);
   9032     Py_XDECREF(exc);
   9033     Py_XDECREF(errorHandler);
   9034     return NULL;
   9035 }
   9036 
   9037 /* Deprecated. Use PyUnicode_Translate instead. */
   9038 PyObject *
   9039 PyUnicode_TranslateCharmap(const Py_UNICODE *p,
   9040                            Py_ssize_t size,
   9041                            PyObject *mapping,
   9042                            const char *errors)
   9043 {
   9044     PyObject *result;
   9045     PyObject *unicode = PyUnicode_FromUnicode(p, size);
   9046     if (!unicode)
   9047         return NULL;
   9048     result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
   9049     Py_DECREF(unicode);
   9050     return result;
   9051 }
   9052 
   9053 PyObject *
   9054 PyUnicode_Translate(PyObject *str,
   9055                     PyObject *mapping,
   9056                     const char *errors)
   9057 {
   9058     if (ensure_unicode(str) < 0)
   9059         return NULL;
   9060     return _PyUnicode_TranslateCharmap(str, mapping, errors);
   9061 }
   9062 
   9063 static Py_UCS4
   9064 fix_decimal_and_space_to_ascii(PyObject *self)
   9065 {
   9066     /* No need to call PyUnicode_READY(self) because this function is only
   9067        called as a callback from fixup() which does it already. */
   9068     const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
   9069     const int kind = PyUnicode_KIND(self);
   9070     void *data = PyUnicode_DATA(self);
   9071     Py_UCS4 maxchar = 127, ch, fixed;
   9072     int modified = 0;
   9073     Py_ssize_t i;
   9074 
   9075     for (i = 0; i < len; ++i) {
   9076         ch = PyUnicode_READ(kind, data, i);
   9077         fixed = 0;
   9078         if (ch > 127) {
   9079             if (Py_UNICODE_ISSPACE(ch))
   9080                 fixed = ' ';
   9081             else {
   9082                 const int decimal = Py_UNICODE_TODECIMAL(ch);
   9083                 if (decimal >= 0)
   9084                     fixed = '0' + decimal;
   9085             }
   9086             if (fixed != 0) {
   9087                 modified = 1;
   9088                 maxchar = Py_MAX(maxchar, fixed);
   9089                 PyUnicode_WRITE(kind, data, i, fixed);
   9090             }
   9091             else
   9092                 maxchar = Py_MAX(maxchar, ch);
   9093         }
   9094     }
   9095 
   9096     return (modified) ? maxchar : 0;
   9097 }
   9098 
   9099 PyObject *
   9100 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
   9101 {
   9102     if (!PyUnicode_Check(unicode)) {
   9103         PyErr_BadInternalCall();
   9104         return NULL;
   9105     }
   9106     if (PyUnicode_READY(unicode) == -1)
   9107         return NULL;
   9108     if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
   9109         /* If the string is already ASCII, just return the same string */
   9110         Py_INCREF(unicode);
   9111         return unicode;
   9112     }
   9113     return fixup(unicode, fix_decimal_and_space_to_ascii);
   9114 }
   9115 
   9116 PyObject *
   9117 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
   9118                                   Py_ssize_t length)
   9119 {
   9120     PyObject *decimal;
   9121     Py_ssize_t i;
   9122     Py_UCS4 maxchar;
   9123     enum PyUnicode_Kind kind;
   9124     void *data;
   9125 
   9126     maxchar = 127;
   9127     for (i = 0; i < length; i++) {
   9128         Py_UCS4 ch = s[i];
   9129         if (ch > 127) {
   9130             int decimal = Py_UNICODE_TODECIMAL(ch);
   9131             if (decimal >= 0)
   9132                 ch = '0' + decimal;
   9133             maxchar = Py_MAX(maxchar, ch);
   9134         }
   9135     }
   9136 
   9137     /* Copy to a new string */
   9138     decimal = PyUnicode_New(length, maxchar);
   9139     if (decimal == NULL)
   9140         return decimal;
   9141     kind = PyUnicode_KIND(decimal);
   9142     data = PyUnicode_DATA(decimal);
   9143     /* Iterate over code points */
   9144     for (i = 0; i < length; i++) {
   9145         Py_UCS4 ch = s[i];
   9146         if (ch > 127) {
   9147             int decimal = Py_UNICODE_TODECIMAL(ch);
   9148             if (decimal >= 0)
   9149                 ch = '0' + decimal;
   9150         }
   9151         PyUnicode_WRITE(kind, data, i, ch);
   9152     }
   9153     return unicode_result(decimal);
   9154 }
   9155 /* --- Decimal Encoder ---------------------------------------------------- */
   9156 
   9157 int
   9158 PyUnicode_EncodeDecimal(Py_UNICODE *s,
   9159                         Py_ssize_t length,
   9160                         char *output,
   9161                         const char *errors)
   9162 {
   9163     PyObject *unicode;
   9164     Py_ssize_t i;
   9165     enum PyUnicode_Kind kind;
   9166     void *data;
   9167 
   9168     if (output == NULL) {
   9169         PyErr_BadArgument();
   9170         return -1;
   9171     }
   9172 
   9173     unicode = PyUnicode_FromUnicode(s, length);
   9174     if (unicode == NULL)
   9175         return -1;
   9176 
   9177     if (PyUnicode_READY(unicode) == -1) {
   9178         Py_DECREF(unicode);
   9179         return -1;
   9180     }
   9181     kind = PyUnicode_KIND(unicode);
   9182     data = PyUnicode_DATA(unicode);
   9183 
   9184     for (i=0; i < length; ) {
   9185         PyObject *exc;
   9186         Py_UCS4 ch;
   9187         int decimal;
   9188         Py_ssize_t startpos;
   9189 
   9190         ch = PyUnicode_READ(kind, data, i);
   9191 
   9192         if (Py_UNICODE_ISSPACE(ch)) {
   9193             *output++ = ' ';
   9194             i++;
   9195             continue;
   9196         }
   9197         decimal = Py_UNICODE_TODECIMAL(ch);
   9198         if (decimal >= 0) {
   9199             *output++ = '0' + decimal;
   9200             i++;
   9201             continue;
   9202         }
   9203         if (0 < ch && ch < 256) {
   9204             *output++ = (char)ch;
   9205             i++;
   9206             continue;
   9207         }
   9208 
   9209         startpos = i;
   9210         exc = NULL;
   9211         raise_encode_exception(&exc, "decimal", unicode,
   9212                                startpos, startpos+1,
   9213                                "invalid decimal Unicode string");
   9214         Py_XDECREF(exc);
   9215         Py_DECREF(unicode);
   9216         return -1;
   9217     }
   9218     /* 0-terminate the output string */
   9219     *output++ = '\0';
   9220     Py_DECREF(unicode);
   9221     return 0;
   9222 }
   9223 
   9224 /* --- Helpers ------------------------------------------------------------ */
   9225 
   9226 /* helper macro to fixup start/end slice values */
   9227 #define ADJUST_INDICES(start, end, len)         \
   9228     if (end > len)                              \
   9229         end = len;                              \
   9230     else if (end < 0) {                         \
   9231         end += len;                             \
   9232         if (end < 0)                            \
   9233             end = 0;                            \
   9234     }                                           \
   9235     if (start < 0) {                            \
   9236         start += len;                           \
   9237         if (start < 0)                          \
   9238             start = 0;                          \
   9239     }
   9240 
   9241 static Py_ssize_t
   9242 any_find_slice(PyObject* s1, PyObject* s2,
   9243                Py_ssize_t start,
   9244                Py_ssize_t end,
   9245                int direction)
   9246 {
   9247     int kind1, kind2;
   9248     void *buf1, *buf2;
   9249     Py_ssize_t len1, len2, result;
   9250 
   9251     kind1 = PyUnicode_KIND(s1);
   9252     kind2 = PyUnicode_KIND(s2);
   9253     if (kind1 < kind2)
   9254         return -1;
   9255 
   9256     len1 = PyUnicode_GET_LENGTH(s1);
   9257     len2 = PyUnicode_GET_LENGTH(s2);
   9258     ADJUST_INDICES(start, end, len1);
   9259     if (end - start < len2)
   9260         return -1;
   9261 
   9262     buf1 = PyUnicode_DATA(s1);
   9263     buf2 = PyUnicode_DATA(s2);
   9264     if (len2 == 1) {
   9265         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
   9266         result = findchar((const char *)buf1 + kind1*start,
   9267                           kind1, end - start, ch, direction);
   9268         if (result == -1)
   9269             return -1;
   9270         else
   9271             return start + result;
   9272     }
   9273 
   9274     if (kind2 != kind1) {
   9275         buf2 = _PyUnicode_AsKind(s2, kind1);
   9276         if (!buf2)
   9277             return -2;
   9278     }
   9279 
   9280     if (direction > 0) {
   9281         switch (kind1) {
   9282         case PyUnicode_1BYTE_KIND:
   9283             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
   9284                 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
   9285             else
   9286                 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
   9287             break;
   9288         case PyUnicode_2BYTE_KIND:
   9289             result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
   9290             break;
   9291         case PyUnicode_4BYTE_KIND:
   9292             result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
   9293             break;
   9294         default:
   9295             assert(0); result = -2;
   9296         }
   9297     }
   9298     else {
   9299         switch (kind1) {
   9300         case PyUnicode_1BYTE_KIND:
   9301             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
   9302                 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
   9303             else
   9304                 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
   9305             break;
   9306         case PyUnicode_2BYTE_KIND:
   9307             result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
   9308             break;
   9309         case PyUnicode_4BYTE_KIND:
   9310             result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
   9311             break;
   9312         default:
   9313             assert(0); result = -2;
   9314         }
   9315     }
   9316 
   9317     if (kind2 != kind1)
   9318         PyMem_Free(buf2);
   9319 
   9320     return result;
   9321 }
   9322 
   9323 Py_ssize_t
   9324 _PyUnicode_InsertThousandsGrouping(
   9325     PyObject *unicode, Py_ssize_t index,
   9326     Py_ssize_t n_buffer,
   9327     void *digits, Py_ssize_t n_digits,
   9328     Py_ssize_t min_width,
   9329     const char *grouping, PyObject *thousands_sep,
   9330     Py_UCS4 *maxchar)
   9331 {
   9332     unsigned int kind, thousands_sep_kind;
   9333     char *data, *thousands_sep_data;
   9334     Py_ssize_t thousands_sep_len;
   9335     Py_ssize_t len;
   9336 
   9337     if (unicode != NULL) {
   9338         kind = PyUnicode_KIND(unicode);
   9339         data = (char *) PyUnicode_DATA(unicode) + index * kind;
   9340     }
   9341     else {
   9342         kind = PyUnicode_1BYTE_KIND;
   9343         data = NULL;
   9344     }
   9345     thousands_sep_kind = PyUnicode_KIND(thousands_sep);
   9346     thousands_sep_data = PyUnicode_DATA(thousands_sep);
   9347     thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
   9348     if (unicode != NULL && thousands_sep_kind != kind) {
   9349         if (thousands_sep_kind < kind) {
   9350             thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
   9351             if (!thousands_sep_data)
   9352                 return -1;
   9353         }
   9354         else {
   9355             data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
   9356             if (!data)
   9357                 return -1;
   9358         }
   9359     }
   9360 
   9361     switch (kind) {
   9362     case PyUnicode_1BYTE_KIND:
   9363         if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
   9364             len = asciilib_InsertThousandsGrouping(
   9365                 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
   9366                 min_width, grouping,
   9367                 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
   9368         else
   9369             len = ucs1lib_InsertThousandsGrouping(
   9370                 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
   9371                 min_width, grouping,
   9372                 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
   9373         break;
   9374     case PyUnicode_2BYTE_KIND:
   9375         len = ucs2lib_InsertThousandsGrouping(
   9376             (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
   9377             min_width, grouping,
   9378             (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
   9379         break;
   9380     case PyUnicode_4BYTE_KIND:
   9381         len = ucs4lib_InsertThousandsGrouping(
   9382             (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
   9383             min_width, grouping,
   9384             (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
   9385         break;
   9386     default:
   9387         assert(0);
   9388         return -1;
   9389     }
   9390     if (unicode != NULL && thousands_sep_kind != kind) {
   9391         if (thousands_sep_kind < kind)
   9392             PyMem_Free(thousands_sep_data);
   9393         else
   9394             PyMem_Free(data);
   9395     }
   9396     if (unicode == NULL) {
   9397         *maxchar = 127;
   9398         if (len != n_digits) {
   9399             *maxchar = Py_MAX(*maxchar,
   9400                                    PyUnicode_MAX_CHAR_VALUE(thousands_sep));
   9401         }
   9402     }
   9403     return len;
   9404 }
   9405 
   9406 
   9407 Py_ssize_t
   9408 PyUnicode_Count(PyObject *str,
   9409                 PyObject *substr,
   9410                 Py_ssize_t start,
   9411                 Py_ssize_t end)
   9412 {
   9413     Py_ssize_t result;
   9414     int kind1, kind2;
   9415     void *buf1 = NULL, *buf2 = NULL;
   9416     Py_ssize_t len1, len2;
   9417 
   9418     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
   9419         return -1;
   9420 
   9421     kind1 = PyUnicode_KIND(str);
   9422     kind2 = PyUnicode_KIND(substr);
   9423     if (kind1 < kind2)
   9424         return 0;
   9425 
   9426     len1 = PyUnicode_GET_LENGTH(str);
   9427     len2 = PyUnicode_GET_LENGTH(substr);
   9428     ADJUST_INDICES(start, end, len1);
   9429     if (end - start < len2)
   9430         return 0;
   9431 
   9432     buf1 = PyUnicode_DATA(str);
   9433     buf2 = PyUnicode_DATA(substr);
   9434     if (kind2 != kind1) {
   9435         buf2 = _PyUnicode_AsKind(substr, kind1);
   9436         if (!buf2)
   9437             goto onError;
   9438     }
   9439 
   9440     switch (kind1) {
   9441     case PyUnicode_1BYTE_KIND:
   9442         if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
   9443             result = asciilib_count(
   9444                 ((Py_UCS1*)buf1) + start, end - start,
   9445                 buf2, len2, PY_SSIZE_T_MAX
   9446                 );
   9447         else
   9448             result = ucs1lib_count(
   9449                 ((Py_UCS1*)buf1) + start, end - start,
   9450                 buf2, len2, PY_SSIZE_T_MAX
   9451                 );
   9452         break;
   9453     case PyUnicode_2BYTE_KIND:
   9454         result = ucs2lib_count(
   9455             ((Py_UCS2*)buf1) + start, end - start,
   9456             buf2, len2, PY_SSIZE_T_MAX
   9457             );
   9458         break;
   9459     case PyUnicode_4BYTE_KIND:
   9460         result = ucs4lib_count(
   9461             ((Py_UCS4*)buf1) + start, end - start,
   9462             buf2, len2, PY_SSIZE_T_MAX
   9463             );
   9464         break;
   9465     default:
   9466         assert(0); result = 0;
   9467     }
   9468 
   9469     if (kind2 != kind1)
   9470         PyMem_Free(buf2);
   9471 
   9472     return result;
   9473   onError:
   9474     if (kind2 != kind1 && buf2)
   9475         PyMem_Free(buf2);
   9476     return -1;
   9477 }
   9478 
   9479 Py_ssize_t
   9480 PyUnicode_Find(PyObject *str,
   9481                PyObject *substr,
   9482                Py_ssize_t start,
   9483                Py_ssize_t end,
   9484                int direction)
   9485 {
   9486     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
   9487         return -2;
   9488 
   9489     return any_find_slice(str, substr, start, end, direction);
   9490 }
   9491 
   9492 Py_ssize_t
   9493 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
   9494                    Py_ssize_t start, Py_ssize_t end,
   9495                    int direction)
   9496 {
   9497     int kind;
   9498     Py_ssize_t result;
   9499     if (PyUnicode_READY(str) == -1)
   9500         return -2;
   9501     if (start < 0 || end < 0) {
   9502         PyErr_SetString(PyExc_IndexError, "string index out of range");
   9503         return -2;
   9504     }
   9505     if (end > PyUnicode_GET_LENGTH(str))
   9506         end = PyUnicode_GET_LENGTH(str);
   9507     if (start >= end)
   9508         return -1;
   9509     kind = PyUnicode_KIND(str);
   9510     result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
   9511                       kind, end-start, ch, direction);
   9512     if (result == -1)
   9513         return -1;
   9514     else
   9515         return start + result;
   9516 }
   9517 
   9518 static int
   9519 tailmatch(PyObject *self,
   9520           PyObject *substring,
   9521           Py_ssize_t start,
   9522           Py_ssize_t end,
   9523           int direction)
   9524 {
   9525     int kind_self;
   9526     int kind_sub;
   9527     void *data_self;
   9528     void *data_sub;
   9529     Py_ssize_t offset;
   9530     Py_ssize_t i;
   9531     Py_ssize_t end_sub;
   9532 
   9533     if (PyUnicode_READY(self) == -1 ||
   9534         PyUnicode_READY(substring) == -1)
   9535         return -1;
   9536 
   9537     ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
   9538     end -= PyUnicode_GET_LENGTH(substring);
   9539     if (end < start)
   9540         return 0;
   9541 
   9542     if (PyUnicode_GET_LENGTH(substring) == 0)
   9543         return 1;
   9544 
   9545     kind_self = PyUnicode_KIND(self);
   9546     data_self = PyUnicode_DATA(self);
   9547     kind_sub = PyUnicode_KIND(substring);
   9548     data_sub = PyUnicode_DATA(substring);
   9549     end_sub = PyUnicode_GET_LENGTH(substring) - 1;
   9550 
   9551     if (direction > 0)
   9552         offset = end;
   9553     else
   9554         offset = start;
   9555 
   9556     if (PyUnicode_READ(kind_self, data_self, offset) ==
   9557         PyUnicode_READ(kind_sub, data_sub, 0) &&
   9558         PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
   9559         PyUnicode_READ(kind_sub, data_sub, end_sub)) {
   9560         /* If both are of the same kind, memcmp is sufficient */
   9561         if (kind_self == kind_sub) {
   9562             return ! memcmp((char *)data_self +
   9563                                 (offset * PyUnicode_KIND(substring)),
   9564                             data_sub,
   9565                             PyUnicode_GET_LENGTH(substring) *
   9566                                 PyUnicode_KIND(substring));
   9567         }
   9568         /* otherwise we have to compare each character by first accessing it */
   9569         else {
   9570             /* We do not need to compare 0 and len(substring)-1 because
   9571                the if statement above ensured already that they are equal
   9572                when we end up here. */
   9573             for (i = 1; i < end_sub; ++i) {
   9574                 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
   9575                     PyUnicode_READ(kind_sub, data_sub, i))
   9576                     return 0;
   9577             }
   9578             return 1;
   9579         }
   9580     }
   9581 
   9582     return 0;
   9583 }
   9584 
   9585 Py_ssize_t
   9586 PyUnicode_Tailmatch(PyObject *str,
   9587                     PyObject *substr,
   9588                     Py_ssize_t start,
   9589                     Py_ssize_t end,
   9590                     int direction)
   9591 {
   9592     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
   9593         return -1;
   9594 
   9595     return tailmatch(str, substr, start, end, direction);
   9596 }
   9597 
   9598 /* Apply fixfct filter to the Unicode object self and return a
   9599    reference to the modified object */
   9600 
   9601 static PyObject *
   9602 fixup(PyObject *self,
   9603       Py_UCS4 (*fixfct)(PyObject *s))
   9604 {
   9605     PyObject *u;
   9606     Py_UCS4 maxchar_old, maxchar_new = 0;
   9607     PyObject *v;
   9608 
   9609     u = _PyUnicode_Copy(self);
   9610     if (u == NULL)
   9611         return NULL;
   9612     maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
   9613 
   9614     /* fix functions return the new maximum character in a string,
   9615        if the kind of the resulting unicode object does not change,
   9616        everything is fine.  Otherwise we need to change the string kind
   9617        and re-run the fix function. */
   9618     maxchar_new = fixfct(u);
   9619 
   9620     if (maxchar_new == 0) {
   9621         /* no changes */;
   9622         if (PyUnicode_CheckExact(self)) {
   9623             Py_DECREF(u);
   9624             Py_INCREF(self);
   9625             return self;
   9626         }
   9627         else
   9628             return u;
   9629     }
   9630 
   9631     maxchar_new = align_maxchar(maxchar_new);
   9632 
   9633     if (maxchar_new == maxchar_old)
   9634         return u;
   9635 
   9636     /* In case the maximum character changed, we need to
   9637        convert the string to the new category. */
   9638     v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
   9639     if (v == NULL) {
   9640         Py_DECREF(u);
   9641         return NULL;
   9642     }
   9643     if (maxchar_new > maxchar_old) {
   9644         /* If the maxchar increased so that the kind changed, not all
   9645            characters are representable anymore and we need to fix the
   9646            string again. This only happens in very few cases. */
   9647         _PyUnicode_FastCopyCharacters(v, 0,
   9648                                       self, 0, PyUnicode_GET_LENGTH(self));
   9649         maxchar_old = fixfct(v);
   9650         assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
   9651     }
   9652     else {
   9653         _PyUnicode_FastCopyCharacters(v, 0,
   9654                                       u, 0, PyUnicode_GET_LENGTH(self));
   9655     }
   9656     Py_DECREF(u);
   9657     assert(_PyUnicode_CheckConsistency(v, 1));
   9658     return v;
   9659 }
   9660 
   9661 static PyObject *
   9662 ascii_upper_or_lower(PyObject *self, int lower)
   9663 {
   9664     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
   9665     char *resdata, *data = PyUnicode_DATA(self);
   9666     PyObject *res;
   9667 
   9668     res = PyUnicode_New(len, 127);
   9669     if (res == NULL)
   9670         return NULL;
   9671     resdata = PyUnicode_DATA(res);
   9672     if (lower)
   9673         _Py_bytes_lower(resdata, data, len);
   9674     else
   9675         _Py_bytes_upper(resdata, data, len);
   9676     return res;
   9677 }
   9678 
   9679 static Py_UCS4
   9680 handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
   9681 {
   9682     Py_ssize_t j;
   9683     int final_sigma;
   9684     Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
   9685     /* U+03A3 is in the Final_Sigma context when, it is found like this:
   9686 
   9687      \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
   9688 
   9689     where ! is a negation and \p{xxx} is a character with property xxx.
   9690     */
   9691     for (j = i - 1; j >= 0; j--) {
   9692         c = PyUnicode_READ(kind, data, j);
   9693         if (!_PyUnicode_IsCaseIgnorable(c))
   9694             break;
   9695     }
   9696     final_sigma = j >= 0 && _PyUnicode_IsCased(c);
   9697     if (final_sigma) {
   9698         for (j = i + 1; j < length; j++) {
   9699             c = PyUnicode_READ(kind, data, j);
   9700             if (!_PyUnicode_IsCaseIgnorable(c))
   9701                 break;
   9702         }
   9703         final_sigma = j == length || !_PyUnicode_IsCased(c);
   9704     }
   9705     return (final_sigma) ? 0x3C2 : 0x3C3;
   9706 }
   9707 
   9708 static int
   9709 lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
   9710            Py_UCS4 c, Py_UCS4 *mapped)
   9711 {
   9712     /* Obscure special case. */
   9713     if (c == 0x3A3) {
   9714         mapped[0] = handle_capital_sigma(kind, data, length, i);
   9715         return 1;
   9716     }
   9717     return _PyUnicode_ToLowerFull(c, mapped);
   9718 }
   9719 
   9720 static Py_ssize_t
   9721 do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
   9722 {
   9723     Py_ssize_t i, k = 0;
   9724     int n_res, j;
   9725     Py_UCS4 c, mapped[3];
   9726 
   9727     c = PyUnicode_READ(kind, data, 0);
   9728     n_res = _PyUnicode_ToUpperFull(c, mapped);
   9729     for (j = 0; j < n_res; j++) {
   9730         *maxchar = Py_MAX(*maxchar, mapped[j]);
   9731         res[k++] = mapped[j];
   9732     }
   9733     for (i = 1; i < length; i++) {
   9734         c = PyUnicode_READ(kind, data, i);
   9735         n_res = lower_ucs4(kind, data, length, i, c, mapped);
   9736         for (j = 0; j < n_res; j++) {
   9737             *maxchar = Py_MAX(*maxchar, mapped[j]);
   9738             res[k++] = mapped[j];
   9739         }
   9740     }
   9741     return k;
   9742 }
   9743 
   9744 static Py_ssize_t
   9745 do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
   9746     Py_ssize_t i, k = 0;
   9747 
   9748     for (i = 0; i < length; i++) {
   9749         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
   9750         int n_res, j;
   9751         if (Py_UNICODE_ISUPPER(c)) {
   9752             n_res = lower_ucs4(kind, data, length, i, c, mapped);
   9753         }
   9754         else if (Py_UNICODE_ISLOWER(c)) {
   9755             n_res = _PyUnicode_ToUpperFull(c, mapped);
   9756         }
   9757         else {
   9758             n_res = 1;
   9759             mapped[0] = c;
   9760         }
   9761         for (j = 0; j < n_res; j++) {
   9762             *maxchar = Py_MAX(*maxchar, mapped[j]);
   9763             res[k++] = mapped[j];
   9764         }
   9765     }
   9766     return k;
   9767 }
   9768 
   9769 static Py_ssize_t
   9770 do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
   9771                   Py_UCS4 *maxchar, int lower)
   9772 {
   9773     Py_ssize_t i, k = 0;
   9774 
   9775     for (i = 0; i < length; i++) {
   9776         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
   9777         int n_res, j;
   9778         if (lower)
   9779             n_res = lower_ucs4(kind, data, length, i, c, mapped);
   9780         else
   9781             n_res = _PyUnicode_ToUpperFull(c, mapped);
   9782         for (j = 0; j < n_res; j++) {
   9783             *maxchar = Py_MAX(*maxchar, mapped[j]);
   9784             res[k++] = mapped[j];
   9785         }
   9786     }
   9787     return k;
   9788 }
   9789 
   9790 static Py_ssize_t
   9791 do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
   9792 {
   9793     return do_upper_or_lower(kind, data, length, res, maxchar, 0);
   9794 }
   9795 
   9796 static Py_ssize_t
   9797 do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
   9798 {
   9799     return do_upper_or_lower(kind, data, length, res, maxchar, 1);
   9800 }
   9801 
   9802 static Py_ssize_t
   9803 do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
   9804 {
   9805     Py_ssize_t i, k = 0;
   9806 
   9807     for (i = 0; i < length; i++) {
   9808         Py_UCS4 c = PyUnicode_READ(kind, data, i);
   9809         Py_UCS4 mapped[3];
   9810         int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
   9811         for (j = 0; j < n_res; j++) {
   9812             *maxchar = Py_MAX(*maxchar, mapped[j]);
   9813             res[k++] = mapped[j];
   9814         }
   9815     }
   9816     return k;
   9817 }
   9818 
   9819 static Py_ssize_t
   9820 do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
   9821 {
   9822     Py_ssize_t i, k = 0;
   9823     int previous_is_cased;
   9824 
   9825     previous_is_cased = 0;
   9826     for (i = 0; i < length; i++) {
   9827         const Py_UCS4 c = PyUnicode_READ(kind, data, i);
   9828         Py_UCS4 mapped[3];
   9829         int n_res, j;
   9830 
   9831         if (previous_is_cased)
   9832             n_res = lower_ucs4(kind, data, length, i, c, mapped);
   9833         else
   9834             n_res = _PyUnicode_ToTitleFull(c, mapped);
   9835 
   9836         for (j = 0; j < n_res; j++) {
   9837             *maxchar = Py_MAX(*maxchar, mapped[j]);
   9838             res[k++] = mapped[j];
   9839         }
   9840 
   9841         previous_is_cased = _PyUnicode_IsCased(c);
   9842     }
   9843     return k;
   9844 }
   9845 
   9846 static PyObject *
   9847 case_operation(PyObject *self,
   9848                Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
   9849 {
   9850     PyObject *res = NULL;
   9851     Py_ssize_t length, newlength = 0;
   9852     int kind, outkind;
   9853     void *data, *outdata;
   9854     Py_UCS4 maxchar = 0, *tmp, *tmpend;
   9855 
   9856     assert(PyUnicode_IS_READY(self));
   9857 
   9858     kind = PyUnicode_KIND(self);
   9859     data = PyUnicode_DATA(self);
   9860     length = PyUnicode_GET_LENGTH(self);
   9861     if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
   9862         PyErr_SetString(PyExc_OverflowError, "string is too long");
   9863         return NULL;
   9864     }
   9865     tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
   9866     if (tmp == NULL)
   9867         return PyErr_NoMemory();
   9868     newlength = perform(kind, data, length, tmp, &maxchar);
   9869     res = PyUnicode_New(newlength, maxchar);
   9870     if (res == NULL)
   9871         goto leave;
   9872     tmpend = tmp + newlength;
   9873     outdata = PyUnicode_DATA(res);
   9874     outkind = PyUnicode_KIND(res);
   9875     switch (outkind) {
   9876     case PyUnicode_1BYTE_KIND:
   9877         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
   9878         break;
   9879     case PyUnicode_2BYTE_KIND:
   9880         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
   9881         break;
   9882     case PyUnicode_4BYTE_KIND:
   9883         memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
   9884         break;
   9885     default:
   9886         assert(0);
   9887         break;
   9888     }
   9889   leave:
   9890     PyMem_FREE(tmp);
   9891     return res;
   9892 }
   9893 
   9894 PyObject *
   9895 PyUnicode_Join(PyObject *separator, PyObject *seq)
   9896 {
   9897     PyObject *res;
   9898     PyObject *fseq;
   9899     Py_ssize_t seqlen;
   9900     PyObject **items;
   9901 
   9902     fseq = PySequence_Fast(seq, "can only join an iterable");
   9903     if (fseq == NULL) {
   9904         return NULL;
   9905     }
   9906 
   9907     /* NOTE: the following code can't call back into Python code,
   9908      * so we are sure that fseq won't be mutated.
   9909      */
   9910 
   9911     items = PySequence_Fast_ITEMS(fseq);
   9912     seqlen = PySequence_Fast_GET_SIZE(fseq);
   9913     res = _PyUnicode_JoinArray(separator, items, seqlen);
   9914     Py_DECREF(fseq);
   9915     return res;
   9916 }
   9917 
   9918 PyObject *
   9919 _PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
   9920 {
   9921     PyObject *res = NULL; /* the result */
   9922     PyObject *sep = NULL;
   9923     Py_ssize_t seplen;
   9924     PyObject *item;
   9925     Py_ssize_t sz, i, res_offset;
   9926     Py_UCS4 maxchar;
   9927     Py_UCS4 item_maxchar;
   9928     int use_memcpy;
   9929     unsigned char *res_data = NULL, *sep_data = NULL;
   9930     PyObject *last_obj;
   9931     unsigned int kind = 0;
   9932 
   9933     /* If empty sequence, return u"". */
   9934     if (seqlen == 0) {
   9935         _Py_RETURN_UNICODE_EMPTY();
   9936     }
   9937 
   9938     /* If singleton sequence with an exact Unicode, return that. */
   9939     last_obj = NULL;
   9940     if (seqlen == 1) {
   9941         if (PyUnicode_CheckExact(items[0])) {
   9942             res = items[0];
   9943             Py_INCREF(res);
   9944             return res;
   9945         }
   9946         seplen = 0;
   9947         maxchar = 0;
   9948     }
   9949     else {
   9950         /* Set up sep and seplen */
   9951         if (separator == NULL) {
   9952             /* fall back to a blank space separator */
   9953             sep = PyUnicode_FromOrdinal(' ');
   9954             if (!sep)
   9955                 goto onError;
   9956             seplen = 1;
   9957             maxchar = 32;
   9958         }
   9959         else {
   9960             if (!PyUnicode_Check(separator)) {
   9961                 PyErr_Format(PyExc_TypeError,
   9962                              "separator: expected str instance,"
   9963                              " %.80s found",
   9964                              Py_TYPE(separator)->tp_name);
   9965                 goto onError;
   9966             }
   9967             if (PyUnicode_READY(separator))
   9968                 goto onError;
   9969             sep = separator;
   9970             seplen = PyUnicode_GET_LENGTH(separator);
   9971             maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
   9972             /* inc refcount to keep this code path symmetric with the
   9973                above case of a blank separator */
   9974             Py_INCREF(sep);
   9975         }
   9976         last_obj = sep;
   9977     }
   9978 
   9979     /* There are at least two things to join, or else we have a subclass
   9980      * of str in the sequence.
   9981      * Do a pre-pass to figure out the total amount of space we'll
   9982      * need (sz), and see whether all argument are strings.
   9983      */
   9984     sz = 0;
   9985 #ifdef Py_DEBUG
   9986     use_memcpy = 0;
   9987 #else
   9988     use_memcpy = 1;
   9989 #endif
   9990     for (i = 0; i < seqlen; i++) {
   9991         size_t add_sz;
   9992         item = items[i];
   9993         if (!PyUnicode_Check(item)) {
   9994             PyErr_Format(PyExc_TypeError,
   9995                          "sequence item %zd: expected str instance,"
   9996                          " %.80s found",
   9997                          i, Py_TYPE(item)->tp_name);
   9998             goto onError;
   9999         }
   10000         if (PyUnicode_READY(item) == -1)
   10001             goto onError;
   10002         add_sz = PyUnicode_GET_LENGTH(item);
   10003         item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
   10004         maxchar = Py_MAX(maxchar, item_maxchar);
   10005         if (i != 0) {
   10006             add_sz += seplen;
   10007         }
   10008         if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
   10009             PyErr_SetString(PyExc_OverflowError,
   10010                             "join() result is too long for a Python string");
   10011             goto onError;
   10012         }
   10013         sz += add_sz;
   10014         if (use_memcpy && last_obj != NULL) {
   10015             if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
   10016                 use_memcpy = 0;
   10017         }
   10018         last_obj = item;
   10019     }
   10020 
   10021     res = PyUnicode_New(sz, maxchar);
   10022     if (res == NULL)
   10023         goto onError;
   10024 
   10025     /* Catenate everything. */
   10026 #ifdef Py_DEBUG
   10027     use_memcpy = 0;
   10028 #else
   10029     if (use_memcpy) {
   10030         res_data = PyUnicode_1BYTE_DATA(res);
   10031         kind = PyUnicode_KIND(res);
   10032         if (seplen != 0)
   10033             sep_data = PyUnicode_1BYTE_DATA(sep);
   10034     }
   10035 #endif
   10036     if (use_memcpy) {
   10037         for (i = 0; i < seqlen; ++i) {
   10038             Py_ssize_t itemlen;
   10039             item = items[i];
   10040 
   10041             /* Copy item, and maybe the separator. */
   10042             if (i && seplen != 0) {
   10043                 memcpy(res_data,
   10044                           sep_data,
   10045                           kind * seplen);
   10046                 res_data += kind * seplen;
   10047             }
   10048 
   10049             itemlen = PyUnicode_GET_LENGTH(item);
   10050             if (itemlen != 0) {
   10051                 memcpy(res_data,
   10052                           PyUnicode_DATA(item),
   10053                           kind * itemlen);
   10054                 res_data += kind * itemlen;
   10055             }
   10056         }
   10057         assert(res_data == PyUnicode_1BYTE_DATA(res)
   10058                            + kind * PyUnicode_GET_LENGTH(res));
   10059     }
   10060     else {
   10061         for (i = 0, res_offset = 0; i < seqlen; ++i) {
   10062             Py_ssize_t itemlen;
   10063             item = items[i];
   10064 
   10065             /* Copy item, and maybe the separator. */
   10066             if (i && seplen != 0) {
   10067                 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
   10068                 res_offset += seplen;
   10069             }
   10070 
   10071             itemlen = PyUnicode_GET_LENGTH(item);
   10072             if (itemlen != 0) {
   10073                 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
   10074                 res_offset += itemlen;
   10075             }
   10076         }
   10077         assert(res_offset == PyUnicode_GET_LENGTH(res));
   10078     }
   10079 
   10080     Py_XDECREF(sep);
   10081     assert(_PyUnicode_CheckConsistency(res, 1));
   10082     return res;
   10083 
   10084   onError:
   10085     Py_XDECREF(sep);
   10086     Py_XDECREF(res);
   10087     return NULL;
   10088 }
   10089 
   10090 #define FILL(kind, data, value, start, length) \
   10091     do { \
   10092         Py_ssize_t i_ = 0; \
   10093         assert(kind != PyUnicode_WCHAR_KIND); \
   10094         switch ((kind)) { \
   10095         case PyUnicode_1BYTE_KIND: { \
   10096             unsigned char * to_ = (unsigned char *)((data)) + (start); \
   10097             memset(to_, (unsigned char)value, (length)); \
   10098             break; \
   10099         } \
   10100         case PyUnicode_2BYTE_KIND: { \
   10101             Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
   10102             for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
   10103             break; \
   10104         } \
   10105         case PyUnicode_4BYTE_KIND: { \
   10106             Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
   10107             for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
   10108             break; \
   10109         } \
   10110         default: assert(0); \
   10111         } \
   10112     } while (0)
   10113 
   10114 void
   10115 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
   10116                     Py_UCS4 fill_char)
   10117 {
   10118     const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
   10119     const void *data = PyUnicode_DATA(unicode);
   10120     assert(PyUnicode_IS_READY(unicode));
   10121     assert(unicode_modifiable(unicode));
   10122     assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
   10123     assert(start >= 0);
   10124     assert(start + length <= PyUnicode_GET_LENGTH(unicode));
   10125     FILL(kind, data, fill_char, start, length);
   10126 }
   10127 
   10128 Py_ssize_t
   10129 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
   10130                Py_UCS4 fill_char)
   10131 {
   10132     Py_ssize_t maxlen;
   10133 
   10134     if (!PyUnicode_Check(unicode)) {
   10135         PyErr_BadInternalCall();
   10136         return -1;
   10137     }
   10138     if (PyUnicode_READY(unicode) == -1)
   10139         return -1;
   10140     if (unicode_check_modifiable(unicode))
   10141         return -1;
   10142 
   10143     if (start < 0) {
   10144         PyErr_SetString(PyExc_IndexError, "string index out of range");
   10145         return -1;
   10146     }
   10147     if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
   10148         PyErr_SetString(PyExc_ValueError,
   10149                          "fill character is bigger than "
   10150                          "the string maximum character");
   10151         return -1;
   10152     }
   10153 
   10154     maxlen = PyUnicode_GET_LENGTH(unicode) - start;
   10155     length = Py_MIN(maxlen, length);
   10156     if (length <= 0)
   10157         return 0;
   10158 
   10159     _PyUnicode_FastFill(unicode, start, length, fill_char);
   10160     return length;
   10161 }
   10162 
   10163 static PyObject *
   10164 pad(PyObject *self,
   10165     Py_ssize_t left,
   10166     Py_ssize_t right,
   10167     Py_UCS4 fill)
   10168 {
   10169     PyObject *u;
   10170     Py_UCS4 maxchar;
   10171     int kind;
   10172     void *data;
   10173 
   10174     if (left < 0)
   10175         left = 0;
   10176     if (right < 0)
   10177         right = 0;
   10178 
   10179     if (left == 0 && right == 0)
   10180         return unicode_result_unchanged(self);
   10181 
   10182     if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
   10183         right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
   10184         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
   10185         return NULL;
   10186     }
   10187     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
   10188     maxchar = Py_MAX(maxchar, fill);
   10189     u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
   10190     if (!u)
   10191         return NULL;
   10192 
   10193     kind = PyUnicode_KIND(u);
   10194     data = PyUnicode_DATA(u);
   10195     if (left)
   10196         FILL(kind, data, fill, 0, left);
   10197     if (right)
   10198         FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
   10199     _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
   10200     assert(_PyUnicode_CheckConsistency(u, 1));
   10201     return u;
   10202 }
   10203 
   10204 PyObject *
   10205 PyUnicode_Splitlines(PyObject *string, int keepends)
   10206 {
   10207     PyObject *list;
   10208 
   10209     if (ensure_unicode(string) < 0)
   10210         return NULL;
   10211 
   10212     switch (PyUnicode_KIND(string)) {
   10213     case PyUnicode_1BYTE_KIND:
   10214         if (PyUnicode_IS_ASCII(string))
   10215             list = asciilib_splitlines(
   10216                 string, PyUnicode_1BYTE_DATA(string),
   10217                 PyUnicode_GET_LENGTH(string), keepends);
   10218         else
   10219             list = ucs1lib_splitlines(
   10220                 string, PyUnicode_1BYTE_DATA(string),
   10221                 PyUnicode_GET_LENGTH(string), keepends);
   10222         break;
   10223     case PyUnicode_2BYTE_KIND:
   10224         list = ucs2lib_splitlines(
   10225             string, PyUnicode_2BYTE_DATA(string),
   10226             PyUnicode_GET_LENGTH(string), keepends);
   10227         break;
   10228     case PyUnicode_4BYTE_KIND:
   10229         list = ucs4lib_splitlines(
   10230             string, PyUnicode_4BYTE_DATA(string),
   10231             PyUnicode_GET_LENGTH(string), keepends);
   10232         break;
   10233     default:
   10234         assert(0);
   10235         list = 0;
   10236     }
   10237     return list;
   10238 }
   10239 
   10240 static PyObject *
   10241 split(PyObject *self,
   10242       PyObject *substring,
   10243       Py_ssize_t maxcount)
   10244 {
   10245     int kind1, kind2;
   10246     void *buf1, *buf2;
   10247     Py_ssize_t len1, len2;
   10248     PyObject* out;
   10249 
   10250     if (maxcount < 0)
   10251         maxcount = PY_SSIZE_T_MAX;
   10252 
   10253     if (PyUnicode_READY(self) == -1)
   10254         return NULL;
   10255 
   10256     if (substring == NULL)
   10257         switch (PyUnicode_KIND(self)) {
   10258         case PyUnicode_1BYTE_KIND:
   10259             if (PyUnicode_IS_ASCII(self))
   10260                 return asciilib_split_whitespace(
   10261                     self,  PyUnicode_1BYTE_DATA(self),
   10262                     PyUnicode_GET_LENGTH(self), maxcount
   10263                     );
   10264             else
   10265                 return ucs1lib_split_whitespace(
   10266                     self,  PyUnicode_1BYTE_DATA(self),
   10267                     PyUnicode_GET_LENGTH(self), maxcount
   10268                     );
   10269         case PyUnicode_2BYTE_KIND:
   10270             return ucs2lib_split_whitespace(
   10271                 self,  PyUnicode_2BYTE_DATA(self),
   10272                 PyUnicode_GET_LENGTH(self), maxcount
   10273                 );
   10274         case PyUnicode_4BYTE_KIND:
   10275             return ucs4lib_split_whitespace(
   10276                 self,  PyUnicode_4BYTE_DATA(self),
   10277                 PyUnicode_GET_LENGTH(self), maxcount
   10278                 );
   10279         default:
   10280             assert(0);
   10281             return NULL;
   10282         }
   10283 
   10284     if (PyUnicode_READY(substring) == -1)
   10285         return NULL;
   10286 
   10287     kind1 = PyUnicode_KIND(self);
   10288     kind2 = PyUnicode_KIND(substring);
   10289     len1 = PyUnicode_GET_LENGTH(self);
   10290     len2 = PyUnicode_GET_LENGTH(substring);
   10291     if (kind1 < kind2 || len1 < len2) {
   10292         out = PyList_New(1);
   10293         if (out == NULL)
   10294             return NULL;
   10295         Py_INCREF(self);
   10296         PyList_SET_ITEM(out, 0, self);
   10297         return out;
   10298     }
   10299     buf1 = PyUnicode_DATA(self);
   10300     buf2 = PyUnicode_DATA(substring);
   10301     if (kind2 != kind1) {
   10302         buf2 = _PyUnicode_AsKind(substring, kind1);
   10303         if (!buf2)
   10304             return NULL;
   10305     }
   10306 
   10307     switch (kind1) {
   10308     case PyUnicode_1BYTE_KIND:
   10309         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
   10310             out = asciilib_split(
   10311                 self,  buf1, len1, buf2, len2, maxcount);
   10312         else
   10313             out = ucs1lib_split(
   10314                 self,  buf1, len1, buf2, len2, maxcount);
   10315         break;
   10316     case PyUnicode_2BYTE_KIND:
   10317         out = ucs2lib_split(
   10318             self,  buf1, len1, buf2, len2, maxcount);
   10319         break;
   10320     case PyUnicode_4BYTE_KIND:
   10321         out = ucs4lib_split(
   10322             self,  buf1, len1, buf2, len2, maxcount);
   10323         break;
   10324     default:
   10325         out = NULL;
   10326     }
   10327     if (kind2 != kind1)
   10328         PyMem_Free(buf2);
   10329     return out;
   10330 }
   10331 
   10332 static PyObject *
   10333 rsplit(PyObject *self,
   10334        PyObject *substring,
   10335        Py_ssize_t maxcount)
   10336 {
   10337     int kind1, kind2;
   10338     void *buf1, *buf2;
   10339     Py_ssize_t len1, len2;
   10340     PyObject* out;
   10341 
   10342     if (maxcount < 0)
   10343         maxcount = PY_SSIZE_T_MAX;
   10344 
   10345     if (PyUnicode_READY(self) == -1)
   10346         return NULL;
   10347 
   10348     if (substring == NULL)
   10349         switch (PyUnicode_KIND(self)) {
   10350         case PyUnicode_1BYTE_KIND:
   10351             if (PyUnicode_IS_ASCII(self))
   10352                 return asciilib_rsplit_whitespace(
   10353                     self,  PyUnicode_1BYTE_DATA(self),
   10354                     PyUnicode_GET_LENGTH(self), maxcount
   10355                     );
   10356             else
   10357                 return ucs1lib_rsplit_whitespace(
   10358                     self,  PyUnicode_1BYTE_DATA(self),
   10359                     PyUnicode_GET_LENGTH(self), maxcount
   10360                     );
   10361         case PyUnicode_2BYTE_KIND:
   10362             return ucs2lib_rsplit_whitespace(
   10363                 self,  PyUnicode_2BYTE_DATA(self),
   10364                 PyUnicode_GET_LENGTH(self), maxcount
   10365                 );
   10366         case PyUnicode_4BYTE_KIND:
   10367             return ucs4lib_rsplit_whitespace(
   10368                 self,  PyUnicode_4BYTE_DATA(self),
   10369                 PyUnicode_GET_LENGTH(self), maxcount
   10370                 );
   10371         default:
   10372             assert(0);
   10373             return NULL;
   10374         }
   10375 
   10376     if (PyUnicode_READY(substring) == -1)
   10377         return NULL;
   10378 
   10379     kind1 = PyUnicode_KIND(self);
   10380     kind2 = PyUnicode_KIND(substring);
   10381     len1 = PyUnicode_GET_LENGTH(self);
   10382     len2 = PyUnicode_GET_LENGTH(substring);
   10383     if (kind1 < kind2 || len1 < len2) {
   10384         out = PyList_New(1);
   10385         if (out == NULL)
   10386             return NULL;
   10387         Py_INCREF(self);
   10388         PyList_SET_ITEM(out, 0, self);
   10389         return out;
   10390     }
   10391     buf1 = PyUnicode_DATA(self);
   10392     buf2 = PyUnicode_DATA(substring);
   10393     if (kind2 != kind1) {
   10394         buf2 = _PyUnicode_AsKind(substring, kind1);
   10395         if (!buf2)
   10396             return NULL;
   10397     }
   10398 
   10399     switch (kind1) {
   10400     case PyUnicode_1BYTE_KIND:
   10401         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
   10402             out = asciilib_rsplit(
   10403                 self,  buf1, len1, buf2, len2, maxcount);
   10404         else
   10405             out = ucs1lib_rsplit(
   10406                 self,  buf1, len1, buf2, len2, maxcount);
   10407         break;
   10408     case PyUnicode_2BYTE_KIND:
   10409         out = ucs2lib_rsplit(
   10410             self,  buf1, len1, buf2, len2, maxcount);
   10411         break;
   10412     case PyUnicode_4BYTE_KIND:
   10413         out = ucs4lib_rsplit(
   10414             self,  buf1, len1, buf2, len2, maxcount);
   10415         break;
   10416     default:
   10417         out = NULL;
   10418     }
   10419     if (kind2 != kind1)
   10420         PyMem_Free(buf2);
   10421     return out;
   10422 }
   10423 
   10424 static Py_ssize_t
   10425 anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
   10426             PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
   10427 {
   10428     switch (kind) {
   10429     case PyUnicode_1BYTE_KIND:
   10430         if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
   10431             return asciilib_find(buf1, len1, buf2, len2, offset);
   10432         else
   10433             return ucs1lib_find(buf1, len1, buf2, len2, offset);
   10434     case PyUnicode_2BYTE_KIND:
   10435         return ucs2lib_find(buf1, len1, buf2, len2, offset);
   10436     case PyUnicode_4BYTE_KIND:
   10437         return ucs4lib_find(buf1, len1, buf2, len2, offset);
   10438     }
   10439     assert(0);
   10440     return -1;
   10441 }
   10442 
   10443 static Py_ssize_t
   10444 anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
   10445              PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
   10446 {
   10447     switch (kind) {
   10448     case PyUnicode_1BYTE_KIND:
   10449         if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
   10450             return asciilib_count(sbuf, slen, buf1, len1, maxcount);
   10451         else
   10452             return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
   10453     case PyUnicode_2BYTE_KIND:
   10454         return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
   10455     case PyUnicode_4BYTE_KIND:
   10456         return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
   10457     }
   10458     assert(0);
   10459     return 0;
   10460 }
   10461 
   10462 static void
   10463 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
   10464                       Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
   10465 {
   10466     int kind = PyUnicode_KIND(u);
   10467     void *data = PyUnicode_DATA(u);
   10468     Py_ssize_t len = PyUnicode_GET_LENGTH(u);
   10469     if (kind == PyUnicode_1BYTE_KIND) {
   10470         ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
   10471                                       (Py_UCS1 *)data + len,
   10472                                       u1, u2, maxcount);
   10473     }
   10474     else if (kind == PyUnicode_2BYTE_KIND) {
   10475         ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
   10476                                       (Py_UCS2 *)data + len,
   10477                                       u1, u2, maxcount);
   10478     }
   10479     else {
   10480         assert(kind == PyUnicode_4BYTE_KIND);
   10481         ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
   10482                                       (Py_UCS4 *)data + len,
   10483                                       u1, u2, maxcount);
   10484     }
   10485 }
   10486 
   10487 static PyObject *
   10488 replace(PyObject *self, PyObject *str1,
   10489         PyObject *str2, Py_ssize_t maxcount)
   10490 {
   10491     PyObject *u;
   10492     char *sbuf = PyUnicode_DATA(self);
   10493     char *buf1 = PyUnicode_DATA(str1);
   10494     char *buf2 = PyUnicode_DATA(str2);
   10495     int srelease = 0, release1 = 0, release2 = 0;
   10496     int skind = PyUnicode_KIND(self);
   10497     int kind1 = PyUnicode_KIND(str1);
   10498     int kind2 = PyUnicode_KIND(str2);
   10499     Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
   10500     Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
   10501     Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
   10502     int mayshrink;
   10503     Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
   10504 
   10505     if (maxcount < 0)
   10506         maxcount = PY_SSIZE_T_MAX;
   10507     else if (maxcount == 0 || slen == 0)
   10508         goto nothing;
   10509 
   10510     if (str1 == str2)
   10511         goto nothing;
   10512 
   10513     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
   10514     maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
   10515     if (maxchar < maxchar_str1)
   10516         /* substring too wide to be present */
   10517         goto nothing;
   10518     maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
   10519     /* Replacing str1 with str2 may cause a maxchar reduction in the
   10520        result string. */
   10521     mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
   10522     maxchar = Py_MAX(maxchar, maxchar_str2);
   10523 
   10524     if (len1 == len2) {
   10525         /* same length */
   10526         if (len1 == 0)
   10527             goto nothing;
   10528         if (len1 == 1) {
   10529             /* replace characters */
   10530             Py_UCS4 u1, u2;
   10531             Py_ssize_t pos;
   10532 
   10533             u1 = PyUnicode_READ(kind1, buf1, 0);
   10534             pos = findchar(sbuf, skind, slen, u1, 1);
   10535             if (pos < 0)
   10536                 goto nothing;
   10537             u2 = PyUnicode_READ(kind2, buf2, 0);
   10538             u = PyUnicode_New(slen, maxchar);
   10539             if (!u)
   10540                 goto error;
   10541 
   10542             _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
   10543             replace_1char_inplace(u, pos, u1, u2, maxcount);
   10544         }
   10545         else {
   10546             int rkind = skind;
   10547             char *res;
   10548             Py_ssize_t i;
   10549 
   10550             if (kind1 < rkind) {
   10551                 /* widen substring */
   10552                 buf1 = _PyUnicode_AsKind(str1, rkind);
   10553                 if (!buf1) goto error;
   10554                 release1 = 1;
   10555             }
   10556             i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
   10557             if (i < 0)
   10558                 goto nothing;
   10559             if (rkind > kind2) {
   10560                 /* widen replacement */
   10561                 buf2 = _PyUnicode_AsKind(str2, rkind);
   10562                 if (!buf2) goto error;
   10563                 release2 = 1;
   10564             }
   10565             else if (rkind < kind2) {
   10566                 /* widen self and buf1 */
   10567                 rkind = kind2;
   10568                 if (release1) PyMem_Free(buf1);
   10569                 release1 = 0;
   10570                 sbuf = _PyUnicode_AsKind(self, rkind);
   10571                 if (!sbuf) goto error;
   10572                 srelease = 1;
   10573                 buf1 = _PyUnicode_AsKind(str1, rkind);
   10574                 if (!buf1) goto error;
   10575                 release1 = 1;
   10576             }
   10577             u = PyUnicode_New(slen, maxchar);
   10578             if (!u)
   10579                 goto error;
   10580             assert(PyUnicode_KIND(u) == rkind);
   10581             res = PyUnicode_DATA(u);
   10582 
   10583             memcpy(res, sbuf, rkind * slen);
   10584             /* change everything in-place, starting with this one */
   10585             memcpy(res + rkind * i,
   10586                    buf2,
   10587                    rkind * len2);
   10588             i += len1;
   10589 
   10590             while ( --maxcount > 0) {
   10591                 i = anylib_find(rkind, self,
   10592                                 sbuf+rkind*i, slen-i,
   10593                                 str1, buf1, len1, i);
   10594                 if (i == -1)
   10595                     break;
   10596                 memcpy(res + rkind * i,
   10597                        buf2,
   10598                        rkind * len2);
   10599                 i += len1;
   10600             }
   10601         }
   10602     }
   10603     else {
   10604         Py_ssize_t n, i, j, ires;
   10605         Py_ssize_t new_size;
   10606         int rkind = skind;
   10607         char *res;
   10608 
   10609         if (kind1 < rkind) {
   10610             /* widen substring */
   10611             buf1 = _PyUnicode_AsKind(str1, rkind);
   10612             if (!buf1) goto error;
   10613             release1 = 1;
   10614         }
   10615         n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
   10616         if (n == 0)
   10617             goto nothing;
   10618         if (kind2 < rkind) {
   10619             /* widen replacement */
   10620             buf2 = _PyUnicode_AsKind(str2, rkind);
   10621             if (!buf2) goto error;
   10622             release2 = 1;
   10623         }
   10624         else if (kind2 > rkind) {
   10625             /* widen self and buf1 */
   10626             rkind = kind2;
   10627             sbuf = _PyUnicode_AsKind(self, rkind);
   10628             if (!sbuf) goto error;
   10629             srelease = 1;
   10630             if (release1) PyMem_Free(buf1);
   10631             release1 = 0;
   10632             buf1 = _PyUnicode_AsKind(str1, rkind);
   10633             if (!buf1) goto error;
   10634             release1 = 1;
   10635         }
   10636         /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
   10637            PyUnicode_GET_LENGTH(str1))); */
   10638         if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
   10639                 PyErr_SetString(PyExc_OverflowError,
   10640                                 "replace string is too long");
   10641                 goto error;
   10642         }
   10643         new_size = slen + n * (len2 - len1);
   10644         if (new_size == 0) {
   10645             _Py_INCREF_UNICODE_EMPTY();
   10646             if (!unicode_empty)
   10647                 goto error;
   10648             u = unicode_empty;
   10649             goto done;
   10650         }
   10651         if (new_size > (PY_SSIZE_T_MAX / rkind)) {
   10652             PyErr_SetString(PyExc_OverflowError,
   10653                             "replace string is too long");
   10654             goto error;
   10655         }
   10656         u = PyUnicode_New(new_size, maxchar);
   10657         if (!u)
   10658             goto error;
   10659         assert(PyUnicode_KIND(u) == rkind);
   10660         res = PyUnicode_DATA(u);
   10661         ires = i = 0;
   10662         if (len1 > 0) {
   10663             while (n-- > 0) {
   10664                 /* look for next match */
   10665                 j = anylib_find(rkind, self,
   10666                                 sbuf + rkind * i, slen-i,
   10667                                 str1, buf1, len1, i);
   10668                 if (j == -1)
   10669                     break;
   10670                 else if (j > i) {
   10671                     /* copy unchanged part [i:j] */
   10672                     memcpy(res + rkind * ires,
   10673                            sbuf + rkind * i,
   10674                            rkind * (j-i));
   10675                     ires += j - i;
   10676                 }
   10677                 /* copy substitution string */
   10678                 if (len2 > 0) {
   10679                     memcpy(res + rkind * ires,
   10680                            buf2,
   10681                            rkind * len2);
   10682                     ires += len2;
   10683                 }
   10684                 i = j + len1;
   10685             }
   10686             if (i < slen)
   10687                 /* copy tail [i:] */
   10688                 memcpy(res + rkind * ires,
   10689                        sbuf + rkind * i,
   10690                        rkind * (slen-i));
   10691         }
   10692         else {
   10693             /* interleave */
   10694             while (n > 0) {
   10695                 memcpy(res + rkind * ires,
   10696                        buf2,
   10697                        rkind * len2);
   10698                 ires += len2;
   10699                 if (--n <= 0)
   10700                     break;
   10701                 memcpy(res + rkind * ires,
   10702                        sbuf + rkind * i,
   10703                        rkind);
   10704                 ires++;
   10705                 i++;
   10706             }
   10707             memcpy(res + rkind * ires,
   10708                    sbuf + rkind * i,
   10709                    rkind * (slen-i));
   10710         }
   10711     }
   10712 
   10713     if (mayshrink) {
   10714         unicode_adjust_maxchar(&u);
   10715         if (u == NULL)
   10716             goto error;
   10717     }
   10718 
   10719   done:
   10720     if (srelease)
   10721         PyMem_FREE(sbuf);
   10722     if (release1)
   10723         PyMem_FREE(buf1);
   10724     if (release2)
   10725         PyMem_FREE(buf2);
   10726     assert(_PyUnicode_CheckConsistency(u, 1));
   10727     return u;
   10728 
   10729   nothing:
   10730     /* nothing to replace; return original string (when possible) */
   10731     if (srelease)
   10732         PyMem_FREE(sbuf);
   10733     if (release1)
   10734         PyMem_FREE(buf1);
   10735     if (release2)
   10736         PyMem_FREE(buf2);
   10737     return unicode_result_unchanged(self);
   10738 
   10739   error:
   10740     if (srelease && sbuf)
   10741         PyMem_FREE(sbuf);
   10742     if (release1 && buf1)
   10743         PyMem_FREE(buf1);
   10744     if (release2 && buf2)
   10745         PyMem_FREE(buf2);
   10746     return NULL;
   10747 }
   10748 
   10749 /* --- Unicode Object Methods --------------------------------------------- */
   10750 
   10751 PyDoc_STRVAR(title__doc__,
   10752              "S.title() -> str\n\
   10753 \n\
   10754 Return a titlecased version of S, i.e. words start with title case\n\
   10755 characters, all remaining cased characters have lower case.");
   10756 
   10757 static PyObject*
   10758 unicode_title(PyObject *self)
   10759 {
   10760     if (PyUnicode_READY(self) == -1)
   10761         return NULL;
   10762     return case_operation(self, do_title);
   10763 }
   10764 
   10765 PyDoc_STRVAR(capitalize__doc__,
   10766              "S.capitalize() -> str\n\
   10767 \n\
   10768 Return a capitalized version of S, i.e. make the first character\n\
   10769 have upper case and the rest lower case.");
   10770 
   10771 static PyObject*
   10772 unicode_capitalize(PyObject *self)
   10773 {
   10774     if (PyUnicode_READY(self) == -1)
   10775         return NULL;
   10776     if (PyUnicode_GET_LENGTH(self) == 0)
   10777         return unicode_result_unchanged(self);
   10778     return case_operation(self, do_capitalize);
   10779 }
   10780 
   10781 PyDoc_STRVAR(casefold__doc__,
   10782              "S.casefold() -> str\n\
   10783 \n\
   10784 Return a version of S suitable for caseless comparisons.");
   10785 
   10786 static PyObject *
   10787 unicode_casefold(PyObject *self)
   10788 {
   10789     if (PyUnicode_READY(self) == -1)
   10790         return NULL;
   10791     if (PyUnicode_IS_ASCII(self))
   10792         return ascii_upper_or_lower(self, 1);
   10793     return case_operation(self, do_casefold);
   10794 }
   10795 
   10796 
   10797 /* Argument converter. Accepts a single Unicode character. */
   10798 
   10799 static int
   10800 convert_uc(PyObject *obj, void *addr)
   10801 {
   10802     Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
   10803 
   10804     if (!PyUnicode_Check(obj)) {
   10805         PyErr_Format(PyExc_TypeError,
   10806                      "The fill character must be a unicode character, "
   10807                      "not %.100s", Py_TYPE(obj)->tp_name);
   10808         return 0;
   10809     }
   10810     if (PyUnicode_READY(obj) < 0)
   10811         return 0;
   10812     if (PyUnicode_GET_LENGTH(obj) != 1) {
   10813         PyErr_SetString(PyExc_TypeError,
   10814                         "The fill character must be exactly one character long");
   10815         return 0;
   10816     }
   10817     *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
   10818     return 1;
   10819 }
   10820 
   10821 PyDoc_STRVAR(center__doc__,
   10822              "S.center(width[, fillchar]) -> str\n\
   10823 \n\
   10824 Return S centered in a string of length width. Padding is\n\
   10825 done using the specified fill character (default is a space)");
   10826 
   10827 static PyObject *
   10828 unicode_center(PyObject *self, PyObject *args)
   10829 {
   10830     Py_ssize_t marg, left;
   10831     Py_ssize_t width;
   10832     Py_UCS4 fillchar = ' ';
   10833 
   10834     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
   10835         return NULL;
   10836 
   10837     if (PyUnicode_READY(self) == -1)
   10838         return NULL;
   10839 
   10840     if (PyUnicode_GET_LENGTH(self) >= width)
   10841         return unicode_result_unchanged(self);
   10842 
   10843     marg = width - PyUnicode_GET_LENGTH(self);
   10844     left = marg / 2 + (marg & width & 1);
   10845 
   10846     return pad(self, left, marg - left, fillchar);
   10847 }
   10848 
   10849 /* This function assumes that str1 and str2 are readied by the caller. */
   10850 
   10851 static int
   10852 unicode_compare(PyObject *str1, PyObject *str2)
   10853 {
   10854 #define COMPARE(TYPE1, TYPE2) \
   10855     do { \
   10856         TYPE1* p1 = (TYPE1 *)data1; \
   10857         TYPE2* p2 = (TYPE2 *)data2; \
   10858         TYPE1* end = p1 + len; \
   10859         Py_UCS4 c1, c2; \
   10860         for (; p1 != end; p1++, p2++) { \
   10861             c1 = *p1; \
   10862             c2 = *p2; \
   10863             if (c1 != c2) \
   10864                 return (c1 < c2) ? -1 : 1; \
   10865         } \
   10866     } \
   10867     while (0)
   10868 
   10869     int kind1, kind2;
   10870     void *data1, *data2;
   10871     Py_ssize_t len1, len2, len;
   10872 
   10873     kind1 = PyUnicode_KIND(str1);
   10874     kind2 = PyUnicode_KIND(str2);
   10875     data1 = PyUnicode_DATA(str1);
   10876     data2 = PyUnicode_DATA(str2);
   10877     len1 = PyUnicode_GET_LENGTH(str1);
   10878     len2 = PyUnicode_GET_LENGTH(str2);
   10879     len = Py_MIN(len1, len2);
   10880 
   10881     switch(kind1) {
   10882     case PyUnicode_1BYTE_KIND:
   10883     {
   10884         switch(kind2) {
   10885         case PyUnicode_1BYTE_KIND:
   10886         {
   10887             int cmp = memcmp(data1, data2, len);
   10888             /* normalize result of memcmp() into the range [-1; 1] */
   10889             if (cmp < 0)
   10890                 return -1;
   10891             if (cmp > 0)
   10892                 return 1;
   10893             break;
   10894         }
   10895         case PyUnicode_2BYTE_KIND:
   10896             COMPARE(Py_UCS1, Py_UCS2);
   10897             break;
   10898         case PyUnicode_4BYTE_KIND:
   10899             COMPARE(Py_UCS1, Py_UCS4);
   10900             break;
   10901         default:
   10902             assert(0);
   10903         }
   10904         break;
   10905     }
   10906     case PyUnicode_2BYTE_KIND:
   10907     {
   10908         switch(kind2) {
   10909         case PyUnicode_1BYTE_KIND:
   10910             COMPARE(Py_UCS2, Py_UCS1);
   10911             break;
   10912         case PyUnicode_2BYTE_KIND:
   10913         {
   10914             COMPARE(Py_UCS2, Py_UCS2);
   10915             break;
   10916         }
   10917         case PyUnicode_4BYTE_KIND:
   10918             COMPARE(Py_UCS2, Py_UCS4);
   10919             break;
   10920         default:
   10921             assert(0);
   10922         }
   10923         break;
   10924     }
   10925     case PyUnicode_4BYTE_KIND:
   10926     {
   10927         switch(kind2) {
   10928         case PyUnicode_1BYTE_KIND:
   10929             COMPARE(Py_UCS4, Py_UCS1);
   10930             break;
   10931         case PyUnicode_2BYTE_KIND:
   10932             COMPARE(Py_UCS4, Py_UCS2);
   10933             break;
   10934         case PyUnicode_4BYTE_KIND:
   10935         {
   10936 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
   10937             int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
   10938             /* normalize result of wmemcmp() into the range [-1; 1] */
   10939             if (cmp < 0)
   10940                 return -1;
   10941             if (cmp > 0)
   10942                 return 1;
   10943 #else
   10944             COMPARE(Py_UCS4, Py_UCS4);
   10945 #endif
   10946             break;
   10947         }
   10948         default:
   10949             assert(0);
   10950         }
   10951         break;
   10952     }
   10953     default:
   10954         assert(0);
   10955     }
   10956 
   10957     if (len1 == len2)
   10958         return 0;
   10959     if (len1 < len2)
   10960         return -1;
   10961     else
   10962         return 1;
   10963 
   10964 #undef COMPARE
   10965 }
   10966 
   10967 static int
   10968 unicode_compare_eq(PyObject *str1, PyObject *str2)
   10969 {
   10970     int kind;
   10971     void *data1, *data2;
   10972     Py_ssize_t len;
   10973     int cmp;
   10974 
   10975     len = PyUnicode_GET_LENGTH(str1);
   10976     if (PyUnicode_GET_LENGTH(str2) != len)
   10977         return 0;
   10978     kind = PyUnicode_KIND(str1);
   10979     if (PyUnicode_KIND(str2) != kind)
   10980         return 0;
   10981     data1 = PyUnicode_DATA(str1);
   10982     data2 = PyUnicode_DATA(str2);
   10983 
   10984     cmp = memcmp(data1, data2, len * kind);
   10985     return (cmp == 0);
   10986 }
   10987 
   10988 
   10989 int
   10990 PyUnicode_Compare(PyObject *left, PyObject *right)
   10991 {
   10992     if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
   10993         if (PyUnicode_READY(left) == -1 ||
   10994             PyUnicode_READY(right) == -1)
   10995             return -1;
   10996 
   10997         /* a string is equal to itself */
   10998         if (left == right)
   10999             return 0;
   11000 
   11001         return unicode_compare(left, right);
   11002     }
   11003     PyErr_Format(PyExc_TypeError,
   11004                  "Can't compare %.100s and %.100s",
   11005                  left->ob_type->tp_name,
   11006                  right->ob_type->tp_name);
   11007     return -1;
   11008 }
   11009 
   11010 int
   11011 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
   11012 {
   11013     Py_ssize_t i;
   11014     int kind;
   11015     Py_UCS4 chr;
   11016     const unsigned char *ustr = (const unsigned char *)str;
   11017 
   11018     assert(_PyUnicode_CHECK(uni));
   11019     if (!PyUnicode_IS_READY(uni)) {
   11020         const wchar_t *ws = _PyUnicode_WSTR(uni);
   11021         /* Compare Unicode string and source character set string */
   11022         for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
   11023             if (chr != ustr[i])
   11024                 return (chr < ustr[i]) ? -1 : 1;
   11025         }
   11026         /* This check keeps Python strings that end in '\0' from comparing equal
   11027          to C strings identical up to that point. */
   11028         if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
   11029             return 1; /* uni is longer */
   11030         if (ustr[i])
   11031             return -1; /* str is longer */
   11032         return 0;
   11033     }
   11034     kind = PyUnicode_KIND(uni);
   11035     if (kind == PyUnicode_1BYTE_KIND) {
   11036         const void *data = PyUnicode_1BYTE_DATA(uni);
   11037         size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
   11038         size_t len, len2 = strlen(str);
   11039         int cmp;
   11040 
   11041         len = Py_MIN(len1, len2);
   11042         cmp = memcmp(data, str, len);
   11043         if (cmp != 0) {
   11044             if (cmp < 0)
   11045                 return -1;
   11046             else
   11047                 return 1;
   11048         }
   11049         if (len1 > len2)
   11050             return 1; /* uni is longer */
   11051         if (len1 < len2)
   11052             return -1; /* str is longer */
   11053         return 0;
   11054     }
   11055     else {
   11056         void *data = PyUnicode_DATA(uni);
   11057         /* Compare Unicode string and source character set string */
   11058         for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
   11059             if (chr != (unsigned char)str[i])
   11060                 return (chr < (unsigned char)(str[i])) ? -1 : 1;
   11061         /* This check keeps Python strings that end in '\0' from comparing equal
   11062          to C strings identical up to that point. */
   11063         if (PyUnicode_GET_LENGTH(uni) != i || chr)
   11064             return 1; /* uni is longer */
   11065         if (str[i])
   11066             return -1; /* str is longer */
   11067         return 0;
   11068     }
   11069 }
   11070 
   11071 static int
   11072 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
   11073 {
   11074     size_t i, len;
   11075     const wchar_t *p;
   11076     len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
   11077     if (strlen(str) != len)
   11078         return 0;
   11079     p = _PyUnicode_WSTR(unicode);
   11080     assert(p);
   11081     for (i = 0; i < len; i++) {
   11082         unsigned char c = (unsigned char)str[i];
   11083         if (c >= 128 || p[i] != (wchar_t)c)
   11084             return 0;
   11085     }
   11086     return 1;
   11087 }
   11088 
   11089 int
   11090 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
   11091 {
   11092     size_t len;
   11093     assert(_PyUnicode_CHECK(unicode));
   11094     assert(str);
   11095 #ifndef NDEBUG
   11096     for (const char *p = str; *p; p++) {
   11097         assert((unsigned char)*p < 128);
   11098     }
   11099 #endif
   11100     if (PyUnicode_READY(unicode) == -1) {
   11101         /* Memory error or bad data */
   11102         PyErr_Clear();
   11103         return non_ready_unicode_equal_to_ascii_string(unicode, str);
   11104     }
   11105     if (!PyUnicode_IS_ASCII(unicode))
   11106         return 0;
   11107     len = (size_t)PyUnicode_GET_LENGTH(unicode);
   11108     return strlen(str) == len &&
   11109            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
   11110 }
   11111 
   11112 int
   11113 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
   11114 {
   11115     PyObject *right_uni;
   11116     Py_hash_t hash;
   11117 
   11118     assert(_PyUnicode_CHECK(left));
   11119     assert(right->string);
   11120 #ifndef NDEBUG
   11121     for (const char *p = right->string; *p; p++) {
   11122         assert((unsigned char)*p < 128);
   11123     }
   11124 #endif
   11125 
   11126     if (PyUnicode_READY(left) == -1) {
   11127         /* memory error or bad data */
   11128         PyErr_Clear();
   11129         return non_ready_unicode_equal_to_ascii_string(left, right->string);
   11130     }
   11131 
   11132     if (!PyUnicode_IS_ASCII(left))
   11133         return 0;
   11134 
   11135     right_uni = _PyUnicode_FromId(right);       /* borrowed */
   11136     if (right_uni == NULL) {
   11137         /* memory error or bad data */
   11138         PyErr_Clear();
   11139         return _PyUnicode_EqualToASCIIString(left, right->string);
   11140     }
   11141 
   11142     if (left == right_uni)
   11143         return 1;
   11144 
   11145     if (PyUnicode_CHECK_INTERNED(left))
   11146         return 0;
   11147 
   11148     assert(_PyUnicode_HASH(right_uni) != 1);
   11149     hash = _PyUnicode_HASH(left);
   11150     if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
   11151         return 0;
   11152 
   11153     return unicode_compare_eq(left, right_uni);
   11154 }
   11155 
   11156 #define TEST_COND(cond)                         \
   11157     ((cond) ? Py_True : Py_False)
   11158 
   11159 PyObject *
   11160 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
   11161 {
   11162     int result;
   11163     PyObject *v;
   11164 
   11165     if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
   11166         Py_RETURN_NOTIMPLEMENTED;
   11167 
   11168     if (PyUnicode_READY(left) == -1 ||
   11169         PyUnicode_READY(right) == -1)
   11170         return NULL;
   11171 
   11172     if (left == right) {
   11173         switch (op) {
   11174         case Py_EQ:
   11175         case Py_LE:
   11176         case Py_GE:
   11177             /* a string is equal to itself */
   11178             v = Py_True;
   11179             break;
   11180         case Py_NE:
   11181         case Py_LT:
   11182         case Py_GT:
   11183             v = Py_False;
   11184             break;
   11185         default:
   11186             PyErr_BadArgument();
   11187             return NULL;
   11188         }
   11189     }
   11190     else if (op == Py_EQ || op == Py_NE) {
   11191         result = unicode_compare_eq(left, right);
   11192         result ^= (op == Py_NE);
   11193         v = TEST_COND(result);
   11194     }
   11195     else {
   11196         result = unicode_compare(left, right);
   11197 
   11198         /* Convert the return value to a Boolean */
   11199         switch (op) {
   11200         case Py_LE:
   11201             v = TEST_COND(result <= 0);
   11202             break;
   11203         case Py_GE:
   11204             v = TEST_COND(result >= 0);
   11205             break;
   11206         case Py_LT:
   11207             v = TEST_COND(result == -1);
   11208             break;
   11209         case Py_GT:
   11210             v = TEST_COND(result == 1);
   11211             break;
   11212         default:
   11213             PyErr_BadArgument();
   11214             return NULL;
   11215         }
   11216     }
   11217     Py_INCREF(v);
   11218     return v;
   11219 }
   11220 
   11221 int
   11222 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
   11223 {
   11224     return unicode_eq(aa, bb);
   11225 }
   11226 
   11227 int
   11228 PyUnicode_Contains(PyObject *str, PyObject *substr)
   11229 {
   11230     int kind1, kind2;
   11231     void *buf1, *buf2;
   11232     Py_ssize_t len1, len2;
   11233     int result;
   11234 
   11235     if (!PyUnicode_Check(substr)) {
   11236         PyErr_Format(PyExc_TypeError,
   11237                      "'in <string>' requires string as left operand, not %.100s",
   11238                      Py_TYPE(substr)->tp_name);
   11239         return -1;
   11240     }
   11241     if (PyUnicode_READY(substr) == -1)
   11242         return -1;
   11243     if (ensure_unicode(str) < 0)
   11244         return -1;
   11245 
   11246     kind1 = PyUnicode_KIND(str);
   11247     kind2 = PyUnicode_KIND(substr);
   11248     if (kind1 < kind2)
   11249         return 0;
   11250     len1 = PyUnicode_GET_LENGTH(str);
   11251     len2 = PyUnicode_GET_LENGTH(substr);
   11252     if (len1 < len2)
   11253         return 0;
   11254     buf1 = PyUnicode_DATA(str);
   11255     buf2 = PyUnicode_DATA(substr);
   11256     if (len2 == 1) {
   11257         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
   11258         result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
   11259         return result;
   11260     }
   11261     if (kind2 != kind1) {
   11262         buf2 = _PyUnicode_AsKind(substr, kind1);
   11263         if (!buf2)
   11264             return -1;
   11265     }
   11266 
   11267     switch (kind1) {
   11268     case PyUnicode_1BYTE_KIND:
   11269         result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
   11270         break;
   11271     case PyUnicode_2BYTE_KIND:
   11272         result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
   11273         break;
   11274     case PyUnicode_4BYTE_KIND:
   11275         result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
   11276         break;
   11277     default:
   11278         result = -1;
   11279         assert(0);
   11280     }
   11281 
   11282     if (kind2 != kind1)
   11283         PyMem_Free(buf2);
   11284 
   11285     return result;
   11286 }
   11287 
   11288 /* Concat to string or Unicode object giving a new Unicode object. */
   11289 
   11290 PyObject *
   11291 PyUnicode_Concat(PyObject *left, PyObject *right)
   11292 {
   11293     PyObject *result;
   11294     Py_UCS4 maxchar, maxchar2;
   11295     Py_ssize_t left_len, right_len, new_len;
   11296 
   11297     if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
   11298         return NULL;
   11299 
   11300     /* Shortcuts */
   11301     if (left == unicode_empty)
   11302         return PyUnicode_FromObject(right);
   11303     if (right == unicode_empty)
   11304         return PyUnicode_FromObject(left);
   11305 
   11306     left_len = PyUnicode_GET_LENGTH(left);
   11307     right_len = PyUnicode_GET_LENGTH(right);
   11308     if (left_len > PY_SSIZE_T_MAX - right_len) {
   11309         PyErr_SetString(PyExc_OverflowError,
   11310                         "strings are too large to concat");
   11311         return NULL;
   11312     }
   11313     new_len = left_len + right_len;
   11314 
   11315     maxchar = PyUnicode_MAX_CHAR_VALUE(left);
   11316     maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
   11317     maxchar = Py_MAX(maxchar, maxchar2);
   11318 
   11319     /* Concat the two Unicode strings */
   11320     result = PyUnicode_New(new_len, maxchar);
   11321     if (result == NULL)
   11322         return NULL;
   11323     _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
   11324     _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
   11325     assert(_PyUnicode_CheckConsistency(result, 1));
   11326     return result;
   11327 }
   11328 
   11329 void
   11330 PyUnicode_Append(PyObject **p_left, PyObject *right)
   11331 {
   11332     PyObject *left, *res;
   11333     Py_UCS4 maxchar, maxchar2;
   11334     Py_ssize_t left_len, right_len, new_len;
   11335 
   11336     if (p_left == NULL) {
   11337         if (!PyErr_Occurred())
   11338             PyErr_BadInternalCall();
   11339         return;
   11340     }
   11341     left = *p_left;
   11342     if (right == NULL || left == NULL
   11343         || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
   11344         if (!PyErr_Occurred())
   11345             PyErr_BadInternalCall();
   11346         goto error;
   11347     }
   11348 
   11349     if (PyUnicode_READY(left) == -1)
   11350         goto error;
   11351     if (PyUnicode_READY(right) == -1)
   11352         goto error;
   11353 
   11354     /* Shortcuts */
   11355     if (left == unicode_empty) {
   11356         Py_DECREF(left);
   11357         Py_INCREF(right);
   11358         *p_left = right;
   11359         return;
   11360     }
   11361     if (right == unicode_empty)
   11362         return;
   11363 
   11364     left_len = PyUnicode_GET_LENGTH(left);
   11365     right_len = PyUnicode_GET_LENGTH(right);
   11366     if (left_len > PY_SSIZE_T_MAX - right_len) {
   11367         PyErr_SetString(PyExc_OverflowError,
   11368                         "strings are too large to concat");
   11369         goto error;
   11370     }
   11371     new_len = left_len + right_len;
   11372 
   11373     if (unicode_modifiable(left)
   11374         && PyUnicode_CheckExact(right)
   11375         && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
   11376         /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
   11377            to change the structure size, but characters are stored just after
   11378            the structure, and so it requires to move all characters which is
   11379            not so different than duplicating the string. */
   11380         && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
   11381     {
   11382         /* append inplace */
   11383         if (unicode_resize(p_left, new_len) != 0)
   11384             goto error;
   11385 
   11386         /* copy 'right' into the newly allocated area of 'left' */
   11387         _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
   11388     }
   11389     else {
   11390         maxchar = PyUnicode_MAX_CHAR_VALUE(left);
   11391         maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
   11392         maxchar = Py_MAX(maxchar, maxchar2);
   11393 
   11394         /* Concat the two Unicode strings */
   11395         res = PyUnicode_New(new_len, maxchar);
   11396         if (res == NULL)
   11397             goto error;
   11398         _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
   11399         _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
   11400         Py_DECREF(left);
   11401         *p_left = res;
   11402     }
   11403     assert(_PyUnicode_CheckConsistency(*p_left, 1));
   11404     return;
   11405 
   11406 error:
   11407     Py_CLEAR(*p_left);
   11408 }
   11409 
   11410 void
   11411 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
   11412 {
   11413     PyUnicode_Append(pleft, right);
   11414     Py_XDECREF(right);
   11415 }
   11416 
   11417 /*
   11418 Wraps stringlib_parse_args_finds() and additionally ensures that the
   11419 first argument is a unicode object.
   11420 */
   11421 
   11422 static inline int
   11423 parse_args_finds_unicode(const char * function_name, PyObject *args,
   11424                          PyObject **substring,
   11425                          Py_ssize_t *start, Py_ssize_t *end)
   11426 {
   11427     if(stringlib_parse_args_finds(function_name, args, substring,
   11428                                   start, end)) {
   11429         if (ensure_unicode(*substring) < 0)
   11430             return 0;
   11431         return 1;
   11432     }
   11433     return 0;
   11434 }
   11435 
   11436 PyDoc_STRVAR(count__doc__,
   11437              "S.count(sub[, start[, end]]) -> int\n\
   11438 \n\
   11439 Return the number of non-overlapping occurrences of substring sub in\n\
   11440 string S[start:end].  Optional arguments start and end are\n\
   11441 interpreted as in slice notation.");
   11442 
   11443 static PyObject *
   11444 unicode_count(PyObject *self, PyObject *args)
   11445 {
   11446     PyObject *substring = NULL;   /* initialize to fix a compiler warning */
   11447     Py_ssize_t start = 0;
   11448     Py_ssize_t end = PY_SSIZE_T_MAX;
   11449     PyObject *result;
   11450     int kind1, kind2;
   11451     void *buf1, *buf2;
   11452     Py_ssize_t len1, len2, iresult;
   11453 
   11454     if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
   11455         return NULL;
   11456 
   11457     kind1 = PyUnicode_KIND(self);
   11458     kind2 = PyUnicode_KIND(substring);
   11459     if (kind1 < kind2)
   11460         return PyLong_FromLong(0);
   11461 
   11462     len1 = PyUnicode_GET_LENGTH(self);
   11463     len2 = PyUnicode_GET_LENGTH(substring);
   11464     ADJUST_INDICES(start, end, len1);
   11465     if (end - start < len2)
   11466         return PyLong_FromLong(0);
   11467 
   11468     buf1 = PyUnicode_DATA(self);
   11469     buf2 = PyUnicode_DATA(substring);
   11470     if (kind2 != kind1) {
   11471         buf2 = _PyUnicode_AsKind(substring, kind1);
   11472         if (!buf2)
   11473             return NULL;
   11474     }
   11475     switch (kind1) {
   11476     case PyUnicode_1BYTE_KIND:
   11477         iresult = ucs1lib_count(
   11478             ((Py_UCS1*)buf1) + start, end - start,
   11479             buf2, len2, PY_SSIZE_T_MAX
   11480             );
   11481         break;
   11482     case PyUnicode_2BYTE_KIND:
   11483         iresult = ucs2lib_count(
   11484             ((Py_UCS2*)buf1) + start, end - start,
   11485             buf2, len2, PY_SSIZE_T_MAX
   11486             );
   11487         break;
   11488     case PyUnicode_4BYTE_KIND:
   11489         iresult = ucs4lib_count(
   11490             ((Py_UCS4*)buf1) + start, end - start,
   11491             buf2, len2, PY_SSIZE_T_MAX
   11492             );
   11493         break;
   11494     default:
   11495         assert(0); iresult = 0;
   11496     }
   11497 
   11498     result = PyLong_FromSsize_t(iresult);
   11499 
   11500     if (kind2 != kind1)
   11501         PyMem_Free(buf2);
   11502 
   11503     return result;
   11504 }
   11505 
   11506 PyDoc_STRVAR(encode__doc__,
   11507              "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
   11508 \n\
   11509 Encode S using the codec registered for encoding. Default encoding\n\
   11510 is 'utf-8'. errors may be given to set a different error\n\
   11511 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
   11512 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
   11513 'xmlcharrefreplace' as well as any other name registered with\n\
   11514 codecs.register_error that can handle UnicodeEncodeErrors.");
   11515 
   11516 static PyObject *
   11517 unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
   11518 {
   11519     static char *kwlist[] = {"encoding", "errors", 0};
   11520     char *encoding = NULL;
   11521     char *errors = NULL;
   11522 
   11523     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
   11524                                      kwlist, &encoding, &errors))
   11525         return NULL;
   11526     return PyUnicode_AsEncodedString(self, encoding, errors);
   11527 }
   11528 
   11529 PyDoc_STRVAR(expandtabs__doc__,
   11530              "S.expandtabs(tabsize=8) -> str\n\
   11531 \n\
   11532 Return a copy of S where all tab characters are expanded using spaces.\n\
   11533 If tabsize is not given, a tab size of 8 characters is assumed.");
   11534 
   11535 static PyObject*
   11536 unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
   11537 {
   11538     Py_ssize_t i, j, line_pos, src_len, incr;
   11539     Py_UCS4 ch;
   11540     PyObject *u;
   11541     void *src_data, *dest_data;
   11542     static char *kwlist[] = {"tabsize", 0};
   11543     int tabsize = 8;
   11544     int kind;
   11545     int found;
   11546 
   11547     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
   11548                                      kwlist, &tabsize))
   11549         return NULL;
   11550 
   11551     if (PyUnicode_READY(self) == -1)
   11552         return NULL;
   11553 
   11554     /* First pass: determine size of output string */
   11555     src_len = PyUnicode_GET_LENGTH(self);
   11556     i = j = line_pos = 0;
   11557     kind = PyUnicode_KIND(self);
   11558     src_data = PyUnicode_DATA(self);
   11559     found = 0;
   11560     for (; i < src_len; i++) {
   11561         ch = PyUnicode_READ(kind, src_data, i);
   11562         if (ch == '\t') {
   11563             found = 1;
   11564             if (tabsize > 0) {
   11565                 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
   11566                 if (j > PY_SSIZE_T_MAX - incr)
   11567                     goto overflow;
   11568                 line_pos += incr;
   11569                 j += incr;
   11570             }
   11571         }
   11572         else {
   11573             if (j > PY_SSIZE_T_MAX - 1)
   11574                 goto overflow;
   11575             line_pos++;
   11576             j++;
   11577             if (ch == '\n' || ch == '\r')
   11578                 line_pos = 0;
   11579         }
   11580     }
   11581     if (!found)
   11582         return unicode_result_unchanged(self);
   11583 
   11584     /* Second pass: create output string and fill it */
   11585     u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
   11586     if (!u)
   11587         return NULL;
   11588     dest_data = PyUnicode_DATA(u);
   11589 
   11590     i = j = line_pos = 0;
   11591 
   11592     for (; i < src_len; i++) {
   11593         ch = PyUnicode_READ(kind, src_data, i);
   11594         if (ch == '\t') {
   11595             if (tabsize > 0) {
   11596                 incr = tabsize - (line_pos % tabsize);
   11597                 line_pos += incr;
   11598                 FILL(kind, dest_data, ' ', j, incr);
   11599                 j += incr;
   11600             }
   11601         }
   11602         else {
   11603             line_pos++;
   11604             PyUnicode_WRITE(kind, dest_data, j, ch);
   11605             j++;
   11606             if (ch == '\n' || ch == '\r')
   11607                 line_pos = 0;
   11608         }
   11609     }
   11610     assert (j == PyUnicode_GET_LENGTH(u));
   11611     return unicode_result(u);
   11612 
   11613   overflow:
   11614     PyErr_SetString(PyExc_OverflowError, "new string is too long");
   11615     return NULL;
   11616 }
   11617 
   11618 PyDoc_STRVAR(find__doc__,
   11619              "S.find(sub[, start[, end]]) -> int\n\
   11620 \n\
   11621 Return the lowest index in S where substring sub is found,\n\
   11622 such that sub is contained within S[start:end].  Optional\n\
   11623 arguments start and end are interpreted as in slice notation.\n\
   11624 \n\
   11625 Return -1 on failure.");
   11626 
   11627 static PyObject *
   11628 unicode_find(PyObject *self, PyObject *args)
   11629 {
   11630     /* initialize variables to prevent gcc warning */
   11631     PyObject *substring = NULL;
   11632     Py_ssize_t start = 0;
   11633     Py_ssize_t end = 0;
   11634     Py_ssize_t result;
   11635 
   11636     if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
   11637         return NULL;
   11638 
   11639     if (PyUnicode_READY(self) == -1)
   11640         return NULL;
   11641 
   11642     result = any_find_slice(self, substring, start, end, 1);
   11643 
   11644     if (result == -2)
   11645         return NULL;
   11646 
   11647     return PyLong_FromSsize_t(result);
   11648 }
   11649 
   11650 static PyObject *
   11651 unicode_getitem(PyObject *self, Py_ssize_t index)
   11652 {
   11653     void *data;
   11654     enum PyUnicode_Kind kind;
   11655     Py_UCS4 ch;
   11656 
   11657     if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
   11658         PyErr_BadArgument();
   11659         return NULL;
   11660     }
   11661     if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
   11662         PyErr_SetString(PyExc_IndexError, "string index out of range");
   11663         return NULL;
   11664     }
   11665     kind = PyUnicode_KIND(self);
   11666     data = PyUnicode_DATA(self);
   11667     ch = PyUnicode_READ(kind, data, index);
   11668     return unicode_char(ch);
   11669 }
   11670 
   11671 /* Believe it or not, this produces the same value for ASCII strings
   11672    as bytes_hash(). */
   11673 static Py_hash_t
   11674 unicode_hash(PyObject *self)
   11675 {
   11676     Py_ssize_t len;
   11677     Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
   11678 
   11679 #ifdef Py_DEBUG
   11680     assert(_Py_HashSecret_Initialized);
   11681 #endif
   11682     if (_PyUnicode_HASH(self) != -1)
   11683         return _PyUnicode_HASH(self);
   11684     if (PyUnicode_READY(self) == -1)
   11685         return -1;
   11686     len = PyUnicode_GET_LENGTH(self);
   11687     /*
   11688       We make the hash of the empty string be 0, rather than using
   11689       (prefix ^ suffix), since this slightly obfuscates the hash secret
   11690     */
   11691     if (len == 0) {
   11692         _PyUnicode_HASH(self) = 0;
   11693         return 0;
   11694     }
   11695     x = _Py_HashBytes(PyUnicode_DATA(self),
   11696                       PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
   11697     _PyUnicode_HASH(self) = x;
   11698     return x;
   11699 }
   11700 
   11701 PyDoc_STRVAR(index__doc__,
   11702              "S.index(sub[, start[, end]]) -> int\n\
   11703 \n\
   11704 Like S.find() but raise ValueError when the substring is not found.");
   11705 
   11706 static PyObject *
   11707 unicode_index(PyObject *self, PyObject *args)
   11708 {
   11709     /* initialize variables to prevent gcc warning */
   11710     Py_ssize_t result;
   11711     PyObject *substring = NULL;
   11712     Py_ssize_t start = 0;
   11713     Py_ssize_t end = 0;
   11714 
   11715     if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
   11716         return NULL;
   11717 
   11718     if (PyUnicode_READY(self) == -1)
   11719         return NULL;
   11720 
   11721     result = any_find_slice(self, substring, start, end, 1);
   11722 
   11723     if (result == -2)
   11724         return NULL;
   11725 
   11726     if (result < 0) {
   11727         PyErr_SetString(PyExc_ValueError, "substring not found");
   11728         return NULL;
   11729     }
   11730 
   11731     return PyLong_FromSsize_t(result);
   11732 }
   11733 
   11734 PyDoc_STRVAR(islower__doc__,
   11735              "S.islower() -> bool\n\
   11736 \n\
   11737 Return True if all cased characters in S are lowercase and there is\n\
   11738 at least one cased character in S, False otherwise.");
   11739 
   11740 static PyObject*
   11741 unicode_islower(PyObject *self)
   11742 {
   11743     Py_ssize_t i, length;
   11744     int kind;
   11745     void *data;
   11746     int cased;
   11747 
   11748     if (PyUnicode_READY(self) == -1)
   11749         return NULL;
   11750     length = PyUnicode_GET_LENGTH(self);
   11751     kind = PyUnicode_KIND(self);
   11752     data = PyUnicode_DATA(self);
   11753 
   11754     /* Shortcut for single character strings */
   11755     if (length == 1)
   11756         return PyBool_FromLong(
   11757             Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
   11758 
   11759     /* Special case for empty strings */
   11760     if (length == 0)
   11761         return PyBool_FromLong(0);
   11762 
   11763     cased = 0;
   11764     for (i = 0; i < length; i++) {
   11765         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11766 
   11767         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
   11768             return PyBool_FromLong(0);
   11769         else if (!cased && Py_UNICODE_ISLOWER(ch))
   11770             cased = 1;
   11771     }
   11772     return PyBool_FromLong(cased);
   11773 }
   11774 
   11775 PyDoc_STRVAR(isupper__doc__,
   11776              "S.isupper() -> bool\n\
   11777 \n\
   11778 Return True if all cased characters in S are uppercase and there is\n\
   11779 at least one cased character in S, False otherwise.");
   11780 
   11781 static PyObject*
   11782 unicode_isupper(PyObject *self)
   11783 {
   11784     Py_ssize_t i, length;
   11785     int kind;
   11786     void *data;
   11787     int cased;
   11788 
   11789     if (PyUnicode_READY(self) == -1)
   11790         return NULL;
   11791     length = PyUnicode_GET_LENGTH(self);
   11792     kind = PyUnicode_KIND(self);
   11793     data = PyUnicode_DATA(self);
   11794 
   11795     /* Shortcut for single character strings */
   11796     if (length == 1)
   11797         return PyBool_FromLong(
   11798             Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
   11799 
   11800     /* Special case for empty strings */
   11801     if (length == 0)
   11802         return PyBool_FromLong(0);
   11803 
   11804     cased = 0;
   11805     for (i = 0; i < length; i++) {
   11806         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11807 
   11808         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
   11809             return PyBool_FromLong(0);
   11810         else if (!cased && Py_UNICODE_ISUPPER(ch))
   11811             cased = 1;
   11812     }
   11813     return PyBool_FromLong(cased);
   11814 }
   11815 
   11816 PyDoc_STRVAR(istitle__doc__,
   11817              "S.istitle() -> bool\n\
   11818 \n\
   11819 Return True if S is a titlecased string and there is at least one\n\
   11820 character in S, i.e. upper- and titlecase characters may only\n\
   11821 follow uncased characters and lowercase characters only cased ones.\n\
   11822 Return False otherwise.");
   11823 
   11824 static PyObject*
   11825 unicode_istitle(PyObject *self)
   11826 {
   11827     Py_ssize_t i, length;
   11828     int kind;
   11829     void *data;
   11830     int cased, previous_is_cased;
   11831 
   11832     if (PyUnicode_READY(self) == -1)
   11833         return NULL;
   11834     length = PyUnicode_GET_LENGTH(self);
   11835     kind = PyUnicode_KIND(self);
   11836     data = PyUnicode_DATA(self);
   11837 
   11838     /* Shortcut for single character strings */
   11839     if (length == 1) {
   11840         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
   11841         return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
   11842                                (Py_UNICODE_ISUPPER(ch) != 0));
   11843     }
   11844 
   11845     /* Special case for empty strings */
   11846     if (length == 0)
   11847         return PyBool_FromLong(0);
   11848 
   11849     cased = 0;
   11850     previous_is_cased = 0;
   11851     for (i = 0; i < length; i++) {
   11852         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11853 
   11854         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
   11855             if (previous_is_cased)
   11856                 return PyBool_FromLong(0);
   11857             previous_is_cased = 1;
   11858             cased = 1;
   11859         }
   11860         else if (Py_UNICODE_ISLOWER(ch)) {
   11861             if (!previous_is_cased)
   11862                 return PyBool_FromLong(0);
   11863             previous_is_cased = 1;
   11864             cased = 1;
   11865         }
   11866         else
   11867             previous_is_cased = 0;
   11868     }
   11869     return PyBool_FromLong(cased);
   11870 }
   11871 
   11872 PyDoc_STRVAR(isspace__doc__,
   11873              "S.isspace() -> bool\n\
   11874 \n\
   11875 Return True if all characters in S are whitespace\n\
   11876 and there is at least one character in S, False otherwise.");
   11877 
   11878 static PyObject*
   11879 unicode_isspace(PyObject *self)
   11880 {
   11881     Py_ssize_t i, length;
   11882     int kind;
   11883     void *data;
   11884 
   11885     if (PyUnicode_READY(self) == -1)
   11886         return NULL;
   11887     length = PyUnicode_GET_LENGTH(self);
   11888     kind = PyUnicode_KIND(self);
   11889     data = PyUnicode_DATA(self);
   11890 
   11891     /* Shortcut for single character strings */
   11892     if (length == 1)
   11893         return PyBool_FromLong(
   11894             Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
   11895 
   11896     /* Special case for empty strings */
   11897     if (length == 0)
   11898         return PyBool_FromLong(0);
   11899 
   11900     for (i = 0; i < length; i++) {
   11901         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11902         if (!Py_UNICODE_ISSPACE(ch))
   11903             return PyBool_FromLong(0);
   11904     }
   11905     return PyBool_FromLong(1);
   11906 }
   11907 
   11908 PyDoc_STRVAR(isalpha__doc__,
   11909              "S.isalpha() -> bool\n\
   11910 \n\
   11911 Return True if all characters in S are alphabetic\n\
   11912 and there is at least one character in S, False otherwise.");
   11913 
   11914 static PyObject*
   11915 unicode_isalpha(PyObject *self)
   11916 {
   11917     Py_ssize_t i, length;
   11918     int kind;
   11919     void *data;
   11920 
   11921     if (PyUnicode_READY(self) == -1)
   11922         return NULL;
   11923     length = PyUnicode_GET_LENGTH(self);
   11924     kind = PyUnicode_KIND(self);
   11925     data = PyUnicode_DATA(self);
   11926 
   11927     /* Shortcut for single character strings */
   11928     if (length == 1)
   11929         return PyBool_FromLong(
   11930             Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
   11931 
   11932     /* Special case for empty strings */
   11933     if (length == 0)
   11934         return PyBool_FromLong(0);
   11935 
   11936     for (i = 0; i < length; i++) {
   11937         if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
   11938             return PyBool_FromLong(0);
   11939     }
   11940     return PyBool_FromLong(1);
   11941 }
   11942 
   11943 PyDoc_STRVAR(isalnum__doc__,
   11944              "S.isalnum() -> bool\n\
   11945 \n\
   11946 Return True if all characters in S are alphanumeric\n\
   11947 and there is at least one character in S, False otherwise.");
   11948 
   11949 static PyObject*
   11950 unicode_isalnum(PyObject *self)
   11951 {
   11952     int kind;
   11953     void *data;
   11954     Py_ssize_t len, i;
   11955 
   11956     if (PyUnicode_READY(self) == -1)
   11957         return NULL;
   11958 
   11959     kind = PyUnicode_KIND(self);
   11960     data = PyUnicode_DATA(self);
   11961     len = PyUnicode_GET_LENGTH(self);
   11962 
   11963     /* Shortcut for single character strings */
   11964     if (len == 1) {
   11965         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
   11966         return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
   11967     }
   11968 
   11969     /* Special case for empty strings */
   11970     if (len == 0)
   11971         return PyBool_FromLong(0);
   11972 
   11973     for (i = 0; i < len; i++) {
   11974         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11975         if (!Py_UNICODE_ISALNUM(ch))
   11976             return PyBool_FromLong(0);
   11977     }
   11978     return PyBool_FromLong(1);
   11979 }
   11980 
   11981 PyDoc_STRVAR(isdecimal__doc__,
   11982              "S.isdecimal() -> bool\n\
   11983 \n\
   11984 Return True if there are only decimal characters in S,\n\
   11985 False otherwise.");
   11986 
   11987 static PyObject*
   11988 unicode_isdecimal(PyObject *self)
   11989 {
   11990     Py_ssize_t i, length;
   11991     int kind;
   11992     void *data;
   11993 
   11994     if (PyUnicode_READY(self) == -1)
   11995         return NULL;
   11996     length = PyUnicode_GET_LENGTH(self);
   11997     kind = PyUnicode_KIND(self);
   11998     data = PyUnicode_DATA(self);
   11999 
   12000     /* Shortcut for single character strings */
   12001     if (length == 1)
   12002         return PyBool_FromLong(
   12003             Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
   12004 
   12005     /* Special case for empty strings */
   12006     if (length == 0)
   12007         return PyBool_FromLong(0);
   12008 
   12009     for (i = 0; i < length; i++) {
   12010         if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
   12011             return PyBool_FromLong(0);
   12012     }
   12013     return PyBool_FromLong(1);
   12014 }
   12015 
   12016 PyDoc_STRVAR(isdigit__doc__,
   12017              "S.isdigit() -> bool\n\
   12018 \n\
   12019 Return True if all characters in S are digits\n\
   12020 and there is at least one character in S, False otherwise.");
   12021 
   12022 static PyObject*
   12023 unicode_isdigit(PyObject *self)
   12024 {
   12025     Py_ssize_t i, length;
   12026     int kind;
   12027     void *data;
   12028 
   12029     if (PyUnicode_READY(self) == -1)
   12030         return NULL;
   12031     length = PyUnicode_GET_LENGTH(self);
   12032     kind = PyUnicode_KIND(self);
   12033     data = PyUnicode_DATA(self);
   12034 
   12035     /* Shortcut for single character strings */
   12036     if (length == 1) {
   12037         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
   12038         return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
   12039     }
   12040 
   12041     /* Special case for empty strings */
   12042     if (length == 0)
   12043         return PyBool_FromLong(0);
   12044 
   12045     for (i = 0; i < length; i++) {
   12046         if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
   12047             return PyBool_FromLong(0);
   12048     }
   12049     return PyBool_FromLong(1);
   12050 }
   12051 
   12052 PyDoc_STRVAR(isnumeric__doc__,
   12053              "S.isnumeric() -> bool\n\
   12054 \n\
   12055 Return True if there are only numeric characters in S,\n\
   12056 False otherwise.");
   12057 
   12058 static PyObject*
   12059 unicode_isnumeric(PyObject *self)
   12060 {
   12061     Py_ssize_t i, length;
   12062     int kind;
   12063     void *data;
   12064 
   12065     if (PyUnicode_READY(self) == -1)
   12066         return NULL;
   12067     length = PyUnicode_GET_LENGTH(self);
   12068     kind = PyUnicode_KIND(self);
   12069     data = PyUnicode_DATA(self);
   12070 
   12071     /* Shortcut for single character strings */
   12072     if (length == 1)
   12073         return PyBool_FromLong(
   12074             Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
   12075 
   12076     /* Special case for empty strings */
   12077     if (length == 0)
   12078         return PyBool_FromLong(0);
   12079 
   12080     for (i = 0; i < length; i++) {
   12081         if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
   12082             return PyBool_FromLong(0);
   12083     }
   12084     return PyBool_FromLong(1);
   12085 }
   12086 
   12087 int
   12088 PyUnicode_IsIdentifier(PyObject *self)
   12089 {
   12090     int kind;
   12091     void *data;
   12092     Py_ssize_t i;
   12093     Py_UCS4 first;
   12094 
   12095     if (PyUnicode_READY(self) == -1) {
   12096         Py_FatalError("identifier not ready");
   12097         return 0;
   12098     }
   12099 
   12100     /* Special case for empty strings */
   12101     if (PyUnicode_GET_LENGTH(self) == 0)
   12102         return 0;
   12103     kind = PyUnicode_KIND(self);
   12104     data = PyUnicode_DATA(self);
   12105 
   12106     /* PEP 3131 says that the first character must be in
   12107        XID_Start and subsequent characters in XID_Continue,
   12108        and for the ASCII range, the 2.x rules apply (i.e
   12109        start with letters and underscore, continue with
   12110        letters, digits, underscore). However, given the current
   12111        definition of XID_Start and XID_Continue, it is sufficient
   12112        to check just for these, except that _ must be allowed
   12113        as starting an identifier.  */
   12114     first = PyUnicode_READ(kind, data, 0);
   12115     if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
   12116         return 0;
   12117 
   12118     for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
   12119         if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
   12120             return 0;
   12121     return 1;
   12122 }
   12123 
   12124 PyDoc_STRVAR(isidentifier__doc__,
   12125              "S.isidentifier() -> bool\n\
   12126 \n\
   12127 Return True if S is a valid identifier according\n\
   12128 to the language definition.\n\
   12129 \n\
   12130 Use keyword.iskeyword() to test for reserved identifiers\n\
   12131 such as \"def\" and \"class\".\n");
   12132 
   12133 static PyObject*
   12134 unicode_isidentifier(PyObject *self)
   12135 {
   12136     return PyBool_FromLong(PyUnicode_IsIdentifier(self));
   12137 }
   12138 
   12139 PyDoc_STRVAR(isprintable__doc__,
   12140              "S.isprintable() -> bool\n\
   12141 \n\
   12142 Return True if all characters in S are considered\n\
   12143 printable in repr() or S is empty, False otherwise.");
   12144 
   12145 static PyObject*
   12146 unicode_isprintable(PyObject *self)
   12147 {
   12148     Py_ssize_t i, length;
   12149     int kind;
   12150     void *data;
   12151 
   12152     if (PyUnicode_READY(self) == -1)
   12153         return NULL;
   12154     length = PyUnicode_GET_LENGTH(self);
   12155     kind = PyUnicode_KIND(self);
   12156     data = PyUnicode_DATA(self);
   12157 
   12158     /* Shortcut for single character strings */
   12159     if (length == 1)
   12160         return PyBool_FromLong(
   12161             Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
   12162 
   12163     for (i = 0; i < length; i++) {
   12164         if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
   12165             Py_RETURN_FALSE;
   12166         }
   12167     }
   12168     Py_RETURN_TRUE;
   12169 }
   12170 
   12171 PyDoc_STRVAR(join__doc__,
   12172              "S.join(iterable) -> str\n\
   12173 \n\
   12174 Return a string which is the concatenation of the strings in the\n\
   12175 iterable.  The separator between elements is S.");
   12176 
   12177 static PyObject*
   12178 unicode_join(PyObject *self, PyObject *data)
   12179 {
   12180     return PyUnicode_Join(self, data);
   12181 }
   12182 
   12183 static Py_ssize_t
   12184 unicode_length(PyObject *self)
   12185 {
   12186     if (PyUnicode_READY(self) == -1)
   12187         return -1;
   12188     return PyUnicode_GET_LENGTH(self);
   12189 }
   12190 
   12191 PyDoc_STRVAR(ljust__doc__,
   12192              "S.ljust(width[, fillchar]) -> str\n\
   12193 \n\
   12194 Return S left-justified in a Unicode string of length width. Padding is\n\
   12195 done using the specified fill character (default is a space).");
   12196 
   12197 static PyObject *
   12198 unicode_ljust(PyObject *self, PyObject *args)
   12199 {
   12200     Py_ssize_t width;
   12201     Py_UCS4 fillchar = ' ';
   12202 
   12203     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
   12204         return NULL;
   12205 
   12206     if (PyUnicode_READY(self) == -1)
   12207         return NULL;
   12208 
   12209     if (PyUnicode_GET_LENGTH(self) >= width)
   12210         return unicode_result_unchanged(self);
   12211 
   12212     return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
   12213 }
   12214 
   12215 PyDoc_STRVAR(lower__doc__,
   12216              "S.lower() -> str\n\
   12217 \n\
   12218 Return a copy of the string S converted to lowercase.");
   12219 
   12220 static PyObject*
   12221 unicode_lower(PyObject *self)
   12222 {
   12223     if (PyUnicode_READY(self) == -1)
   12224         return NULL;
   12225     if (PyUnicode_IS_ASCII(self))
   12226         return ascii_upper_or_lower(self, 1);
   12227     return case_operation(self, do_lower);
   12228 }
   12229 
   12230 #define LEFTSTRIP 0
   12231 #define RIGHTSTRIP 1
   12232 #define BOTHSTRIP 2
   12233 
   12234 /* Arrays indexed by above */
   12235 static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
   12236 
   12237 #define STRIPNAME(i) (stripformat[i]+3)
   12238 
   12239 /* externally visible for str.strip(unicode) */
   12240 PyObject *
   12241 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
   12242 {
   12243     void *data;
   12244     int kind;
   12245     Py_ssize_t i, j, len;
   12246     BLOOM_MASK sepmask;
   12247     Py_ssize_t seplen;
   12248 
   12249     if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
   12250         return NULL;
   12251 
   12252     kind = PyUnicode_KIND(self);
   12253     data = PyUnicode_DATA(self);
   12254     len = PyUnicode_GET_LENGTH(self);
   12255     seplen = PyUnicode_GET_LENGTH(sepobj);
   12256     sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
   12257                               PyUnicode_DATA(sepobj),
   12258                               seplen);
   12259 
   12260     i = 0;
   12261     if (striptype != RIGHTSTRIP) {
   12262         while (i < len) {
   12263             Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   12264             if (!BLOOM(sepmask, ch))
   12265                 break;
   12266             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
   12267                 break;
   12268             i++;
   12269         }
   12270     }
   12271 
   12272     j = len;
   12273     if (striptype != LEFTSTRIP) {
   12274         j--;
   12275         while (j >= i) {
   12276             Py_UCS4 ch = PyUnicode_READ(kind, data, j);
   12277             if (!BLOOM(sepmask, ch))
   12278                 break;
   12279             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
   12280                 break;
   12281             j--;
   12282         }
   12283 
   12284         j++;
   12285     }
   12286 
   12287     return PyUnicode_Substring(self, i, j);
   12288 }
   12289 
   12290 PyObject*
   12291 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
   12292 {
   12293     unsigned char *data;
   12294     int kind;
   12295     Py_ssize_t length;
   12296 
   12297     if (PyUnicode_READY(self) == -1)
   12298         return NULL;
   12299 
   12300     length = PyUnicode_GET_LENGTH(self);
   12301     end = Py_MIN(end, length);
   12302 
   12303     if (start == 0 && end == length)
   12304         return unicode_result_unchanged(self);
   12305 
   12306     if (start < 0 || end < 0) {
   12307         PyErr_SetString(PyExc_IndexError, "string index out of range");
   12308         return NULL;
   12309     }
   12310     if (start >= length || end < start)
   12311         _Py_RETURN_UNICODE_EMPTY();
   12312 
   12313     length = end - start;
   12314     if (PyUnicode_IS_ASCII(self)) {
   12315         data = PyUnicode_1BYTE_DATA(self);
   12316         return _PyUnicode_FromASCII((char*)(data + start), length);
   12317     }
   12318     else {
   12319         kind = PyUnicode_KIND(self);
   12320         data = PyUnicode_1BYTE_DATA(self);
   12321         return PyUnicode_FromKindAndData(kind,
   12322                                          data + kind * start,
   12323                                          length);
   12324     }
   12325 }
   12326 
   12327 static PyObject *
   12328 do_strip(PyObject *self, int striptype)
   12329 {
   12330     Py_ssize_t len, i, j;
   12331 
   12332     if (PyUnicode_READY(self) == -1)
   12333         return NULL;
   12334 
   12335     len = PyUnicode_GET_LENGTH(self);
   12336 
   12337     if (PyUnicode_IS_ASCII(self)) {
   12338         Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
   12339 
   12340         i = 0;
   12341         if (striptype != RIGHTSTRIP) {
   12342             while (i < len) {
   12343                 Py_UCS1 ch = data[i];
   12344                 if (!_Py_ascii_whitespace[ch])
   12345                     break;
   12346                 i++;
   12347             }
   12348         }
   12349 
   12350         j = len;
   12351         if (striptype != LEFTSTRIP) {
   12352             j--;
   12353             while (j >= i) {
   12354                 Py_UCS1 ch = data[j];
   12355                 if (!_Py_ascii_whitespace[ch])
   12356                     break;
   12357                 j--;
   12358             }
   12359             j++;
   12360         }
   12361     }
   12362     else {
   12363         int kind = PyUnicode_KIND(self);
   12364         void *data = PyUnicode_DATA(self);
   12365 
   12366         i = 0;
   12367         if (striptype != RIGHTSTRIP) {
   12368             while (i < len) {
   12369                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   12370                 if (!Py_UNICODE_ISSPACE(ch))
   12371                     break;
   12372                 i++;
   12373             }
   12374         }
   12375 
   12376         j = len;
   12377         if (striptype != LEFTSTRIP) {
   12378             j--;
   12379             while (j >= i) {
   12380                 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
   12381                 if (!Py_UNICODE_ISSPACE(ch))
   12382                     break;
   12383                 j--;
   12384             }
   12385             j++;
   12386         }
   12387     }
   12388 
   12389     return PyUnicode_Substring(self, i, j);
   12390 }
   12391 
   12392 
   12393 static PyObject *
   12394 do_argstrip(PyObject *self, int striptype, PyObject *args)
   12395 {
   12396     PyObject *sep = NULL;
   12397 
   12398     if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
   12399         return NULL;
   12400 
   12401     if (sep != NULL && sep != Py_None) {
   12402         if (PyUnicode_Check(sep))
   12403             return _PyUnicode_XStrip(self, striptype, sep);
   12404         else {
   12405             PyErr_Format(PyExc_TypeError,
   12406                          "%s arg must be None or str",
   12407                          STRIPNAME(striptype));
   12408             return NULL;
   12409         }
   12410     }
   12411 
   12412     return do_strip(self, striptype);
   12413 }
   12414 
   12415 
   12416 PyDoc_STRVAR(strip__doc__,
   12417              "S.strip([chars]) -> str\n\
   12418 \n\
   12419 Return a copy of the string S with leading and trailing\n\
   12420 whitespace removed.\n\
   12421 If chars is given and not None, remove characters in chars instead.");
   12422 
   12423 static PyObject *
   12424 unicode_strip(PyObject *self, PyObject *args)
   12425 {
   12426     if (PyTuple_GET_SIZE(args) == 0)
   12427         return do_strip(self, BOTHSTRIP); /* Common case */
   12428     else
   12429         return do_argstrip(self, BOTHSTRIP, args);
   12430 }
   12431 
   12432 
   12433 PyDoc_STRVAR(lstrip__doc__,
   12434              "S.lstrip([chars]) -> str\n\
   12435 \n\
   12436 Return a copy of the string S with leading whitespace removed.\n\
   12437 If chars is given and not None, remove characters in chars instead.");
   12438 
   12439 static PyObject *
   12440 unicode_lstrip(PyObject *self, PyObject *args)
   12441 {
   12442     if (PyTuple_GET_SIZE(args) == 0)
   12443         return do_strip(self, LEFTSTRIP); /* Common case */
   12444     else
   12445         return do_argstrip(self, LEFTSTRIP, args);
   12446 }
   12447 
   12448 
   12449 PyDoc_STRVAR(rstrip__doc__,
   12450              "S.rstrip([chars]) -> str\n\
   12451 \n\
   12452 Return a copy of the string S with trailing whitespace removed.\n\
   12453 If chars is given and not None, remove characters in chars instead.");
   12454 
   12455 static PyObject *
   12456 unicode_rstrip(PyObject *self, PyObject *args)
   12457 {
   12458     if (PyTuple_GET_SIZE(args) == 0)
   12459         return do_strip(self, RIGHTSTRIP); /* Common case */
   12460     else
   12461         return do_argstrip(self, RIGHTSTRIP, args);
   12462 }
   12463 
   12464 
   12465 static PyObject*
   12466 unicode_repeat(PyObject *str, Py_ssize_t len)
   12467 {
   12468     PyObject *u;
   12469     Py_ssize_t nchars, n;
   12470 
   12471     if (len < 1)
   12472         _Py_RETURN_UNICODE_EMPTY();
   12473 
   12474     /* no repeat, return original string */
   12475     if (len == 1)
   12476         return unicode_result_unchanged(str);
   12477 
   12478     if (PyUnicode_READY(str) == -1)
   12479         return NULL;
   12480 
   12481     if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
   12482         PyErr_SetString(PyExc_OverflowError,
   12483                         "repeated string is too long");
   12484         return NULL;
   12485     }
   12486     nchars = len * PyUnicode_GET_LENGTH(str);
   12487 
   12488     u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
   12489     if (!u)
   12490         return NULL;
   12491     assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
   12492 
   12493     if (PyUnicode_GET_LENGTH(str) == 1) {
   12494         const int kind = PyUnicode_KIND(str);
   12495         const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
   12496         if (kind == PyUnicode_1BYTE_KIND) {
   12497             void *to = PyUnicode_DATA(u);
   12498             memset(to, (unsigned char)fill_char, len);
   12499         }
   12500         else if (kind == PyUnicode_2BYTE_KIND) {
   12501             Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
   12502             for (n = 0; n < len; ++n)
   12503                 ucs2[n] = fill_char;
   12504         } else {
   12505             Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
   12506             assert(kind == PyUnicode_4BYTE_KIND);
   12507             for (n = 0; n < len; ++n)
   12508                 ucs4[n] = fill_char;
   12509         }
   12510     }
   12511     else {
   12512         /* number of characters copied this far */
   12513         Py_ssize_t done = PyUnicode_GET_LENGTH(str);
   12514         const Py_ssize_t char_size = PyUnicode_KIND(str);
   12515         char *to = (char *) PyUnicode_DATA(u);
   12516         memcpy(to, PyUnicode_DATA(str),
   12517                   PyUnicode_GET_LENGTH(str) * char_size);
   12518         while (done < nchars) {
   12519             n = (done <= nchars-done) ? done : nchars-done;
   12520             memcpy(to + (done * char_size), to, n * char_size);
   12521             done += n;
   12522         }
   12523     }
   12524 
   12525     assert(_PyUnicode_CheckConsistency(u, 1));
   12526     return u;
   12527 }
   12528 
   12529 PyObject *
   12530 PyUnicode_Replace(PyObject *str,
   12531                   PyObject *substr,
   12532                   PyObject *replstr,
   12533                   Py_ssize_t maxcount)
   12534 {
   12535     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
   12536             ensure_unicode(replstr) < 0)
   12537         return NULL;
   12538     return replace(str, substr, replstr, maxcount);
   12539 }
   12540 
   12541 PyDoc_STRVAR(replace__doc__,
   12542              "S.replace(old, new[, count]) -> str\n\
   12543 \n\
   12544 Return a copy of S with all occurrences of substring\n\
   12545 old replaced by new.  If the optional argument count is\n\
   12546 given, only the first count occurrences are replaced.");
   12547 
   12548 static PyObject*
   12549 unicode_replace(PyObject *self, PyObject *args)
   12550 {
   12551     PyObject *str1;
   12552     PyObject *str2;
   12553     Py_ssize_t maxcount = -1;
   12554 
   12555     if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
   12556         return NULL;
   12557     if (PyUnicode_READY(self) == -1)
   12558         return NULL;
   12559     return replace(self, str1, str2, maxcount);
   12560 }
   12561 
   12562 static PyObject *
   12563 unicode_repr(PyObject *unicode)
   12564 {
   12565     PyObject *repr;
   12566     Py_ssize_t isize;
   12567     Py_ssize_t osize, squote, dquote, i, o;
   12568     Py_UCS4 max, quote;
   12569     int ikind, okind, unchanged;
   12570     void *idata, *odata;
   12571 
   12572     if (PyUnicode_READY(unicode) == -1)
   12573         return NULL;
   12574 
   12575     isize = PyUnicode_GET_LENGTH(unicode);
   12576     idata = PyUnicode_DATA(unicode);
   12577 
   12578     /* Compute length of output, quote characters, and
   12579        maximum character */
   12580     osize = 0;
   12581     max = 127;
   12582     squote = dquote = 0;
   12583     ikind = PyUnicode_KIND(unicode);
   12584     for (i = 0; i < isize; i++) {
   12585         Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
   12586         Py_ssize_t incr = 1;
   12587         switch (ch) {
   12588         case '\'': squote++; break;
   12589         case '"':  dquote++; break;
   12590         case '\\': case '\t': case '\r': case '\n':
   12591             incr = 2;
   12592             break;
   12593         default:
   12594             /* Fast-path ASCII */
   12595             if (ch < ' ' || ch == 0x7f)
   12596                 incr = 4; /* \xHH */
   12597             else if (ch < 0x7f)
   12598                 ;
   12599             else if (Py_UNICODE_ISPRINTABLE(ch))
   12600                 max = ch > max ? ch : max;
   12601             else if (ch < 0x100)
   12602                 incr = 4; /* \xHH */
   12603             else if (ch < 0x10000)
   12604                 incr = 6; /* \uHHHH */
   12605             else
   12606                 incr = 10; /* \uHHHHHHHH */
   12607         }
   12608         if (osize > PY_SSIZE_T_MAX - incr) {
   12609             PyErr_SetString(PyExc_OverflowError,
   12610                             "string is too long to generate repr");
   12611             return NULL;
   12612         }
   12613         osize += incr;
   12614     }
   12615 
   12616     quote = '\'';
   12617     unchanged = (osize == isize);
   12618     if (squote) {
   12619         unchanged = 0;
   12620         if (dquote)
   12621             /* Both squote and dquote present. Use squote,
   12622                and escape them */
   12623             osize += squote;
   12624         else
   12625             quote = '"';
   12626     }
   12627     osize += 2;   /* quotes */
   12628 
   12629     repr = PyUnicode_New(osize, max);
   12630     if (repr == NULL)
   12631         return NULL;
   12632     okind = PyUnicode_KIND(repr);
   12633     odata = PyUnicode_DATA(repr);
   12634 
   12635     PyUnicode_WRITE(okind, odata, 0, quote);
   12636     PyUnicode_WRITE(okind, odata, osize-1, quote);
   12637     if (unchanged) {
   12638         _PyUnicode_FastCopyCharacters(repr, 1,
   12639                                       unicode, 0,
   12640                                       isize);
   12641     }
   12642     else {
   12643         for (i = 0, o = 1; i < isize; i++) {
   12644             Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
   12645 
   12646             /* Escape quotes and backslashes */
   12647             if ((ch == quote) || (ch == '\\')) {
   12648                 PyUnicode_WRITE(okind, odata, o++, '\\');
   12649                 PyUnicode_WRITE(okind, odata, o++, ch);
   12650                 continue;
   12651             }
   12652 
   12653             /* Map special whitespace to '\t', \n', '\r' */
   12654             if (ch == '\t') {
   12655                 PyUnicode_WRITE(okind, odata, o++, '\\');
   12656                 PyUnicode_WRITE(okind, odata, o++, 't');
   12657             }
   12658             else if (ch == '\n') {
   12659                 PyUnicode_WRITE(okind, odata, o++, '\\');
   12660                 PyUnicode_WRITE(okind, odata, o++, 'n');
   12661             }
   12662             else if (ch == '\r') {
   12663                 PyUnicode_WRITE(okind, odata, o++, '\\');
   12664                 PyUnicode_WRITE(okind, odata, o++, 'r');
   12665             }
   12666 
   12667             /* Map non-printable US ASCII to '\xhh' */
   12668             else if (ch < ' ' || ch == 0x7F) {
   12669                 PyUnicode_WRITE(okind, odata, o++, '\\');
   12670                 PyUnicode_WRITE(okind, odata, o++, 'x');
   12671                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
   12672                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
   12673             }
   12674 
   12675             /* Copy ASCII characters as-is */
   12676             else if (ch < 0x7F) {
   12677                 PyUnicode_WRITE(okind, odata, o++, ch);
   12678             }
   12679 
   12680             /* Non-ASCII characters */
   12681             else {
   12682                 /* Map Unicode whitespace and control characters
   12683                    (categories Z* and C* except ASCII space)
   12684                 */
   12685                 if (!Py_UNICODE_ISPRINTABLE(ch)) {
   12686                     PyUnicode_WRITE(okind, odata, o++, '\\');
   12687                     /* Map 8-bit characters to '\xhh' */
   12688                     if (ch <= 0xff) {
   12689                         PyUnicode_WRITE(okind, odata, o++, 'x');
   12690                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
   12691                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
   12692                     }
   12693                     /* Map 16-bit characters to '\uxxxx' */
   12694                     else if (ch <= 0xffff) {
   12695                         PyUnicode_WRITE(okind, odata, o++, 'u');
   12696                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
   12697                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
   12698                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
   12699                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
   12700                     }
   12701                     /* Map 21-bit characters to '\U00xxxxxx' */
   12702                     else {
   12703                         PyUnicode_WRITE(okind, odata, o++, 'U');
   12704                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
   12705                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
   12706                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
   12707                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
   12708                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
   12709                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
   12710                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
   12711                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
   12712                     }
   12713                 }
   12714                 /* Copy characters as-is */
   12715                 else {
   12716                     PyUnicode_WRITE(okind, odata, o++, ch);
   12717                 }
   12718             }
   12719         }
   12720     }
   12721     /* Closing quote already added at the beginning */
   12722     assert(_PyUnicode_CheckConsistency(repr, 1));
   12723     return repr;
   12724 }
   12725 
   12726 PyDoc_STRVAR(rfind__doc__,
   12727              "S.rfind(sub[, start[, end]]) -> int\n\
   12728 \n\
   12729 Return the highest index in S where substring sub is found,\n\
   12730 such that sub is contained within S[start:end].  Optional\n\
   12731 arguments start and end are interpreted as in slice notation.\n\
   12732 \n\
   12733 Return -1 on failure.");
   12734 
   12735 static PyObject *
   12736 unicode_rfind(PyObject *self, PyObject *args)
   12737 {
   12738     /* initialize variables to prevent gcc warning */
   12739     PyObject *substring = NULL;
   12740     Py_ssize_t start = 0;
   12741     Py_ssize_t end = 0;
   12742     Py_ssize_t result;
   12743 
   12744     if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
   12745         return NULL;
   12746 
   12747     if (PyUnicode_READY(self) == -1)
   12748         return NULL;
   12749 
   12750     result = any_find_slice(self, substring, start, end, -1);
   12751 
   12752     if (result == -2)
   12753         return NULL;
   12754 
   12755     return PyLong_FromSsize_t(result);
   12756 }
   12757 
   12758 PyDoc_STRVAR(rindex__doc__,
   12759              "S.rindex(sub[, start[, end]]) -> int\n\
   12760 \n\
   12761 Like S.rfind() but raise ValueError when the substring is not found.");
   12762 
   12763 static PyObject *
   12764 unicode_rindex(PyObject *self, PyObject *args)
   12765 {
   12766     /* initialize variables to prevent gcc warning */
   12767     PyObject *substring = NULL;
   12768     Py_ssize_t start = 0;
   12769     Py_ssize_t end = 0;
   12770     Py_ssize_t result;
   12771 
   12772     if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
   12773         return NULL;
   12774 
   12775     if (PyUnicode_READY(self) == -1)
   12776         return NULL;
   12777 
   12778     result = any_find_slice(self, substring, start, end, -1);
   12779 
   12780     if (result == -2)
   12781         return NULL;
   12782 
   12783     if (result < 0) {
   12784         PyErr_SetString(PyExc_ValueError, "substring not found");
   12785         return NULL;
   12786     }
   12787 
   12788     return PyLong_FromSsize_t(result);
   12789 }
   12790 
   12791 PyDoc_STRVAR(rjust__doc__,
   12792              "S.rjust(width[, fillchar]) -> str\n\
   12793 \n\
   12794 Return S right-justified in a string of length width. Padding is\n\
   12795 done using the specified fill character (default is a space).");
   12796 
   12797 static PyObject *
   12798 unicode_rjust(PyObject *self, PyObject *args)
   12799 {
   12800     Py_ssize_t width;
   12801     Py_UCS4 fillchar = ' ';
   12802 
   12803     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
   12804         return NULL;
   12805 
   12806     if (PyUnicode_READY(self) == -1)
   12807         return NULL;
   12808 
   12809     if (PyUnicode_GET_LENGTH(self) >= width)
   12810         return unicode_result_unchanged(self);
   12811 
   12812     return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
   12813 }
   12814 
   12815 PyObject *
   12816 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
   12817 {
   12818     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
   12819         return NULL;
   12820 
   12821     return split(s, sep, maxsplit);
   12822 }
   12823 
   12824 PyDoc_STRVAR(split__doc__,
   12825              "S.split(sep=None, maxsplit=-1) -> list of strings\n\
   12826 \n\
   12827 Return a list of the words in S, using sep as the\n\
   12828 delimiter string.  If maxsplit is given, at most maxsplit\n\
   12829 splits are done. If sep is not specified or is None, any\n\
   12830 whitespace string is a separator and empty strings are\n\
   12831 removed from the result.");
   12832 
   12833 static PyObject*
   12834 unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
   12835 {
   12836     static char *kwlist[] = {"sep", "maxsplit", 0};
   12837     PyObject *substring = Py_None;
   12838     Py_ssize_t maxcount = -1;
   12839 
   12840     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
   12841                                      kwlist, &substring, &maxcount))
   12842         return NULL;
   12843 
   12844     if (substring == Py_None)
   12845         return split(self, NULL, maxcount);
   12846 
   12847     if (PyUnicode_Check(substring))
   12848         return split(self, substring, maxcount);
   12849 
   12850     PyErr_Format(PyExc_TypeError,
   12851                  "must be str or None, not %.100s",
   12852                  Py_TYPE(substring)->tp_name);
   12853     return NULL;
   12854 }
   12855 
   12856 PyObject *
   12857 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
   12858 {
   12859     PyObject* out;
   12860     int kind1, kind2;
   12861     void *buf1, *buf2;
   12862     Py_ssize_t len1, len2;
   12863 
   12864     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
   12865         return NULL;
   12866 
   12867     kind1 = PyUnicode_KIND(str_obj);
   12868     kind2 = PyUnicode_KIND(sep_obj);
   12869     len1 = PyUnicode_GET_LENGTH(str_obj);
   12870     len2 = PyUnicode_GET_LENGTH(sep_obj);
   12871     if (kind1 < kind2 || len1 < len2) {
   12872         _Py_INCREF_UNICODE_EMPTY();
   12873         if (!unicode_empty)
   12874             out = NULL;
   12875         else {
   12876             out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
   12877             Py_DECREF(unicode_empty);
   12878         }
   12879         return out;
   12880     }
   12881     buf1 = PyUnicode_DATA(str_obj);
   12882     buf2 = PyUnicode_DATA(sep_obj);
   12883     if (kind2 != kind1) {
   12884         buf2 = _PyUnicode_AsKind(sep_obj, kind1);
   12885         if (!buf2)
   12886             return NULL;
   12887     }
   12888 
   12889     switch (kind1) {
   12890     case PyUnicode_1BYTE_KIND:
   12891         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
   12892             out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12893         else
   12894             out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12895         break;
   12896     case PyUnicode_2BYTE_KIND:
   12897         out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12898         break;
   12899     case PyUnicode_4BYTE_KIND:
   12900         out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12901         break;
   12902     default:
   12903         assert(0);
   12904         out = 0;
   12905     }
   12906 
   12907     if (kind2 != kind1)
   12908         PyMem_Free(buf2);
   12909 
   12910     return out;
   12911 }
   12912 
   12913 
   12914 PyObject *
   12915 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
   12916 {
   12917     PyObject* out;
   12918     int kind1, kind2;
   12919     void *buf1, *buf2;
   12920     Py_ssize_t len1, len2;
   12921 
   12922     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
   12923         return NULL;
   12924 
   12925     kind1 = PyUnicode_KIND(str_obj);
   12926     kind2 = PyUnicode_KIND(sep_obj);
   12927     len1 = PyUnicode_GET_LENGTH(str_obj);
   12928     len2 = PyUnicode_GET_LENGTH(sep_obj);
   12929     if (kind1 < kind2 || len1 < len2) {
   12930         _Py_INCREF_UNICODE_EMPTY();
   12931         if (!unicode_empty)
   12932             out = NULL;
   12933         else {
   12934             out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
   12935             Py_DECREF(unicode_empty);
   12936         }
   12937         return out;
   12938     }
   12939     buf1 = PyUnicode_DATA(str_obj);
   12940     buf2 = PyUnicode_DATA(sep_obj);
   12941     if (kind2 != kind1) {
   12942         buf2 = _PyUnicode_AsKind(sep_obj, kind1);
   12943         if (!buf2)
   12944             return NULL;
   12945     }
   12946 
   12947     switch (kind1) {
   12948     case PyUnicode_1BYTE_KIND:
   12949         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
   12950             out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12951         else
   12952             out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12953         break;
   12954     case PyUnicode_2BYTE_KIND:
   12955         out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12956         break;
   12957     case PyUnicode_4BYTE_KIND:
   12958         out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12959         break;
   12960     default:
   12961         assert(0);
   12962         out = 0;
   12963     }
   12964 
   12965     if (kind2 != kind1)
   12966         PyMem_Free(buf2);
   12967 
   12968     return out;
   12969 }
   12970 
   12971 PyDoc_STRVAR(partition__doc__,
   12972              "S.partition(sep) -> (head, sep, tail)\n\
   12973 \n\
   12974 Search for the separator sep in S, and return the part before it,\n\
   12975 the separator itself, and the part after it.  If the separator is not\n\
   12976 found, return S and two empty strings.");
   12977 
   12978 static PyObject*
   12979 unicode_partition(PyObject *self, PyObject *separator)
   12980 {
   12981     return PyUnicode_Partition(self, separator);
   12982 }
   12983 
   12984 PyDoc_STRVAR(rpartition__doc__,
   12985              "S.rpartition(sep) -> (head, sep, tail)\n\
   12986 \n\
   12987 Search for the separator sep in S, starting at the end of S, and return\n\
   12988 the part before it, the separator itself, and the part after it.  If the\n\
   12989 separator is not found, return two empty strings and S.");
   12990 
   12991 static PyObject*
   12992 unicode_rpartition(PyObject *self, PyObject *separator)
   12993 {
   12994     return PyUnicode_RPartition(self, separator);
   12995 }
   12996 
   12997 PyObject *
   12998 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
   12999 {
   13000     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
   13001         return NULL;
   13002 
   13003     return rsplit(s, sep, maxsplit);
   13004 }
   13005 
   13006 PyDoc_STRVAR(rsplit__doc__,
   13007              "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
   13008 \n\
   13009 Return a list of the words in S, using sep as the\n\
   13010 delimiter string, starting at the end of the string and\n\
   13011 working to the front.  If maxsplit is given, at most maxsplit\n\
   13012 splits are done. If sep is not specified, any whitespace string\n\
   13013 is a separator.");
   13014 
   13015 static PyObject*
   13016 unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
   13017 {
   13018     static char *kwlist[] = {"sep", "maxsplit", 0};
   13019     PyObject *substring = Py_None;
   13020     Py_ssize_t maxcount = -1;
   13021 
   13022     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
   13023                                      kwlist, &substring, &maxcount))
   13024         return NULL;
   13025 
   13026     if (substring == Py_None)
   13027         return rsplit(self, NULL, maxcount);
   13028 
   13029     if (PyUnicode_Check(substring))
   13030         return rsplit(self, substring, maxcount);
   13031 
   13032     PyErr_Format(PyExc_TypeError,
   13033                  "must be str or None, not %.100s",
   13034                  Py_TYPE(substring)->tp_name);
   13035     return NULL;
   13036 }
   13037 
   13038 PyDoc_STRVAR(splitlines__doc__,
   13039              "S.splitlines([keepends]) -> list of strings\n\
   13040 \n\
   13041 Return a list of the lines in S, breaking at line boundaries.\n\
   13042 Line breaks are not included in the resulting list unless keepends\n\
   13043 is given and true.");
   13044 
   13045 static PyObject*
   13046 unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
   13047 {
   13048     static char *kwlist[] = {"keepends", 0};
   13049     int keepends = 0;
   13050 
   13051     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
   13052                                      kwlist, &keepends))
   13053         return NULL;
   13054 
   13055     return PyUnicode_Splitlines(self, keepends);
   13056 }
   13057 
   13058 static
   13059 PyObject *unicode_str(PyObject *self)
   13060 {
   13061     return unicode_result_unchanged(self);
   13062 }
   13063 
   13064 PyDoc_STRVAR(swapcase__doc__,
   13065              "S.swapcase() -> str\n\
   13066 \n\
   13067 Return a copy of S with uppercase characters converted to lowercase\n\
   13068 and vice versa.");
   13069 
   13070 static PyObject*
   13071 unicode_swapcase(PyObject *self)
   13072 {
   13073     if (PyUnicode_READY(self) == -1)
   13074         return NULL;
   13075     return case_operation(self, do_swapcase);
   13076 }
   13077 
   13078 /*[clinic input]
   13079 
   13080 @staticmethod
   13081 str.maketrans as unicode_maketrans
   13082 
   13083   x: object
   13084 
   13085   y: unicode=NULL
   13086 
   13087   z: unicode=NULL
   13088 
   13089   /
   13090 
   13091 Return a translation table usable for str.translate().
   13092 
   13093 If there is only one argument, it must be a dictionary mapping Unicode
   13094 ordinals (integers) or characters to Unicode ordinals, strings or None.
   13095 Character keys will be then converted to ordinals.
   13096 If there are two arguments, they must be strings of equal length, and
   13097 in the resulting dictionary, each character in x will be mapped to the
   13098 character at the same position in y. If there is a third argument, it
   13099 must be a string, whose characters will be mapped to None in the result.
   13100 [clinic start generated code]*/
   13101 
   13102 static PyObject *
   13103 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
   13104 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
   13105 {
   13106     PyObject *new = NULL, *key, *value;
   13107     Py_ssize_t i = 0;
   13108     int res;
   13109 
   13110     new = PyDict_New();
   13111     if (!new)
   13112         return NULL;
   13113     if (y != NULL) {
   13114         int x_kind, y_kind, z_kind;
   13115         void *x_data, *y_data, *z_data;
   13116 
   13117         /* x must be a string too, of equal length */
   13118         if (!PyUnicode_Check(x)) {
   13119             PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
   13120                             "be a string if there is a second argument");
   13121             goto err;
   13122         }
   13123         if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
   13124             PyErr_SetString(PyExc_ValueError, "the first two maketrans "
   13125                             "arguments must have equal length");
   13126             goto err;
   13127         }
   13128         /* create entries for translating chars in x to those in y */
   13129         x_kind = PyUnicode_KIND(x);
   13130         y_kind = PyUnicode_KIND(y);
   13131         x_data = PyUnicode_DATA(x);
   13132         y_data = PyUnicode_DATA(y);
   13133         for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
   13134             key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
   13135             if (!key)
   13136                 goto err;
   13137             value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
   13138             if (!value) {
   13139                 Py_DECREF(key);
   13140                 goto err;
   13141             }
   13142             res = PyDict_SetItem(new, key, value);
   13143             Py_DECREF(key);
   13144             Py_DECREF(value);
   13145             if (res < 0)
   13146                 goto err;
   13147         }
   13148         /* create entries for deleting chars in z */
   13149         if (z != NULL) {
   13150             z_kind = PyUnicode_KIND(z);
   13151             z_data = PyUnicode_DATA(z);
   13152             for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
   13153                 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
   13154                 if (!key)
   13155                     goto err;
   13156                 res = PyDict_SetItem(new, key, Py_None);
   13157                 Py_DECREF(key);
   13158                 if (res < 0)
   13159                     goto err;
   13160             }
   13161         }
   13162     } else {
   13163         int kind;
   13164         void *data;
   13165 
   13166         /* x must be a dict */
   13167         if (!PyDict_CheckExact(x)) {
   13168             PyErr_SetString(PyExc_TypeError, "if you give only one argument "
   13169                             "to maketrans it must be a dict");
   13170             goto err;
   13171         }
   13172         /* copy entries into the new dict, converting string keys to int keys */
   13173         while (PyDict_Next(x, &i, &key, &value)) {
   13174             if (PyUnicode_Check(key)) {
   13175                 /* convert string keys to integer keys */
   13176                 PyObject *newkey;
   13177                 if (PyUnicode_GET_LENGTH(key) != 1) {
   13178                     PyErr_SetString(PyExc_ValueError, "string keys in translate "
   13179                                     "table must be of length 1");
   13180                     goto err;
   13181                 }
   13182                 kind = PyUnicode_KIND(key);
   13183                 data = PyUnicode_DATA(key);
   13184                 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
   13185                 if (!newkey)
   13186                     goto err;
   13187                 res = PyDict_SetItem(new, newkey, value);
   13188                 Py_DECREF(newkey);
   13189                 if (res < 0)
   13190                     goto err;
   13191             } else if (PyLong_Check(key)) {
   13192                 /* just keep integer keys */
   13193                 if (PyDict_SetItem(new, key, value) < 0)
   13194                     goto err;
   13195             } else {
   13196                 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
   13197                                 "be strings or integers");
   13198                 goto err;
   13199             }
   13200         }
   13201     }
   13202     return new;
   13203   err:
   13204     Py_DECREF(new);
   13205     return NULL;
   13206 }
   13207 
   13208 PyDoc_STRVAR(translate__doc__,
   13209              "S.translate(table) -> str\n\
   13210 \n\
   13211 Return a copy of the string S in which each character has been mapped\n\
   13212 through the given translation table. The table must implement\n\
   13213 lookup/indexing via __getitem__, for instance a dictionary or list,\n\
   13214 mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
   13215 this operation raises LookupError, the character is left untouched.\n\
   13216 Characters mapped to None are deleted.");
   13217 
   13218 static PyObject*
   13219 unicode_translate(PyObject *self, PyObject *table)
   13220 {
   13221     return _PyUnicode_TranslateCharmap(self, table, "ignore");
   13222 }
   13223 
   13224 PyDoc_STRVAR(upper__doc__,
   13225              "S.upper() -> str\n\
   13226 \n\
   13227 Return a copy of S converted to uppercase.");
   13228 
   13229 static PyObject*
   13230 unicode_upper(PyObject *self)
   13231 {
   13232     if (PyUnicode_READY(self) == -1)
   13233         return NULL;
   13234     if (PyUnicode_IS_ASCII(self))
   13235         return ascii_upper_or_lower(self, 0);
   13236     return case_operation(self, do_upper);
   13237 }
   13238 
   13239 PyDoc_STRVAR(zfill__doc__,
   13240              "S.zfill(width) -> str\n\
   13241 \n\
   13242 Pad a numeric string S with zeros on the left, to fill a field\n\
   13243 of the specified width. The string S is never truncated.");
   13244 
   13245 static PyObject *
   13246 unicode_zfill(PyObject *self, PyObject *args)
   13247 {
   13248     Py_ssize_t fill;
   13249     PyObject *u;
   13250     Py_ssize_t width;
   13251     int kind;
   13252     void *data;
   13253     Py_UCS4 chr;
   13254 
   13255     if (!PyArg_ParseTuple(args, "n:zfill", &width))
   13256         return NULL;
   13257 
   13258     if (PyUnicode_READY(self) == -1)
   13259         return NULL;
   13260 
   13261     if (PyUnicode_GET_LENGTH(self) >= width)
   13262         return unicode_result_unchanged(self);
   13263 
   13264     fill = width - PyUnicode_GET_LENGTH(self);
   13265 
   13266     u = pad(self, fill, 0, '0');
   13267 
   13268     if (u == NULL)
   13269         return NULL;
   13270 
   13271     kind = PyUnicode_KIND(u);
   13272     data = PyUnicode_DATA(u);
   13273     chr = PyUnicode_READ(kind, data, fill);
   13274 
   13275     if (chr == '+' || chr == '-') {
   13276         /* move sign to beginning of string */
   13277         PyUnicode_WRITE(kind, data, 0, chr);
   13278         PyUnicode_WRITE(kind, data, fill, '0');
   13279     }
   13280 
   13281     assert(_PyUnicode_CheckConsistency(u, 1));
   13282     return u;
   13283 }
   13284 
   13285 #if 0
   13286 static PyObject *
   13287 unicode__decimal2ascii(PyObject *self)
   13288 {
   13289     return PyUnicode_TransformDecimalAndSpaceToASCII(self);
   13290 }
   13291 #endif
   13292 
   13293 PyDoc_STRVAR(startswith__doc__,
   13294              "S.startswith(prefix[, start[, end]]) -> bool\n\
   13295 \n\
   13296 Return True if S starts with the specified prefix, False otherwise.\n\
   13297 With optional start, test S beginning at that position.\n\
   13298 With optional end, stop comparing S at that position.\n\
   13299 prefix can also be a tuple of strings to try.");
   13300 
   13301 static PyObject *
   13302 unicode_startswith(PyObject *self,
   13303                    PyObject *args)
   13304 {
   13305     PyObject *subobj;
   13306     PyObject *substring;
   13307     Py_ssize_t start = 0;
   13308     Py_ssize_t end = PY_SSIZE_T_MAX;
   13309     int result;
   13310 
   13311     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
   13312         return NULL;
   13313     if (PyTuple_Check(subobj)) {
   13314         Py_ssize_t i;
   13315         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
   13316             substring = PyTuple_GET_ITEM(subobj, i);
   13317             if (!PyUnicode_Check(substring)) {
   13318                 PyErr_Format(PyExc_TypeError,
   13319                              "tuple for startswith must only contain str, "
   13320                              "not %.100s",
   13321                              Py_TYPE(substring)->tp_name);
   13322                 return NULL;
   13323             }
   13324             result = tailmatch(self, substring, start, end, -1);
   13325             if (result == -1)
   13326                 return NULL;
   13327             if (result) {
   13328                 Py_RETURN_TRUE;
   13329             }
   13330         }
   13331         /* nothing matched */
   13332         Py_RETURN_FALSE;
   13333     }
   13334     if (!PyUnicode_Check(subobj)) {
   13335         PyErr_Format(PyExc_TypeError,
   13336                      "startswith first arg must be str or "
   13337                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
   13338         return NULL;
   13339     }
   13340     result = tailmatch(self, subobj, start, end, -1);
   13341     if (result == -1)
   13342         return NULL;
   13343     return PyBool_FromLong(result);
   13344 }
   13345 
   13346 
   13347 PyDoc_STRVAR(endswith__doc__,
   13348              "S.endswith(suffix[, start[, end]]) -> bool\n\
   13349 \n\
   13350 Return True if S ends with the specified suffix, False otherwise.\n\
   13351 With optional start, test S beginning at that position.\n\
   13352 With optional end, stop comparing S at that position.\n\
   13353 suffix can also be a tuple of strings to try.");
   13354 
   13355 static PyObject *
   13356 unicode_endswith(PyObject *self,
   13357                  PyObject *args)
   13358 {
   13359     PyObject *subobj;
   13360     PyObject *substring;
   13361     Py_ssize_t start = 0;
   13362     Py_ssize_t end = PY_SSIZE_T_MAX;
   13363     int result;
   13364 
   13365     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
   13366         return NULL;
   13367     if (PyTuple_Check(subobj)) {
   13368         Py_ssize_t i;
   13369         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
   13370             substring = PyTuple_GET_ITEM(subobj, i);
   13371             if (!PyUnicode_Check(substring)) {
   13372                 PyErr_Format(PyExc_TypeError,
   13373                              "tuple for endswith must only contain str, "
   13374                              "not %.100s",
   13375                              Py_TYPE(substring)->tp_name);
   13376                 return NULL;
   13377             }
   13378             result = tailmatch(self, substring, start, end, +1);
   13379             if (result == -1)
   13380                 return NULL;
   13381             if (result) {
   13382                 Py_RETURN_TRUE;
   13383             }
   13384         }
   13385         Py_RETURN_FALSE;
   13386     }
   13387     if (!PyUnicode_Check(subobj)) {
   13388         PyErr_Format(PyExc_TypeError,
   13389                      "endswith first arg must be str or "
   13390                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
   13391         return NULL;
   13392     }
   13393     result = tailmatch(self, subobj, start, end, +1);
   13394     if (result == -1)
   13395         return NULL;
   13396     return PyBool_FromLong(result);
   13397 }
   13398 
   13399 static inline void
   13400 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
   13401 {
   13402     writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
   13403     writer->data = PyUnicode_DATA(writer->buffer);
   13404 
   13405     if (!writer->readonly) {
   13406         writer->kind = PyUnicode_KIND(writer->buffer);
   13407         writer->size = PyUnicode_GET_LENGTH(writer->buffer);
   13408     }
   13409     else {
   13410         /* use a value smaller than PyUnicode_1BYTE_KIND() so
   13411            _PyUnicodeWriter_PrepareKind() will copy the buffer. */
   13412         writer->kind = PyUnicode_WCHAR_KIND;
   13413         assert(writer->kind <= PyUnicode_1BYTE_KIND);
   13414 
   13415         /* Copy-on-write mode: set buffer size to 0 so
   13416          * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
   13417          * next write. */
   13418         writer->size = 0;
   13419     }
   13420 }
   13421 
   13422 void
   13423 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
   13424 {
   13425     memset(writer, 0, sizeof(*writer));
   13426 
   13427     /* ASCII is the bare minimum */
   13428     writer->min_char = 127;
   13429 
   13430     /* use a value smaller than PyUnicode_1BYTE_KIND() so
   13431        _PyUnicodeWriter_PrepareKind() will copy the buffer. */
   13432     writer->kind = PyUnicode_WCHAR_KIND;
   13433     assert(writer->kind <= PyUnicode_1BYTE_KIND);
   13434 }
   13435 
   13436 int
   13437 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
   13438                                  Py_ssize_t length, Py_UCS4 maxchar)
   13439 {
   13440     Py_ssize_t newlen;
   13441     PyObject *newbuffer;
   13442 
   13443     assert(maxchar <= MAX_UNICODE);
   13444 
   13445     /* ensure that the _PyUnicodeWriter_Prepare macro was used */
   13446     assert((maxchar > writer->maxchar && length >= 0)
   13447            || length > 0);
   13448 
   13449     if (length > PY_SSIZE_T_MAX - writer->pos) {
   13450         PyErr_NoMemory();
   13451         return -1;
   13452     }
   13453     newlen = writer->pos + length;
   13454 
   13455     maxchar = Py_MAX(maxchar, writer->min_char);
   13456 
   13457     if (writer->buffer == NULL) {
   13458         assert(!writer->readonly);
   13459         if (writer->overallocate
   13460             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
   13461             /* overallocate to limit the number of realloc() */
   13462             newlen += newlen / OVERALLOCATE_FACTOR;
   13463         }
   13464         if (newlen < writer->min_length)
   13465             newlen = writer->min_length;
   13466 
   13467         writer->buffer = PyUnicode_New(newlen, maxchar);
   13468         if (writer->buffer == NULL)
   13469             return -1;
   13470     }
   13471     else if (newlen > writer->size) {
   13472         if (writer->overallocate
   13473             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
   13474             /* overallocate to limit the number of realloc() */
   13475             newlen += newlen / OVERALLOCATE_FACTOR;
   13476         }
   13477         if (newlen < writer->min_length)
   13478             newlen = writer->min_length;
   13479 
   13480         if (maxchar > writer->maxchar || writer->readonly) {
   13481             /* resize + widen */
   13482             maxchar = Py_MAX(maxchar, writer->maxchar);
   13483             newbuffer = PyUnicode_New(newlen, maxchar);
   13484             if (newbuffer == NULL)
   13485                 return -1;
   13486             _PyUnicode_FastCopyCharacters(newbuffer, 0,
   13487                                           writer->buffer, 0, writer->pos);
   13488             Py_DECREF(writer->buffer);
   13489             writer->readonly = 0;
   13490         }
   13491         else {
   13492             newbuffer = resize_compact(writer->buffer, newlen);
   13493             if (newbuffer == NULL)
   13494                 return -1;
   13495         }
   13496         writer->buffer = newbuffer;
   13497     }
   13498     else if (maxchar > writer->maxchar) {
   13499         assert(!writer->readonly);
   13500         newbuffer = PyUnicode_New(writer->size, maxchar);
   13501         if (newbuffer == NULL)
   13502             return -1;
   13503         _PyUnicode_FastCopyCharacters(newbuffer, 0,
   13504                                       writer->buffer, 0, writer->pos);
   13505         Py_SETREF(writer->buffer, newbuffer);
   13506     }
   13507     _PyUnicodeWriter_Update(writer);
   13508     return 0;
   13509 
   13510 #undef OVERALLOCATE_FACTOR
   13511 }
   13512 
   13513 int
   13514 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
   13515                                      enum PyUnicode_Kind kind)
   13516 {
   13517     Py_UCS4 maxchar;
   13518 
   13519     /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
   13520     assert(writer->kind < kind);
   13521 
   13522     switch (kind)
   13523     {
   13524     case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
   13525     case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
   13526     case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
   13527     default:
   13528         assert(0 && "invalid kind");
   13529         return -1;
   13530     }
   13531 
   13532     return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
   13533 }
   13534 
   13535 static inline int
   13536 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
   13537 {
   13538     assert(ch <= MAX_UNICODE);
   13539     if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
   13540         return -1;
   13541     PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
   13542     writer->pos++;
   13543     return 0;
   13544 }
   13545 
   13546 int
   13547 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
   13548 {
   13549     return _PyUnicodeWriter_WriteCharInline(writer, ch);
   13550 }
   13551 
   13552 int
   13553 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
   13554 {
   13555     Py_UCS4 maxchar;
   13556     Py_ssize_t len;
   13557 
   13558     if (PyUnicode_READY(str) == -1)
   13559         return -1;
   13560     len = PyUnicode_GET_LENGTH(str);
   13561     if (len == 0)
   13562         return 0;
   13563     maxchar = PyUnicode_MAX_CHAR_VALUE(str);
   13564     if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
   13565         if (writer->buffer == NULL && !writer->overallocate) {
   13566             assert(_PyUnicode_CheckConsistency(str, 1));
   13567             writer->readonly = 1;
   13568             Py_INCREF(str);
   13569             writer->buffer = str;
   13570             _PyUnicodeWriter_Update(writer);
   13571             writer->pos += len;
   13572             return 0;
   13573         }
   13574         if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
   13575             return -1;
   13576     }
   13577     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
   13578                                   str, 0, len);
   13579     writer->pos += len;
   13580     return 0;
   13581 }
   13582 
   13583 int
   13584 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
   13585                                 Py_ssize_t start, Py_ssize_t end)
   13586 {
   13587     Py_UCS4 maxchar;
   13588     Py_ssize_t len;
   13589 
   13590     if (PyUnicode_READY(str) == -1)
   13591         return -1;
   13592 
   13593     assert(0 <= start);
   13594     assert(end <= PyUnicode_GET_LENGTH(str));
   13595     assert(start <= end);
   13596 
   13597     if (end == 0)
   13598         return 0;
   13599 
   13600     if (start == 0 && end == PyUnicode_GET_LENGTH(str))
   13601         return _PyUnicodeWriter_WriteStr(writer, str);
   13602 
   13603     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
   13604         maxchar = _PyUnicode_FindMaxChar(str, start, end);
   13605     else
   13606         maxchar = writer->maxchar;
   13607     len = end - start;
   13608 
   13609     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
   13610         return -1;
   13611 
   13612     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
   13613                                   str, start, len);
   13614     writer->pos += len;
   13615     return 0;
   13616 }
   13617 
   13618 int
   13619 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
   13620                                   const char *ascii, Py_ssize_t len)
   13621 {
   13622     if (len == -1)
   13623         len = strlen(ascii);
   13624 
   13625     assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
   13626 
   13627     if (writer->buffer == NULL && !writer->overallocate) {
   13628         PyObject *str;
   13629 
   13630         str = _PyUnicode_FromASCII(ascii, len);
   13631         if (str == NULL)
   13632             return -1;
   13633 
   13634         writer->readonly = 1;
   13635         writer->buffer = str;
   13636         _PyUnicodeWriter_Update(writer);
   13637         writer->pos += len;
   13638         return 0;
   13639     }
   13640 
   13641     if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
   13642         return -1;
   13643 
   13644     switch (writer->kind)
   13645     {
   13646     case PyUnicode_1BYTE_KIND:
   13647     {
   13648         const Py_UCS1 *str = (const Py_UCS1 *)ascii;
   13649         Py_UCS1 *data = writer->data;
   13650 
   13651         memcpy(data + writer->pos, str, len);
   13652         break;
   13653     }
   13654     case PyUnicode_2BYTE_KIND:
   13655     {
   13656         _PyUnicode_CONVERT_BYTES(
   13657             Py_UCS1, Py_UCS2,
   13658             ascii, ascii + len,
   13659             (Py_UCS2 *)writer->data + writer->pos);
   13660         break;
   13661     }
   13662     case PyUnicode_4BYTE_KIND:
   13663     {
   13664         _PyUnicode_CONVERT_BYTES(
   13665             Py_UCS1, Py_UCS4,
   13666             ascii, ascii + len,
   13667             (Py_UCS4 *)writer->data + writer->pos);
   13668         break;
   13669     }
   13670     default:
   13671         assert(0);
   13672     }
   13673 
   13674     writer->pos += len;
   13675     return 0;
   13676 }
   13677 
   13678 int
   13679 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
   13680                                    const char *str, Py_ssize_t len)
   13681 {
   13682     Py_UCS4 maxchar;
   13683 
   13684     maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
   13685     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
   13686         return -1;
   13687     unicode_write_cstr(writer->buffer, writer->pos, str, len);
   13688     writer->pos += len;
   13689     return 0;
   13690 }
   13691 
   13692 PyObject *
   13693 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
   13694 {
   13695     PyObject *str;
   13696 
   13697     if (writer->pos == 0) {
   13698         Py_CLEAR(writer->buffer);
   13699         _Py_RETURN_UNICODE_EMPTY();
   13700     }
   13701 
   13702     str = writer->buffer;
   13703     writer->buffer = NULL;
   13704 
   13705     if (writer->readonly) {
   13706         assert(PyUnicode_GET_LENGTH(str) == writer->pos);
   13707         return str;
   13708     }
   13709 
   13710     if (PyUnicode_GET_LENGTH(str) != writer->pos) {
   13711         PyObject *str2;
   13712         str2 = resize_compact(str, writer->pos);
   13713         if (str2 == NULL) {
   13714             Py_DECREF(str);
   13715             return NULL;
   13716         }
   13717         str = str2;
   13718     }
   13719 
   13720     assert(_PyUnicode_CheckConsistency(str, 1));
   13721     return unicode_result_ready(str);
   13722 }
   13723 
   13724 void
   13725 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
   13726 {
   13727     Py_CLEAR(writer->buffer);
   13728 }
   13729 
   13730 #include "stringlib/unicode_format.h"
   13731 
   13732 PyDoc_STRVAR(format__doc__,
   13733              "S.format(*args, **kwargs) -> str\n\
   13734 \n\
   13735 Return a formatted version of S, using substitutions from args and kwargs.\n\
   13736 The substitutions are identified by braces ('{' and '}').");
   13737 
   13738 PyDoc_STRVAR(format_map__doc__,
   13739              "S.format_map(mapping) -> str\n\
   13740 \n\
   13741 Return a formatted version of S, using substitutions from mapping.\n\
   13742 The substitutions are identified by braces ('{' and '}').");
   13743 
   13744 static PyObject *
   13745 unicode__format__(PyObject* self, PyObject* args)
   13746 {
   13747     PyObject *format_spec;
   13748     _PyUnicodeWriter writer;
   13749     int ret;
   13750 
   13751     if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
   13752         return NULL;
   13753 
   13754     if (PyUnicode_READY(self) == -1)
   13755         return NULL;
   13756     _PyUnicodeWriter_Init(&writer);
   13757     ret = _PyUnicode_FormatAdvancedWriter(&writer,
   13758                                           self, format_spec, 0,
   13759                                           PyUnicode_GET_LENGTH(format_spec));
   13760     if (ret == -1) {
   13761         _PyUnicodeWriter_Dealloc(&writer);
   13762         return NULL;
   13763     }
   13764     return _PyUnicodeWriter_Finish(&writer);
   13765 }
   13766 
   13767 PyDoc_STRVAR(p_format__doc__,
   13768              "S.__format__(format_spec) -> str\n\
   13769 \n\
   13770 Return a formatted version of S as described by format_spec.");
   13771 
   13772 static PyObject *
   13773 unicode__sizeof__(PyObject *v)
   13774 {
   13775     Py_ssize_t size;
   13776 
   13777     /* If it's a compact object, account for base structure +
   13778        character data. */
   13779     if (PyUnicode_IS_COMPACT_ASCII(v))
   13780         size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
   13781     else if (PyUnicode_IS_COMPACT(v))
   13782         size = sizeof(PyCompactUnicodeObject) +
   13783             (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
   13784     else {
   13785         /* If it is a two-block object, account for base object, and
   13786            for character block if present. */
   13787         size = sizeof(PyUnicodeObject);
   13788         if (_PyUnicode_DATA_ANY(v))
   13789             size += (PyUnicode_GET_LENGTH(v) + 1) *
   13790                 PyUnicode_KIND(v);
   13791     }
   13792     /* If the wstr pointer is present, account for it unless it is shared
   13793        with the data pointer. Check if the data is not shared. */
   13794     if (_PyUnicode_HAS_WSTR_MEMORY(v))
   13795         size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
   13796     if (_PyUnicode_HAS_UTF8_MEMORY(v))
   13797         size += PyUnicode_UTF8_LENGTH(v) + 1;
   13798 
   13799     return PyLong_FromSsize_t(size);
   13800 }
   13801 
   13802 PyDoc_STRVAR(sizeof__doc__,
   13803              "S.__sizeof__() -> size of S in memory, in bytes");
   13804 
   13805 static PyObject *
   13806 unicode_getnewargs(PyObject *v)
   13807 {
   13808     PyObject *copy = _PyUnicode_Copy(v);
   13809     if (!copy)
   13810         return NULL;
   13811     return Py_BuildValue("(N)", copy);
   13812 }
   13813 
   13814 static PyMethodDef unicode_methods[] = {
             /* Method table: one entry per method, giving the Python-level
                name, the C function implementing it, the METH_* calling
                convention flags and the docstring. */
   13815     {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
   13816     {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
   13817     {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
   13818     {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
   13819     {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
   13820     {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
   13821     {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
   13822     {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
   13823     {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
   13824     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
   13825     {"expandtabs", (PyCFunction) unicode_expandtabs,
   13826      METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
   13827     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
   13828     {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
   13829     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
   13830     {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
   13831     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
   13832     {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
   13833     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
   13834     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
   13835     {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
   13836     {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
   13837     {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
   13838     {"splitlines", (PyCFunction) unicode_splitlines,
   13839      METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
   13840     {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
   13841     {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
   13842     {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
   13843     {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
   13844     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
   13845     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
   13846     {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
   13847     {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
   13848     {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
   13849     {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
   13850     {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
   13851     {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
   13852     {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
   13853     {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
   13854     {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
   13855     {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
   13856     {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
   13857     {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
   13858     {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
   13859     {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
   13860     {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
             /* The maketrans entry is supplied by a macro defined elsewhere. */
   13861     UNICODE_MAKETRANS_METHODDEF
   13862     {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
   13863 #if 0
   13864     /* These methods are just used for debugging the implementation. */
   13865     {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
   13866 #endif
   13867 
             /* Registered without a docstring. */
   13868     {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
             /* Sentinel entry marking the end of the table. */
   13869     {NULL, NULL}
   13870 };
   13871 
   13872 static PyObject *
   13873 unicode_mod(PyObject *v, PyObject *w)
   13874 {
   13875     if (!PyUnicode_Check(v))
   13876         Py_RETURN_NOTIMPLEMENTED;
   13877     return PyUnicode_Format(v, w);
   13878 }
   13879 
   13880 static PyNumberMethods unicode_as_number = {
   13881     0,              /*nb_add*/
   13882     0,              /*nb_subtract*/
   13883     0,              /*nb_multiply*/
   13884     unicode_mod,            /*nb_remainder*/
   13885 };
   13886 
   13887 static PySequenceMethods unicode_as_sequence = {
   13888     (lenfunc) unicode_length,       /* sq_length */
   13889     PyUnicode_Concat,           /* sq_concat */
   13890     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
   13891     (ssizeargfunc) unicode_getitem,     /* sq_item */
   13892     0,                  /* sq_slice */
   13893     0,                  /* sq_ass_item */
   13894     0,                  /* sq_ass_slice */
   13895     PyUnicode_Contains,         /* sq_contains */
   13896 };
   13897 
   13898 static PyObject*
   13899 unicode_subscript(PyObject* self, PyObject* item)
   13900 {
   13901     if (PyUnicode_READY(self) == -1)
   13902         return NULL;
   13903 
   13904     if (PyIndex_Check(item)) {
   13905         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
   13906         if (i == -1 && PyErr_Occurred())
   13907             return NULL;
   13908         if (i < 0)
   13909             i += PyUnicode_GET_LENGTH(self);
   13910         return unicode_getitem(self, i);
   13911     } else if (PySlice_Check(item)) {
   13912         Py_ssize_t start, stop, step, slicelength, cur, i;
   13913         PyObject *result;
   13914         void *src_data, *dest_data;
   13915         int src_kind, dest_kind;
   13916         Py_UCS4 ch, max_char, kind_limit;
   13917 
   13918         if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
   13919                                  &start, &stop, &step, &slicelength) < 0) {
   13920             return NULL;
   13921         }
   13922 
   13923         if (slicelength <= 0) {
   13924             _Py_RETURN_UNICODE_EMPTY();
   13925         } else if (start == 0 && step == 1 &&
   13926                    slicelength == PyUnicode_GET_LENGTH(self)) {
   13927             return unicode_result_unchanged(self);
   13928         } else if (step == 1) {
   13929             return PyUnicode_Substring(self,
   13930                                        start, start + slicelength);
   13931         }
   13932         /* General case */
   13933         src_kind = PyUnicode_KIND(self);
   13934         src_data = PyUnicode_DATA(self);
   13935         if (!PyUnicode_IS_ASCII(self)) {
   13936             kind_limit = kind_maxchar_limit(src_kind);
   13937             max_char = 0;
   13938             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
   13939                 ch = PyUnicode_READ(src_kind, src_data, cur);
   13940                 if (ch > max_char) {
   13941                     max_char = ch;
   13942                     if (max_char >= kind_limit)
   13943                         break;
   13944                 }
   13945             }
   13946         }
   13947         else
   13948             max_char = 127;
   13949         result = PyUnicode_New(slicelength, max_char);
   13950         if (result == NULL)
   13951             return NULL;
   13952         dest_kind = PyUnicode_KIND(result);
   13953         dest_data = PyUnicode_DATA(result);
   13954 
   13955         for (cur = start, i = 0; i < slicelength; cur += step, i++) {
   13956             Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
   13957             PyUnicode_WRITE(dest_kind, dest_data, i, ch);
   13958         }
   13959         assert(_PyUnicode_CheckConsistency(result, 1));
   13960         return result;
   13961     } else {
   13962         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
   13963         return NULL;
   13964     }
   13965 }
   13966 
   13967 static PyMappingMethods unicode_as_mapping = {
   13968     (lenfunc)unicode_length,        /* mp_length */
   13969     (binaryfunc)unicode_subscript,  /* mp_subscript */
   13970     (objobjargproc)0,           /* mp_ass_subscript */
   13971 };
   13972 
   13973 
   13974 /* Helpers for PyUnicode_Format() */
   13975 
   13976 struct unicode_formatter_t {
   13977     PyObject *args;
   13978     int args_owned;
   13979     Py_ssize_t arglen, argidx;
   13980     PyObject *dict;
   13981 
   13982     enum PyUnicode_Kind fmtkind;
   13983     Py_ssize_t fmtcnt, fmtpos;
   13984     void *fmtdata;
   13985     PyObject *fmtstr;
   13986 
   13987     _PyUnicodeWriter writer;
   13988 };
   13989 
   13990 struct unicode_format_arg_t {
   13991     Py_UCS4 ch;
   13992     int flags;
   13993     Py_ssize_t width;
   13994     int prec;
   13995     int sign;
   13996 };
   13997 
   13998 static PyObject *
   13999 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
   14000 {
   14001     Py_ssize_t argidx = ctx->argidx;
   14002 
   14003     if (argidx < ctx->arglen) {
   14004         ctx->argidx++;
   14005         if (ctx->arglen < 0)
   14006             return ctx->args;
   14007         else
   14008             return PyTuple_GetItem(ctx->args, argidx);
   14009     }
   14010     PyErr_SetString(PyExc_TypeError,
   14011                     "not enough arguments for format string");
   14012     return NULL;
   14013 }
   14014 
   14015 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
   14016 
   14017 /* Format a float into the writer if the writer is not NULL, or into *p_output
   14018    otherwise.
   14019 
   14020    Return 0 on success, raise an exception and return -1 on error. */
   14021 static int
   14022 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
   14023             PyObject **p_output,
   14024             _PyUnicodeWriter *writer)
   14025 {
   14026     char *p;
   14027     double x;
   14028     Py_ssize_t len;
   14029     int prec;
   14030     int dtoa_flags;
   14031 
   14032     x = PyFloat_AsDouble(v);
   14033     if (x == -1.0 && PyErr_Occurred())
   14034         return -1;
   14035 
   14036     prec = arg->prec;
   14037     if (prec < 0)
   14038         prec = 6;
   14039 
   14040     if (arg->flags & F_ALT)
   14041         dtoa_flags = Py_DTSF_ALT;
   14042     else
   14043         dtoa_flags = 0;
   14044     p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
   14045     if (p == NULL)
   14046         return -1;
   14047     len = strlen(p);
   14048     if (writer) {
   14049         if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
   14050             PyMem_Free(p);
   14051             return -1;
   14052         }
   14053     }
   14054     else
   14055         *p_output = _PyUnicode_FromASCII(p, len);
   14056     PyMem_Free(p);
   14057     return 0;
   14058 }
   14059 
   14060 /* formatlong() emulates the format codes d, u, o, x and X, and
   14061  * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
   14062  * Python's regular ints.
   14063  * Return value:  a new PyUnicodeObject*, or NULL if error.
   14064  *     The output string is of the form
   14065  *         "-"? ("0x" | "0X")? digit+
   14066  *     "0x"/"0X" are present only for x and X conversions, with F_ALT
   14067  *         set in flags.  The case of hex digits will be correct,
   14068  *     There will be at least prec digits, zero-filled on the left if
   14069  *         necessary to get that many.
   14070  * val          object to be converted
   14071  * flags        bitmask of format flags; only F_ALT is looked at
   14072  * prec         minimum number of digits; 0-fill on left if needed
   14073  * type         a character in [duoxX]; u acts the same as d
   14074  *
   14075  * CAUTION:  o, x and X conversions on regular ints can never
   14076  * produce a '-' sign, but can for Python's unbounded ints.
   14077  */
   14078 PyObject *
   14079 _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
   14080 {
   14081     PyObject *result = NULL;
   14082     char *buf;
   14083     Py_ssize_t i;
   14084     int sign;           /* 1 if '-', else 0 */
   14085     int len;            /* number of characters */
   14086     Py_ssize_t llen;
   14087     int numdigits;      /* len == numnondigits + numdigits */
   14088     int numnondigits = 0;
   14089 
   14090     /* Avoid exceeding SSIZE_T_MAX */
   14091     if (prec > INT_MAX-3) {
   14092         PyErr_SetString(PyExc_OverflowError,
   14093                         "precision too large");
   14094         return NULL;
   14095     }
   14096 
   14097     assert(PyLong_Check(val));
   14098 
   14099     switch (type) {
   14100     default:
   14101         assert(!"'type' not in [diuoxX]");
   14102     case 'd':
   14103     case 'i':
   14104     case 'u':
   14105         /* int and int subclasses should print numerically when a numeric */
   14106         /* format code is used (see issue18780) */
   14107         result = PyNumber_ToBase(val, 10);
   14108         break;
   14109     case 'o':
   14110         numnondigits = 2;
   14111         result = PyNumber_ToBase(val, 8);
   14112         break;
   14113     case 'x':
   14114     case 'X':
   14115         numnondigits = 2;
   14116         result = PyNumber_ToBase(val, 16);
   14117         break;
   14118     }
   14119     if (!result)
   14120         return NULL;
   14121 
   14122     assert(unicode_modifiable(result));
   14123     assert(PyUnicode_IS_READY(result));
   14124     assert(PyUnicode_IS_ASCII(result));
   14125 
   14126     /* To modify the string in-place, there can only be one reference. */
   14127     if (Py_REFCNT(result) != 1) {
   14128         Py_DECREF(result);
   14129         PyErr_BadInternalCall();
   14130         return NULL;
   14131     }
   14132     buf = PyUnicode_DATA(result);
   14133     llen = PyUnicode_GET_LENGTH(result);
   14134     if (llen > INT_MAX) {
   14135         Py_DECREF(result);
   14136         PyErr_SetString(PyExc_ValueError,
   14137                         "string too large in _PyUnicode_FormatLong");
   14138         return NULL;
   14139     }
   14140     len = (int)llen;
   14141     sign = buf[0] == '-';
   14142     numnondigits += sign;
   14143     numdigits = len - numnondigits;
   14144     assert(numdigits > 0);
   14145 
   14146     /* Get rid of base marker unless F_ALT */
   14147     if (((alt) == 0 &&
   14148         (type == 'o' || type == 'x' || type == 'X'))) {
   14149         assert(buf[sign] == '0');
   14150         assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
   14151                buf[sign+1] == 'o');
   14152         numnondigits -= 2;
   14153         buf += 2;
   14154         len -= 2;
   14155         if (sign)
   14156             buf[0] = '-';
   14157         assert(len == numnondigits + numdigits);
   14158         assert(numdigits > 0);
   14159     }
   14160 
   14161     /* Fill with leading zeroes to meet minimum width. */
   14162     if (prec > numdigits) {
   14163         PyObject *r1 = PyBytes_FromStringAndSize(NULL,
   14164                                 numnondigits + prec);
   14165         char *b1;
   14166         if (!r1) {
   14167             Py_DECREF(result);
   14168             return NULL;
   14169         }
   14170         b1 = PyBytes_AS_STRING(r1);
   14171         for (i = 0; i < numnondigits; ++i)
   14172             *b1++ = *buf++;
   14173         for (i = 0; i < prec - numdigits; i++)
   14174             *b1++ = '0';
   14175         for (i = 0; i < numdigits; i++)
   14176             *b1++ = *buf++;
   14177         *b1 = '\0';
   14178         Py_DECREF(result);
   14179         result = r1;
   14180         buf = PyBytes_AS_STRING(result);
   14181         len = numnondigits + prec;
   14182     }
   14183 
   14184     /* Fix up case for hex conversions. */
   14185     if (type == 'X') {
   14186         /* Need to convert all lower case letters to upper case.
   14187            and need to convert 0x to 0X (and -0x to -0X). */
   14188         for (i = 0; i < len; i++)
   14189             if (buf[i] >= 'a' && buf[i] <= 'x')
   14190                 buf[i] -= 'a'-'A';
   14191     }
   14192     if (!PyUnicode_Check(result)
   14193         || buf != PyUnicode_DATA(result)) {
   14194         PyObject *unicode;
   14195         unicode = _PyUnicode_FromASCII(buf, len);
   14196         Py_DECREF(result);
   14197         result = unicode;
   14198     }
   14199     else if (len != PyUnicode_GET_LENGTH(result)) {
   14200         if (PyUnicode_Resize(&result, len) < 0)
   14201             Py_CLEAR(result);
   14202     }
   14203     return result;
   14204 }
   14205 
   14206 /* Format an integer or a float as an integer.
   14207  * Return 1 if the number has been formatted into the writer,
   14208  *        0 if the number has been formatted into *p_output
   14209  *       -1 and raise an exception on error */
   14210 static int
   14211 mainformatlong(PyObject *v,
   14212                struct unicode_format_arg_t *arg,
   14213                PyObject **p_output,
   14214                _PyUnicodeWriter *writer)
   14215 {
   14216     PyObject *iobj, *res;
   14217     char type = (char)arg->ch;
   14218 
   14219     if (!PyNumber_Check(v))
   14220         goto wrongtype;
   14221 
   14222     /* make sure number is a type of integer for o, x, and X */
   14223     if (!PyLong_Check(v)) {
   14224         if (type == 'o' || type == 'x' || type == 'X') {
   14225             iobj = PyNumber_Index(v);
   14226             if (iobj == NULL) {
   14227                 if (PyErr_ExceptionMatches(PyExc_TypeError))
   14228                     goto wrongtype;
   14229                 return -1;
   14230             }
   14231         }
   14232         else {
   14233             iobj = PyNumber_Long(v);
   14234             if (iobj == NULL ) {
   14235                 if (PyErr_ExceptionMatches(PyExc_TypeError))
   14236                     goto wrongtype;
   14237                 return -1;
   14238             }
   14239         }
   14240         assert(PyLong_Check(iobj));
   14241     }
   14242     else {
   14243         iobj = v;
   14244         Py_INCREF(iobj);
   14245     }
   14246 
   14247     if (PyLong_CheckExact(v)
   14248         && arg->width == -1 && arg->prec == -1
   14249         && !(arg->flags & (F_SIGN | F_BLANK))
   14250         && type != 'X')
   14251     {
   14252         /* Fast path */
   14253         int alternate = arg->flags & F_ALT;
   14254         int base;
   14255 
   14256         switch(type)
   14257         {
   14258             default:
   14259                 assert(0 && "'type' not in [diuoxX]");
   14260             case 'd':
   14261             case 'i':
   14262             case 'u':
   14263                 base = 10;
   14264                 break;
   14265             case 'o':
   14266                 base = 8;
   14267                 break;
   14268             case 'x':
   14269             case 'X':
   14270                 base = 16;
   14271                 break;
   14272         }
   14273 
   14274         if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
   14275             Py_DECREF(iobj);
   14276             return -1;
   14277         }
   14278         Py_DECREF(iobj);
   14279         return 1;
   14280     }
   14281 
   14282     res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
   14283     Py_DECREF(iobj);
   14284     if (res == NULL)
   14285         return -1;
   14286     *p_output = res;
   14287     return 0;
   14288 
   14289 wrongtype:
   14290     switch(type)
   14291     {
   14292         case 'o':
   14293         case 'x':
   14294         case 'X':
   14295             PyErr_Format(PyExc_TypeError,
   14296                     "%%%c format: an integer is required, "
   14297                     "not %.200s",
   14298                     type, Py_TYPE(v)->tp_name);
   14299             break;
   14300         default:
   14301             PyErr_Format(PyExc_TypeError,
   14302                     "%%%c format: a number is required, "
   14303                     "not %.200s",
   14304                     type, Py_TYPE(v)->tp_name);
   14305             break;
   14306     }
   14307     return -1;
   14308 }
   14309 
   14310 static Py_UCS4
   14311 formatchar(PyObject *v)
   14312 {
   14313     /* presume that the buffer is at least 3 characters long */
   14314     if (PyUnicode_Check(v)) {
   14315         if (PyUnicode_GET_LENGTH(v) == 1) {
   14316             return PyUnicode_READ_CHAR(v, 0);
   14317         }
   14318         goto onError;
   14319     }
   14320     else {
   14321         PyObject *iobj;
   14322         long x;
   14323         /* make sure number is a type of integer */
   14324         if (!PyLong_Check(v)) {
   14325             iobj = PyNumber_Index(v);
   14326             if (iobj == NULL) {
   14327                 goto onError;
   14328             }
   14329             x = PyLong_AsLong(iobj);
   14330             Py_DECREF(iobj);
   14331         }
   14332         else {
   14333             x = PyLong_AsLong(v);
   14334         }
   14335         if (x == -1 && PyErr_Occurred())
   14336             goto onError;
   14337 
   14338         if (x < 0 || x > MAX_UNICODE) {
   14339             PyErr_SetString(PyExc_OverflowError,
   14340                             "%c arg not in range(0x110000)");
   14341             return (Py_UCS4) -1;
   14342         }
   14343 
   14344         return (Py_UCS4) x;
   14345     }
   14346 
   14347   onError:
   14348     PyErr_SetString(PyExc_TypeError,
   14349                     "%c requires int or char");
   14350     return (Py_UCS4) -1;
   14351 }
   14352 
   14353 /* Parse options of an argument: flags, width, precision.
   14354    Handle also "%(name)" syntax.
   14355 
   14356    Return 0 if the argument has been formatted into arg->str.
   14357    Return 1 if the argument has been written into ctx->writer,
   14358    Raise an exception and return -1 on error. */
   14359 static int
   14360 unicode_format_arg_parse(struct unicode_formatter_t *ctx,
   14361                          struct unicode_format_arg_t *arg)
   14362 {
   14363 #define FORMAT_READ(ctx) \
   14364         PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
   14365 
   14366     PyObject *v;
   14367 
   14368     if (arg->ch == '(') {
   14369         /* Get argument value from a dictionary. Example: "%(name)s". */
   14370         Py_ssize_t keystart;
   14371         Py_ssize_t keylen;
   14372         PyObject *key;
   14373         int pcount = 1;
   14374 
   14375         if (ctx->dict == NULL) {
   14376             PyErr_SetString(PyExc_TypeError,
   14377                             "format requires a mapping");
   14378             return -1;
   14379         }
   14380         ++ctx->fmtpos;
   14381         --ctx->fmtcnt;
   14382         keystart = ctx->fmtpos;
   14383         /* Skip over balanced parentheses */
   14384         while (pcount > 0 && --ctx->fmtcnt >= 0) {
   14385             arg->ch = FORMAT_READ(ctx);
   14386             if (arg->ch == ')')
   14387                 --pcount;
   14388             else if (arg->ch == '(')
   14389                 ++pcount;
   14390             ctx->fmtpos++;
   14391         }
   14392         keylen = ctx->fmtpos - keystart - 1;
   14393         if (ctx->fmtcnt < 0 || pcount > 0) {
   14394             PyErr_SetString(PyExc_ValueError,
   14395                             "incomplete format key");
   14396             return -1;
   14397         }
   14398         key = PyUnicode_Substring(ctx->fmtstr,
   14399                                   keystart, keystart + keylen);
   14400         if (key == NULL)
   14401             return -1;
   14402         if (ctx->args_owned) {
   14403             ctx->args_owned = 0;
   14404             Py_DECREF(ctx->args);
   14405         }
   14406         ctx->args = PyObject_GetItem(ctx->dict, key);
   14407         Py_DECREF(key);
   14408         if (ctx->args == NULL)
   14409             return -1;
   14410         ctx->args_owned = 1;
   14411         ctx->arglen = -1;
   14412         ctx->argidx = -2;
   14413     }
   14414 
   14415     /* Parse flags. Example: "%+i" => flags=F_SIGN. */
   14416     while (--ctx->fmtcnt >= 0) {
   14417         arg->ch = FORMAT_READ(ctx);
   14418         ctx->fmtpos++;
   14419         switch (arg->ch) {
   14420         case '-': arg->flags |= F_LJUST; continue;
   14421         case '+': arg->flags |= F_SIGN; continue;
   14422         case ' ': arg->flags |= F_BLANK; continue;
   14423         case '#': arg->flags |= F_ALT; continue;
   14424         case '0': arg->flags |= F_ZERO; continue;
   14425         }
   14426         break;
   14427     }
   14428 
   14429     /* Parse width. Example: "%10s" => width=10 */
   14430     if (arg->ch == '*') {
   14431         v = unicode_format_getnextarg(ctx);
   14432         if (v == NULL)
   14433             return -1;
   14434         if (!PyLong_Check(v)) {
   14435             PyErr_SetString(PyExc_TypeError,
   14436                             "* wants int");
   14437             return -1;
   14438         }
   14439         arg->width = PyLong_AsSsize_t(v);
   14440         if (arg->width == -1 && PyErr_Occurred())
   14441             return -1;
   14442         if (arg->width < 0) {
   14443             arg->flags |= F_LJUST;
   14444             arg->width = -arg->width;
   14445         }
   14446         if (--ctx->fmtcnt >= 0) {
   14447             arg->ch = FORMAT_READ(ctx);
   14448             ctx->fmtpos++;
   14449         }
   14450     }
   14451     else if (arg->ch >= '0' && arg->ch <= '9') {
   14452         arg->width = arg->ch - '0';
   14453         while (--ctx->fmtcnt >= 0) {
   14454             arg->ch = FORMAT_READ(ctx);
   14455             ctx->fmtpos++;
   14456             if (arg->ch < '0' || arg->ch > '9')
   14457                 break;
   14458             /* Since arg->ch is unsigned, the RHS would end up as unsigned,
   14459                mixing signed and unsigned comparison. Since arg->ch is between
   14460                '0' and '9', casting to int is safe. */
   14461             if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
   14462                 PyErr_SetString(PyExc_ValueError,
   14463                                 "width too big");
   14464                 return -1;
   14465             }
   14466             arg->width = arg->width*10 + (arg->ch - '0');
   14467         }
   14468     }
   14469 
   14470     /* Parse precision. Example: "%.3f" => prec=3 */
   14471     if (arg->ch == '.') {
   14472         arg->prec = 0;
   14473         if (--ctx->fmtcnt >= 0) {
   14474             arg->ch = FORMAT_READ(ctx);
   14475             ctx->fmtpos++;
   14476         }
   14477         if (arg->ch == '*') {
   14478             v = unicode_format_getnextarg(ctx);
   14479             if (v == NULL)
   14480                 return -1;
   14481             if (!PyLong_Check(v)) {
   14482                 PyErr_SetString(PyExc_TypeError,
   14483                                 "* wants int");
   14484                 return -1;
   14485             }
   14486             arg->prec = _PyLong_AsInt(v);
   14487             if (arg->prec == -1 && PyErr_Occurred())
   14488                 return -1;
   14489             if (arg->prec < 0)
   14490                 arg->prec = 0;
   14491             if (--ctx->fmtcnt >= 0) {
   14492                 arg->ch = FORMAT_READ(ctx);
   14493                 ctx->fmtpos++;
   14494             }
   14495         }
   14496         else if (arg->ch >= '0' && arg->ch <= '9') {
   14497             arg->prec = arg->ch - '0';
   14498             while (--ctx->fmtcnt >= 0) {
   14499                 arg->ch = FORMAT_READ(ctx);
   14500                 ctx->fmtpos++;
   14501                 if (arg->ch < '0' || arg->ch > '9')
   14502                     break;
   14503                 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
   14504                     PyErr_SetString(PyExc_ValueError,
   14505                                     "precision too big");
   14506                     return -1;
   14507                 }
   14508                 arg->prec = arg->prec*10 + (arg->ch - '0');
   14509             }
   14510         }
   14511     }
   14512 
   14513     /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
   14514     if (ctx->fmtcnt >= 0) {
   14515         if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
   14516             if (--ctx->fmtcnt >= 0) {
   14517                 arg->ch = FORMAT_READ(ctx);
   14518                 ctx->fmtpos++;
   14519             }
   14520         }
   14521     }
   14522     if (ctx->fmtcnt < 0) {
   14523         PyErr_SetString(PyExc_ValueError,
   14524                         "incomplete format");
   14525         return -1;
   14526     }
   14527     return 0;
   14528 
   14529 #undef FORMAT_READ
   14530 }
   14531 
   14532 /* Format one argument. Supported conversion specifiers:
   14533 
   14534    - "s", "r", "a": any type
   14535    - "i", "d", "u": int or float
   14536    - "o", "x", "X": int
   14537    - "e", "E", "f", "F", "g", "G": float
   14538    - "c": int or str (1 character)
   14539 
   14540    When possible, the output is written directly into the Unicode writer
   14541    (ctx->writer). A string is created when padding is required.
   14542 
   14543    Return 0 if the argument has been formatted into *p_str,
   14544           1 if the argument has been written into ctx->writer,
   14545          -1 on error. */
   14546 static int
   14547 unicode_format_arg_format(struct unicode_formatter_t *ctx,
   14548                           struct unicode_format_arg_t *arg,
   14549                           PyObject **p_str)
   14550 {
   14551     PyObject *v;
   14552     _PyUnicodeWriter *writer = &ctx->writer;
   14553 
   14554     if (ctx->fmtcnt == 0)
   14555         ctx->writer.overallocate = 0;
   14556 
   14557     if (arg->ch == '%') {
   14558         if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
   14559             return -1;
   14560         return 1;
   14561     }
   14562 
   14563     v = unicode_format_getnextarg(ctx);
   14564     if (v == NULL)
   14565         return -1;
   14566 
   14567 
   14568     switch (arg->ch) {
   14569     case 's':
   14570     case 'r':
   14571     case 'a':
   14572         if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
   14573             /* Fast path */
   14574             if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
   14575                 return -1;
   14576             return 1;
   14577         }
   14578 
   14579         if (PyUnicode_CheckExact(v) && arg->ch == 's') {
   14580             *p_str = v;
   14581             Py_INCREF(*p_str);
   14582         }
   14583         else {
   14584             if (arg->ch == 's')
   14585                 *p_str = PyObject_Str(v);
   14586             else if (arg->ch == 'r')
   14587                 *p_str = PyObject_Repr(v);
   14588             else
   14589                 *p_str = PyObject_ASCII(v);
   14590         }
   14591         break;
   14592 
   14593     case 'i':
   14594     case 'd':
   14595     case 'u':
   14596     case 'o':
   14597     case 'x':
   14598     case 'X':
   14599     {
   14600         int ret = mainformatlong(v, arg, p_str, writer);
   14601         if (ret != 0)
   14602             return ret;
   14603         arg->sign = 1;
   14604         break;
   14605     }
   14606 
   14607     case 'e':
   14608     case 'E':
   14609     case 'f':
   14610     case 'F':
   14611     case 'g':
   14612     case 'G':
   14613         if (arg->width == -1 && arg->prec == -1
   14614             && !(arg->flags & (F_SIGN | F_BLANK)))
   14615         {
   14616             /* Fast path */
   14617             if (formatfloat(v, arg, NULL, writer) == -1)
   14618                 return -1;
   14619             return 1;
   14620         }
   14621 
   14622         arg->sign = 1;
   14623         if (formatfloat(v, arg, p_str, NULL) == -1)
   14624             return -1;
   14625         break;
   14626 
   14627     case 'c':
   14628     {
   14629         Py_UCS4 ch = formatchar(v);
   14630         if (ch == (Py_UCS4) -1)
   14631             return -1;
   14632         if (arg->width == -1 && arg->prec == -1) {
   14633             /* Fast path */
   14634             if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
   14635                 return -1;
   14636             return 1;
   14637         }
   14638         *p_str = PyUnicode_FromOrdinal(ch);
   14639         break;
   14640     }
   14641 
   14642     default:
   14643         PyErr_Format(PyExc_ValueError,
   14644                      "unsupported format character '%c' (0x%x) "
   14645                      "at index %zd",
   14646                      (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
   14647                      (int)arg->ch,
   14648                      ctx->fmtpos - 1);
   14649         return -1;
   14650     }
   14651     if (*p_str == NULL)
   14652         return -1;
   14653     assert (PyUnicode_Check(*p_str));
   14654     return 0;
   14655 }
   14656 
   14657 static int
   14658 unicode_format_arg_output(struct unicode_formatter_t *ctx,
   14659                           struct unicode_format_arg_t *arg,
   14660                           PyObject *str)
   14661 {
   14662     Py_ssize_t len;
   14663     enum PyUnicode_Kind kind;
   14664     void *pbuf;
   14665     Py_ssize_t pindex;
   14666     Py_UCS4 signchar;
   14667     Py_ssize_t buflen;
   14668     Py_UCS4 maxchar;
   14669     Py_ssize_t sublen;
   14670     _PyUnicodeWriter *writer = &ctx->writer;
   14671     Py_UCS4 fill;
   14672 
   14673     fill = ' ';
   14674     if (arg->sign && arg->flags & F_ZERO)
   14675         fill = '0';
   14676 
   14677     if (PyUnicode_READY(str) == -1)
   14678         return -1;
   14679 
   14680     len = PyUnicode_GET_LENGTH(str);
   14681     if ((arg->width == -1 || arg->width <= len)
   14682         && (arg->prec == -1 || arg->prec >= len)
   14683         && !(arg->flags & (F_SIGN | F_BLANK)))
   14684     {
   14685         /* Fast path */
   14686         if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
   14687             return -1;
   14688         return 0;
   14689     }
   14690 
   14691     /* Truncate the string for "s", "r" and "a" formats
   14692        if the precision is set */
   14693     if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
   14694         if (arg->prec >= 0 && len > arg->prec)
   14695             len = arg->prec;
   14696     }
   14697 
   14698     /* Adjust sign and width */
   14699     kind = PyUnicode_KIND(str);
   14700     pbuf = PyUnicode_DATA(str);
   14701     pindex = 0;
   14702     signchar = '\0';
   14703     if (arg->sign) {
   14704         Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
   14705         if (ch == '-' || ch == '+') {
   14706             signchar = ch;
   14707             len--;
   14708             pindex++;
   14709         }
   14710         else if (arg->flags & F_SIGN)
   14711             signchar = '+';
   14712         else if (arg->flags & F_BLANK)
   14713             signchar = ' ';
   14714         else
   14715             arg->sign = 0;
   14716     }
   14717     if (arg->width < len)
   14718         arg->width = len;
   14719 
   14720     /* Prepare the writer */
   14721     maxchar = writer->maxchar;
   14722     if (!(arg->flags & F_LJUST)) {
   14723         if (arg->sign) {
   14724             if ((arg->width-1) > len)
   14725                 maxchar = Py_MAX(maxchar, fill);
   14726         }
   14727         else {
   14728             if (arg->width > len)
   14729                 maxchar = Py_MAX(maxchar, fill);
   14730         }
   14731     }
   14732     if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
   14733         Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
   14734         maxchar = Py_MAX(maxchar, strmaxchar);
   14735     }
   14736 
   14737     buflen = arg->width;
   14738     if (arg->sign && len == arg->width)
   14739         buflen++;
   14740     if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
   14741         return -1;
   14742 
   14743     /* Write the sign if needed */
   14744     if (arg->sign) {
   14745         if (fill != ' ') {
   14746             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
   14747             writer->pos += 1;
   14748         }
   14749         if (arg->width > len)
   14750             arg->width--;
   14751     }
   14752 
   14753     /* Write the numeric prefix for "x", "X" and "o" formats
   14754        if the alternate form is used.
   14755        For example, write "0x" for the "%#x" format. */
   14756     if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
   14757         assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
   14758         assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
   14759         if (fill != ' ') {
   14760             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
   14761             PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
   14762             writer->pos += 2;
   14763             pindex += 2;
   14764         }
   14765         arg->width -= 2;
   14766         if (arg->width < 0)
   14767             arg->width = 0;
   14768         len -= 2;
   14769     }
   14770 
   14771     /* Pad left with the fill character if needed */
   14772     if (arg->width > len && !(arg->flags & F_LJUST)) {
   14773         sublen = arg->width - len;
   14774         FILL(writer->kind, writer->data, fill, writer->pos, sublen);
   14775         writer->pos += sublen;
   14776         arg->width = len;
   14777     }
   14778 
   14779     /* If padding with spaces: write sign if needed and/or numeric prefix if
   14780        the alternate form is used */
   14781     if (fill == ' ') {
   14782         if (arg->sign) {
   14783             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
   14784             writer->pos += 1;
   14785         }
   14786         if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
   14787             assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
   14788             assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
   14789             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
   14790             PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
   14791             writer->pos += 2;
   14792             pindex += 2;
   14793         }
   14794     }
   14795 
   14796     /* Write characters */
   14797     if (len) {
   14798         _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
   14799                                       str, pindex, len);
   14800         writer->pos += len;
   14801     }
   14802 
   14803     /* Pad right with the fill character if needed */
   14804     if (arg->width > len) {
   14805         sublen = arg->width - len;
   14806         FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
   14807         writer->pos += sublen;
   14808     }
   14809     return 0;
   14810 }
   14811 
   14812 /* Helper of PyUnicode_Format(): format one arg.
   14813    Return 0 on success, raise an exception and return -1 on error. */
   14814 static int
   14815 unicode_format_arg(struct unicode_formatter_t *ctx)
   14816 {
   14817     struct unicode_format_arg_t arg;
   14818     PyObject *str;
   14819     int ret;
   14820 
   14821     arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
   14822     arg.flags = 0;
   14823     arg.width = -1;
   14824     arg.prec = -1;
   14825     arg.sign = 0;
   14826     str = NULL;
   14827 
   14828     ret = unicode_format_arg_parse(ctx, &arg);
   14829     if (ret == -1)
   14830         return -1;
   14831 
   14832     ret = unicode_format_arg_format(ctx, &arg, &str);
   14833     if (ret == -1)
   14834         return -1;
   14835 
   14836     if (ret != 1) {
   14837         ret = unicode_format_arg_output(ctx, &arg, str);
   14838         Py_DECREF(str);
   14839         if (ret == -1)
   14840             return -1;
   14841     }
   14842 
   14843     if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
   14844         PyErr_SetString(PyExc_TypeError,
   14845                         "not all arguments converted during string formatting");
   14846         return -1;
   14847     }
   14848     return 0;
   14849 }
   14850 
   14851 PyObject *
   14852 PyUnicode_Format(PyObject *format, PyObject *args)
   14853 {
   14854     struct unicode_formatter_t ctx;
   14855 
   14856     if (format == NULL || args == NULL) {
   14857         PyErr_BadInternalCall();
   14858         return NULL;
   14859     }
   14860 
   14861     if (ensure_unicode(format) < 0)
   14862         return NULL;
   14863 
   14864     ctx.fmtstr = format;
   14865     ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
   14866     ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
   14867     ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
   14868     ctx.fmtpos = 0;
   14869 
   14870     _PyUnicodeWriter_Init(&ctx.writer);
   14871     ctx.writer.min_length = ctx.fmtcnt + 100;
   14872     ctx.writer.overallocate = 1;
   14873 
   14874     if (PyTuple_Check(args)) {
   14875         ctx.arglen = PyTuple_Size(args);
   14876         ctx.argidx = 0;
   14877     }
   14878     else {
   14879         ctx.arglen = -1;
   14880         ctx.argidx = -2;
   14881     }
   14882     ctx.args_owned = 0;
   14883     if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
   14884         ctx.dict = args;
   14885     else
   14886         ctx.dict = NULL;
   14887     ctx.args = args;
   14888 
   14889     while (--ctx.fmtcnt >= 0) {
   14890         if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
   14891             Py_ssize_t nonfmtpos;
   14892 
   14893             nonfmtpos = ctx.fmtpos++;
   14894             while (ctx.fmtcnt >= 0 &&
   14895                    PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
   14896                 ctx.fmtpos++;
   14897                 ctx.fmtcnt--;
   14898             }
   14899             if (ctx.fmtcnt < 0) {
   14900                 ctx.fmtpos--;
   14901                 ctx.writer.overallocate = 0;
   14902             }
   14903 
   14904             if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
   14905                                                 nonfmtpos, ctx.fmtpos) < 0)
   14906                 goto onError;
   14907         }
   14908         else {
   14909             ctx.fmtpos++;
   14910             if (unicode_format_arg(&ctx) == -1)
   14911                 goto onError;
   14912         }
   14913     }
   14914 
   14915     if (ctx.argidx < ctx.arglen && !ctx.dict) {
   14916         PyErr_SetString(PyExc_TypeError,
   14917                         "not all arguments converted during string formatting");
   14918         goto onError;
   14919     }
   14920 
   14921     if (ctx.args_owned) {
   14922         Py_DECREF(ctx.args);
   14923     }
   14924     return _PyUnicodeWriter_Finish(&ctx.writer);
   14925 
   14926   onError:
   14927     _PyUnicodeWriter_Dealloc(&ctx.writer);
   14928     if (ctx.args_owned) {
   14929         Py_DECREF(ctx.args);
   14930     }
   14931     return NULL;
   14932 }
   14933 
   14934 static PyObject *
   14935 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
   14936 
   14937 static PyObject *
   14938 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
   14939 {
   14940     PyObject *x = NULL;
   14941     static char *kwlist[] = {"object", "encoding", "errors", 0};
   14942     char *encoding = NULL;
   14943     char *errors = NULL;
   14944 
   14945     if (type != &PyUnicode_Type)
   14946         return unicode_subtype_new(type, args, kwds);
   14947     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
   14948                                      kwlist, &x, &encoding, &errors))
   14949         return NULL;
   14950     if (x == NULL)
   14951         _Py_RETURN_UNICODE_EMPTY();
   14952     if (encoding == NULL && errors == NULL)
   14953         return PyObject_Str(x);
   14954     else
   14955         return PyUnicode_FromEncodedObject(x, encoding, errors);
   14956 }
   14957 
   14958 static PyObject *
   14959 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
   14960 {
   14961     PyObject *unicode, *self;
   14962     Py_ssize_t length, char_size;
   14963     int share_wstr, share_utf8;
   14964     unsigned int kind;
   14965     void *data;
   14966 
   14967     assert(PyType_IsSubtype(type, &PyUnicode_Type));
   14968 
   14969     unicode = unicode_new(&PyUnicode_Type, args, kwds);
   14970     if (unicode == NULL)
   14971         return NULL;
   14972     assert(_PyUnicode_CHECK(unicode));
   14973     if (PyUnicode_READY(unicode) == -1) {
   14974         Py_DECREF(unicode);
   14975         return NULL;
   14976     }
   14977 
   14978     self = type->tp_alloc(type, 0);
   14979     if (self == NULL) {
   14980         Py_DECREF(unicode);
   14981         return NULL;
   14982     }
   14983     kind = PyUnicode_KIND(unicode);
   14984     length = PyUnicode_GET_LENGTH(unicode);
   14985 
   14986     _PyUnicode_LENGTH(self) = length;
   14987 #ifdef Py_DEBUG
   14988     _PyUnicode_HASH(self) = -1;
   14989 #else
   14990     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
   14991 #endif
   14992     _PyUnicode_STATE(self).interned = 0;
   14993     _PyUnicode_STATE(self).kind = kind;
   14994     _PyUnicode_STATE(self).compact = 0;
   14995     _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
   14996     _PyUnicode_STATE(self).ready = 1;
   14997     _PyUnicode_WSTR(self) = NULL;
   14998     _PyUnicode_UTF8_LENGTH(self) = 0;
   14999     _PyUnicode_UTF8(self) = NULL;
   15000     _PyUnicode_WSTR_LENGTH(self) = 0;
   15001     _PyUnicode_DATA_ANY(self) = NULL;
   15002 
   15003     share_utf8 = 0;
   15004     share_wstr = 0;
   15005     if (kind == PyUnicode_1BYTE_KIND) {
   15006         char_size = 1;
   15007         if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
   15008             share_utf8 = 1;
   15009     }
   15010     else if (kind == PyUnicode_2BYTE_KIND) {
   15011         char_size = 2;
   15012         if (sizeof(wchar_t) == 2)
   15013             share_wstr = 1;
   15014     }
   15015     else {
   15016         assert(kind == PyUnicode_4BYTE_KIND);
   15017         char_size = 4;
   15018         if (sizeof(wchar_t) == 4)
   15019             share_wstr = 1;
   15020     }
   15021 
   15022     /* Ensure we won't overflow the length. */
   15023     if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
   15024         PyErr_NoMemory();
   15025         goto onError;
   15026     }
   15027     data = PyObject_MALLOC((length + 1) * char_size);
   15028     if (data == NULL) {
   15029         PyErr_NoMemory();
   15030         goto onError;
   15031     }
   15032 
   15033     _PyUnicode_DATA_ANY(self) = data;
   15034     if (share_utf8) {
   15035         _PyUnicode_UTF8_LENGTH(self) = length;
   15036         _PyUnicode_UTF8(self) = data;
   15037     }
   15038     if (share_wstr) {
   15039         _PyUnicode_WSTR_LENGTH(self) = length;
   15040         _PyUnicode_WSTR(self) = (wchar_t *)data;
   15041     }
   15042 
   15043     memcpy(data, PyUnicode_DATA(unicode),
   15044               kind * (length + 1));
   15045     assert(_PyUnicode_CheckConsistency(self, 1));
   15046 #ifdef Py_DEBUG
   15047     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
   15048 #endif
   15049     Py_DECREF(unicode);
   15050     return self;
   15051 
   15052 onError:
   15053     Py_DECREF(unicode);
   15054     Py_DECREF(self);
   15055     return NULL;
   15056 }
   15057 
   15058 PyDoc_STRVAR(unicode_doc,
   15059 "str(object='') -> str\n\
   15060 str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
   15061 \n\
   15062 Create a new string object from the given object. If encoding or\n\
   15063 errors is specified, then the object must expose a data buffer\n\
   15064 that will be decoded using the given encoding and error handler.\n\
   15065 Otherwise, returns the result of object.__str__() (if defined)\n\
   15066 or repr(object).\n\
   15067 encoding defaults to sys.getdefaultencoding().\n\
   15068 errors defaults to 'strict'.");
   15069 
   15070 static PyObject *unicode_iter(PyObject *seq);
   15071 
   15072 PyTypeObject PyUnicode_Type = {     /* type object named "str"; slots are positional, keep their order */
   15073     PyVarObject_HEAD_INIT(&PyType_Type, 0)
   15074     "str",              /* tp_name */
   15075     sizeof(PyUnicodeObject),        /* tp_basicsize */
   15076     0,                  /* tp_itemsize */
   15077     /* Slots (a 0 entry leaves the slot unset in this table) */
   15078     (destructor)unicode_dealloc,    /* tp_dealloc */
   15079     0,                  /* tp_print */
   15080     0,                  /* tp_getattr */
   15081     0,                  /* tp_setattr */
   15082     0,                  /* tp_reserved */
   15083     unicode_repr,           /* tp_repr */
   15084     &unicode_as_number,         /* tp_as_number */
   15085     &unicode_as_sequence,       /* tp_as_sequence */
   15086     &unicode_as_mapping,        /* tp_as_mapping */
   15087     (hashfunc) unicode_hash,        /* tp_hash */
   15088     0,                  /* tp_call */
   15089     (reprfunc) unicode_str,     /* tp_str */
   15090     PyObject_GenericGetAttr,        /* tp_getattro */
   15091     0,                  /* tp_setattro */
   15092     0,                  /* tp_as_buffer */
   15093     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
   15094     Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
   15095     unicode_doc,            /* tp_doc */
   15096     0,                  /* tp_traverse */
   15097     0,                  /* tp_clear */
   15098     PyUnicode_RichCompare,      /* tp_richcompare */
   15099     0,                  /* tp_weaklistoffset */
   15100     unicode_iter,           /* tp_iter */
   15101     0,                  /* tp_iternext */
   15102     unicode_methods,            /* tp_methods */
   15103     0,                  /* tp_members */
   15104     0,                  /* tp_getset */
   15105     &PyBaseObject_Type,         /* tp_base */
   15106     0,                  /* tp_dict */
   15107     0,                  /* tp_descr_get */
   15108     0,                  /* tp_descr_set */
   15109     0,                  /* tp_dictoffset */
   15110     0,                  /* tp_init */
   15111     0,                  /* tp_alloc */
   15112     unicode_new,            /* tp_new; hands subclasses over to unicode_subtype_new() */
   15113     PyObject_Del,           /* tp_free */
   15114 };
   15115 
   15116 /* Initialize the Unicode implementation */
   15117 
   15118 int _PyUnicode_Init(void)
   15119 {
   15120     /* XXX - move this array to unicodectype.c ? */
   15121     Py_UCS2 linebreak[] = {
   15122         0x000A, /* LINE FEED */
   15123         0x000D, /* CARRIAGE RETURN */
   15124         0x001C, /* FILE SEPARATOR */
   15125         0x001D, /* GROUP SEPARATOR */
   15126         0x001E, /* RECORD SEPARATOR */
   15127         0x0085, /* NEXT LINE */
   15128         0x2028, /* LINE SEPARATOR */
   15129         0x2029, /* PARAGRAPH SEPARATOR */
   15130     };
   15131 
   15132     /* Init the implementation */
   15133     _Py_INCREF_UNICODE_EMPTY();
   15134     if (!unicode_empty)
   15135         Py_FatalError("Can't create empty string");
   15136     Py_DECREF(unicode_empty);
   15137 
   15138     if (PyType_Ready(&PyUnicode_Type) < 0)
   15139         Py_FatalError("Can't initialize 'unicode'");
   15140 
   15141     /* initialize the linebreak bloom filter */
   15142     bloom_linebreak = make_bloom_mask(
   15143         PyUnicode_2BYTE_KIND, linebreak,
   15144         Py_ARRAY_LENGTH(linebreak));
   15145 
   15146     if (PyType_Ready(&EncodingMapType) < 0)
   15147          Py_FatalError("Can't initialize encoding map type");
   15148 
   15149     if (PyType_Ready(&PyFieldNameIter_Type) < 0)
   15150         Py_FatalError("Can't initialize field name iterator type");
   15151 
   15152     if (PyType_Ready(&PyFormatterIter_Type) < 0)
   15153         Py_FatalError("Can't initialize formatter iter type");
   15154 
   15155     return 0;
   15156 }
   15157 
   15158 /* Finalize the Unicode implementation */
   15159 
   int
   PyUnicode_ClearFreeList(void)
   {
       /* This implementation releases nothing and always reports 0. */
       const int freed = 0;

       return freed;
   }
   15165 
   15166 void
   15167 _PyUnicode_Fini(void)
   15168 {
   15169     int i;
   15170 
   15171     Py_CLEAR(unicode_empty);
   15172 
   15173     for (i = 0; i < 256; i++)
   15174         Py_CLEAR(unicode_latin1[i]);
   15175     _PyUnicode_ClearStaticStrings();
   15176     (void)PyUnicode_ClearFreeList();
   15177 }
   15178 
   /* Intern the string referenced by *p, in place.

      Nothing happens when *p is NULL or not a str (asserted instead in
      Py_DEBUG builds), when it is an instance of a str subclass, or when it
      is already interned.  The `interned` dict is created on first use.
      Errors from creating or updating that dict are cleared and the string
      is simply left un-interned.

      If the dict already holds a different (equal) string, *p is replaced
      by a new reference to that string.  Otherwise the string itself has
      been stored in the dict and is marked SSTATE_INTERNED_MORTAL. */
   15179 void
   15180 PyUnicode_InternInPlace(PyObject **p)
   15181 {
   15182     PyObject *s = *p;
   15183     PyObject *t;
   15184 #ifdef Py_DEBUG
   15185     assert(s != NULL);
   15186     assert(_PyUnicode_CHECK(s));
   15187 #else
   15188     if (s == NULL || !PyUnicode_Check(s))
   15189         return;
   15190 #endif
   15191     /* If it's a subclass, we don't really know what putting
   15192        it in the interned dict might do. */
   15193     if (!PyUnicode_CheckExact(s))
   15194         return;
   15195     if (PyUnicode_CHECK_INTERNED(s))
   15196         return;
   15197     if (interned == NULL) {
   15198         interned = PyDict_New();
   15199         if (interned == NULL) {
   15200             PyErr_Clear(); /* Don't leave an exception */
   15201             return;
   15202         }
   15203     }
             /* t is the entry now stored under the key s: either s itself
                (newly inserted) or an equal string inserted earlier. */
   15204     Py_ALLOW_RECURSION
   15205     t = PyDict_SetDefault(interned, s, s);
   15206     Py_END_ALLOW_RECURSION
   15207     if (t == NULL) {
   15208         PyErr_Clear();
   15209         return;
   15210     }
   15211     if (t != s) {
                 /* An equal string was interned before: hand that one back
                    to the caller in place of s. */
   15212         Py_INCREF(t);
   15213         Py_SETREF(*p, t);
   15214         return;
   15215     }
   15216     /* The two references in interned are not counted by refcnt.
   15217        The deallocator will take care of this */
   15218     Py_REFCNT(s) -= 2;
   15219     _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
   15220 }
   15221 
   15222 void
   15223 PyUnicode_InternImmortal(PyObject **p)
   15224 {
   15225     PyUnicode_InternInPlace(p);
   15226     if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
   15227         _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
   15228         Py_INCREF(*p);
   15229     }
   15230 }
   15231 
   15232 PyObject *
   15233 PyUnicode_InternFromString(const char *cp)
   15234 {
   15235     PyObject *s = PyUnicode_FromString(cp);
   15236     if (s == NULL)
   15237         return NULL;
   15238     PyUnicode_InternInPlace(&s);
   15239     return s;
   15240 }
   15241 
   15242 void
   15243 _Py_ReleaseInternedUnicodeStrings(void)
   15244 {
   15245     PyObject *keys;
   15246     PyObject *s;
   15247     Py_ssize_t i, n;
   15248     Py_ssize_t immortal_size = 0, mortal_size = 0;
   15249 
   15250     if (interned == NULL || !PyDict_Check(interned))
   15251         return;
   15252     keys = PyDict_Keys(interned);
   15253     if (keys == NULL || !PyList_Check(keys)) {
   15254         PyErr_Clear();
   15255         return;
   15256     }
   15257 
   15258     /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
   15259        detector, interned unicode strings are not forcibly deallocated;
   15260        rather, we give them their stolen references back, and then clear
   15261        and DECREF the interned dict. */
   15262 
   15263     n = PyList_GET_SIZE(keys);
   15264     fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
   15265             n);
   15266     for (i = 0; i < n; i++) {
   15267         s = PyList_GET_ITEM(keys, i);
   15268         if (PyUnicode_READY(s) == -1) {
   15269             assert(0 && "could not ready string");
   15270             fprintf(stderr, "could not ready string\n");
   15271         }
   15272         switch (PyUnicode_CHECK_INTERNED(s)) {
   15273         case SSTATE_NOT_INTERNED:
   15274             /* XXX Shouldn't happen */
   15275             break;
   15276         case SSTATE_INTERNED_IMMORTAL:
   15277             Py_REFCNT(s) += 1;
   15278             immortal_size += PyUnicode_GET_LENGTH(s);
   15279             break;
   15280         case SSTATE_INTERNED_MORTAL:
   15281             Py_REFCNT(s) += 2;
   15282             mortal_size += PyUnicode_GET_LENGTH(s);
   15283             break;
   15284         default:
   15285             Py_FatalError("Inconsistent interned string state.");
   15286         }
   15287         _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
   15288     }
   15289     fprintf(stderr, "total size of all interned strings: "
   15290             "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
   15291             "mortal/immortal\n", mortal_size, immortal_size);
   15292     Py_DECREF(keys);
   15293     PyDict_Clear(interned);
   15294     Py_CLEAR(interned);
   15295 }
   15296 
   15297 
   15298 /********************* Unicode Iterator **************************/
   15299 
   /* State of a str iterator (the object created by unicode_iter()). */
   15300 typedef struct {
   15301     PyObject_HEAD
   15302     Py_ssize_t it_index; /* Index of the next character to return */
   15303     PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
   15304 } unicodeiterobject;
   15305 
   /* Destructor for str iterators. */
   15306 static void
   15307 unicodeiter_dealloc(unicodeiterobject *it)
   15308 {
             /* Stop GC tracking before the object is taken apart. */
   15309     _PyObject_GC_UNTRACK(it);
             /* it_seq is NULL once the iterator is exhausted, hence XDECREF. */
   15310     Py_XDECREF(it->it_seq);
   15311     PyObject_GC_Del(it);
   15312 }
   15313 
   /* GC traversal for str iterators: the only object reference held is
      it_seq.  Py_VISIT uses the `visit` and `arg` parameters by name. */
   15314 static int
   15315 unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
   15316 {
   15317     Py_VISIT(it->it_seq);
   15318     return 0;
   15319 }
   15320 
   15321 static PyObject *
   15322 unicodeiter_next(unicodeiterobject *it)
   15323 {
   15324     PyObject *seq, *item;
   15325 
   15326     assert(it != NULL);
   15327     seq = it->it_seq;
   15328     if (seq == NULL)
   15329         return NULL;
   15330     assert(_PyUnicode_CHECK(seq));
   15331 
   15332     if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
   15333         int kind = PyUnicode_KIND(seq);
   15334         void *data = PyUnicode_DATA(seq);
   15335         Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
   15336         item = PyUnicode_FromOrdinal(chr);
   15337         if (item != NULL)
   15338             ++it->it_index;
   15339         return item;
   15340     }
   15341 
   15342     it->it_seq = NULL;
   15343     Py_DECREF(seq);
   15344     return NULL;
   15345 }
   15346 
   15347 static PyObject *
   15348 unicodeiter_len(unicodeiterobject *it)
   15349 {
   15350     Py_ssize_t len = 0;
   15351     if (it->it_seq)
   15352         len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
   15353     return PyLong_FromSsize_t(len);
   15354 }
   15355 
   15356 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
   15357 
   15358 static PyObject *
   15359 unicodeiter_reduce(unicodeiterobject *it)
   15360 {
   15361     if (it->it_seq != NULL) {
   15362         return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
   15363                              it->it_seq, it->it_index);
   15364     } else {
   15365         PyObject *u = PyUnicode_FromUnicode(NULL, 0);
   15366         if (u == NULL)
   15367             return NULL;
   15368         return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
   15369     }
   15370 }
   15371 
   15372 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
   15373 
   15374 static PyObject *
   15375 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
   15376 {
   15377     Py_ssize_t index = PyLong_AsSsize_t(state);
   15378     if (index == -1 && PyErr_Occurred())
   15379         return NULL;
   15380     if (it->it_seq != NULL) {
   15381         if (index < 0)
   15382             index = 0;
   15383         else if (index > PyUnicode_GET_LENGTH(it->it_seq))
   15384             index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
   15385         it->it_index = index;
   15386     }
   15387     Py_RETURN_NONE;
   15388 }
   15389 
   15390 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
   15391 
   /* Method table of the str iterator type (installed as tp_methods of
      PyUnicodeIter_Type). */
   15392 static PyMethodDef unicodeiter_methods[] = {
   15393     {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
   15394      length_hint_doc},
   15395     {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
   15396      reduce_doc},
   15397     {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
   15398      setstate_doc},
   15399     {NULL,      NULL}       /* sentinel */
   15400 };
   15401 
   /* Type object of the iterator created by unicode_iter().  Instances are
      GC-tracked (Py_TPFLAGS_HAVE_GC) and are their own iterator. */
   15402 PyTypeObject PyUnicodeIter_Type = {
   15403     PyVarObject_HEAD_INIT(&PyType_Type, 0)
   15404     "str_iterator",         /* tp_name */
   15405     sizeof(unicodeiterobject),      /* tp_basicsize */
   15406     0,                  /* tp_itemsize */
   15407     /* methods */
   15408     (destructor)unicodeiter_dealloc,    /* tp_dealloc */
   15409     0,                  /* tp_print */
   15410     0,                  /* tp_getattr */
   15411     0,                  /* tp_setattr */
   15412     0,                  /* tp_reserved */
   15413     0,                  /* tp_repr */
   15414     0,                  /* tp_as_number */
   15415     0,                  /* tp_as_sequence */
   15416     0,                  /* tp_as_mapping */
   15417     0,                  /* tp_hash */
   15418     0,                  /* tp_call */
   15419     0,                  /* tp_str */
   15420     PyObject_GenericGetAttr,        /* tp_getattro */
   15421     0,                  /* tp_setattro */
   15422     0,                  /* tp_as_buffer */
   15423     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
   15424     0,                  /* tp_doc */
   15425     (traverseproc)unicodeiter_traverse, /* tp_traverse */
   15426     0,                  /* tp_clear */
   15427     0,                  /* tp_richcompare */
   15428     0,                  /* tp_weaklistoffset */
   15429     PyObject_SelfIter,          /* tp_iter */
   15430     (iternextfunc)unicodeiter_next,     /* tp_iternext */
   15431     unicodeiter_methods,            /* tp_methods */
   15432     0,                  /* tp_members */
   15433 };
   15434 
   15435 static PyObject *
   15436 unicode_iter(PyObject *seq)
   15437 {
   15438     unicodeiterobject *it;
   15439 
   15440     if (!PyUnicode_Check(seq)) {
   15441         PyErr_BadInternalCall();
   15442         return NULL;
   15443     }
   15444     if (PyUnicode_READY(seq) == -1)
   15445         return NULL;
   15446     it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
   15447     if (it == NULL)
   15448         return NULL;
   15449     it->it_index = 0;
   15450     Py_INCREF(seq);
   15451     it->it_seq = seq;
   15452     _PyObject_GC_TRACK(it);
   15453     return (PyObject *)it;
   15454 }
   15455 
   15456 
   15457 size_t
   15458 Py_UNICODE_strlen(const Py_UNICODE *u)
   15459 {
   15460     int res = 0;
   15461     while(*u++)
   15462         res++;
   15463     return res;
   15464 }
   15465 
   15466 Py_UNICODE*
   15467 Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
   15468 {
   15469     Py_UNICODE *u = s1;
   15470     while ((*u++ = *s2++));
   15471     return s1;
   15472 }
   15473 
   15474 Py_UNICODE*
   15475 Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
   15476 {
   15477     Py_UNICODE *u = s1;
   15478     while ((*u++ = *s2++))
   15479         if (n-- == 0)
   15480             break;
   15481     return s1;
   15482 }
   15483 
   15484 Py_UNICODE*
   15485 Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
   15486 {
   15487     Py_UNICODE *u1 = s1;
   15488     u1 += Py_UNICODE_strlen(u1);
   15489     Py_UNICODE_strcpy(u1, s2);
   15490     return s1;
   15491 }
   15492 
   15493 int
   15494 Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
   15495 {
   15496     while (*s1 && *s2 && *s1 == *s2)
   15497         s1++, s2++;
   15498     if (*s1 && *s2)
   15499         return (*s1 < *s2) ? -1 : +1;
   15500     if (*s1)
   15501         return 1;
   15502     if (*s2)
   15503         return -1;
   15504     return 0;
   15505 }
   15506 
   15507 int
   15508 Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
   15509 {
   15510     Py_UNICODE u1, u2;
   15511     for (; n != 0; n--) {
   15512         u1 = *s1;
   15513         u2 = *s2;
   15514         if (u1 != u2)
   15515             return (u1 < u2) ? -1 : +1;
   15516         if (u1 == '\0')
   15517             return 0;
   15518         s1++;
   15519         s2++;
   15520     }
   15521     return 0;
   15522 }
   15523 
   15524 Py_UNICODE*
   15525 Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
   15526 {
   15527     const Py_UNICODE *p;
   15528     for (p = s; *p; p++)
   15529         if (*p == c)
   15530             return (Py_UNICODE*)p;
   15531     return NULL;
   15532 }
   15533 
   15534 Py_UNICODE*
   15535 Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
   15536 {
   15537     const Py_UNICODE *p;
   15538     p = s + Py_UNICODE_strlen(s);
   15539     while (p != s) {
   15540         p--;
   15541         if (*p == c)
   15542             return (Py_UNICODE*)p;
   15543     }
   15544     return NULL;
   15545 }
   15546 
   15547 Py_UNICODE*
   15548 PyUnicode_AsUnicodeCopy(PyObject *unicode)
   15549 {
   15550     Py_UNICODE *u, *copy;
   15551     Py_ssize_t len, size;
   15552 
   15553     if (!PyUnicode_Check(unicode)) {
   15554         PyErr_BadArgument();
   15555         return NULL;
   15556     }
   15557     u = PyUnicode_AsUnicodeAndSize(unicode, &len);
   15558     if (u == NULL)
   15559         return NULL;
   15560     /* Ensure we won't overflow the size. */
   15561     if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
   15562         PyErr_NoMemory();
   15563         return NULL;
   15564     }
   15565     size = len + 1; /* copy the null character */
   15566     size *= sizeof(Py_UNICODE);
   15567     copy = PyMem_Malloc(size);
   15568     if (copy == NULL) {
   15569         PyErr_NoMemory();
   15570         return NULL;
   15571     }
   15572     memcpy(copy, u, size);
   15573     return copy;
   15574 }
   15575 
   15576 /* A _string module, to export formatter_parser and formatter_field_name_split
   15577    to the string.Formatter class implemented in Python. */
   15578 
   /* Functions exported by the _string module.  Both take exactly one
      argument (METH_O). */
   15579 static PyMethodDef _string_methods[] = {
   15580     {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
   15581      METH_O, PyDoc_STR("split the argument as a field name")},
   15582     {"formatter_parser", (PyCFunction) formatter_parser,
   15583      METH_O, PyDoc_STR("parse the argument as a format string")},
   15584     {NULL, NULL}    /* sentinel */
   15585 };
   15586 
   /* Module definition of _string, passed to PyModule_Create() by
      PyInit__string(). */
   15587 static struct PyModuleDef _string_module = {
   15588     PyModuleDef_HEAD_INIT,
   15589     "_string",                          /* m_name */
   15590     PyDoc_STR("string helper module"),  /* m_doc */
   15591     0,                                  /* m_size */
   15592     _string_methods,                    /* m_methods */
             /* The remaining optional fields are unused. */
   15593     NULL,
   15594     NULL,
   15595     NULL,
   15596     NULL
   15597 };
   15598 
   15599 PyMODINIT_FUNC
   15600 PyInit__string(void)
   15601 {
   15602     return PyModule_Create(&_string_module);
   15603 }
   15604 
   15605 
   15606 #ifdef __cplusplus
   15607 }
   15608 #endif
   15609