Home | History | Annotate | Download | only in Objects
      1 /*
      2 
      3 Unicode implementation based on original code by Fredrik Lundh,
      4 modified by Marc-Andre Lemburg <mal (at) lemburg.com>.
      5 
      6 Major speed upgrades to the method implementations at the Reykjavik
      7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
      8 
      9 Copyright (c) Corporation for National Research Initiatives.
     10 
     11 --------------------------------------------------------------------
     12 The original string type implementation is:
     13 
     14   Copyright (c) 1999 by Secret Labs AB
     15   Copyright (c) 1999 by Fredrik Lundh
     16 
     17 By obtaining, using, and/or copying this software and/or its
     18 associated documentation, you agree that you have read, understood,
     19 and will comply with the following terms and conditions:
     20 
     21 Permission to use, copy, modify, and distribute this software and its
     22 associated documentation for any purpose and without fee is hereby
     23 granted, provided that the above copyright notice appears in all
     24 copies, and that both that copyright notice and this permission notice
     25 appear in supporting documentation, and that the name of Secret Labs
     26 AB or the author not be used in advertising or publicity pertaining to
     27 distribution of the software without specific, written prior
     28 permission.
     29 
     30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
     31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
     32 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
     33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
     36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     37 --------------------------------------------------------------------
     38 
     39 */
     40 
     41 #define PY_SSIZE_T_CLEAN
     42 #include "Python.h"
     43 #include "internal/pystate.h"
     44 #include "ucnhash.h"
     45 #include "bytes_methods.h"
     46 #include "stringlib/eq.h"
     47 
     48 #ifdef MS_WINDOWS
     49 #include <windows.h>
     50 #endif
     51 
     52 /*[clinic input]
     53 class str "PyObject *" "&PyUnicode_Type"
     54 [clinic start generated code]*/
     55 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
     56 
     57 /*[python input]
     58 class Py_UCS4_converter(CConverter):
     59     type = 'Py_UCS4'
     60     converter = 'convert_uc'
     61 
     62     def converter_init(self):
     63         if self.default is not unspecified:
     64             self.c_default = ascii(self.default)
     65             if len(self.c_default) > 4 or self.c_default[0] != "'":
     66                 self.c_default = hex(ord(self.default))
     67 
     68 [python start generated code]*/
     69 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
     70 
     71 /* --- Globals ------------------------------------------------------------
     72 
     73 NOTE: In the interpreter's initialization phase, some globals are currently
     74       initialized dynamically as needed. In the process Unicode objects may
     75       be created before the Unicode type is ready.
     76 
     77 */
     78 
     79 
     80 #ifdef __cplusplus
     81 extern "C" {
     82 #endif
     83 
     84 /* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
     85 #define MAX_UNICODE 0x10ffff
     86 
     87 #ifdef Py_DEBUG
     88 #  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
     89 #else
     90 #  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
     91 #endif
     92 
     93 #define _PyUnicode_UTF8(op)                             \
     94     (((PyCompactUnicodeObject*)(op))->utf8)
     95 #define PyUnicode_UTF8(op)                              \
     96     (assert(_PyUnicode_CHECK(op)),                      \
     97      assert(PyUnicode_IS_READY(op)),                    \
     98      PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     99          ((char*)((PyASCIIObject*)(op) + 1)) :          \
    100          _PyUnicode_UTF8(op))
    101 #define _PyUnicode_UTF8_LENGTH(op)                      \
    102     (((PyCompactUnicodeObject*)(op))->utf8_length)
    103 #define PyUnicode_UTF8_LENGTH(op)                       \
    104     (assert(_PyUnicode_CHECK(op)),                      \
    105      assert(PyUnicode_IS_READY(op)),                    \
    106      PyUnicode_IS_COMPACT_ASCII(op) ?                   \
    107          ((PyASCIIObject*)(op))->length :               \
    108          _PyUnicode_UTF8_LENGTH(op))
    109 #define _PyUnicode_WSTR(op)                             \
    110     (((PyASCIIObject*)(op))->wstr)
    111 #define _PyUnicode_WSTR_LENGTH(op)                      \
    112     (((PyCompactUnicodeObject*)(op))->wstr_length)
    113 #define _PyUnicode_LENGTH(op)                           \
    114     (((PyASCIIObject *)(op))->length)
    115 #define _PyUnicode_STATE(op)                            \
    116     (((PyASCIIObject *)(op))->state)
    117 #define _PyUnicode_HASH(op)                             \
    118     (((PyASCIIObject *)(op))->hash)
    119 #define _PyUnicode_KIND(op)                             \
    120     (assert(_PyUnicode_CHECK(op)),                      \
    121      ((PyASCIIObject *)(op))->state.kind)
    122 #define _PyUnicode_GET_LENGTH(op)                       \
    123     (assert(_PyUnicode_CHECK(op)),                      \
    124      ((PyASCIIObject *)(op))->length)
    125 #define _PyUnicode_DATA_ANY(op)                         \
    126     (((PyUnicodeObject*)(op))->data.any)
    127 
    128 #undef PyUnicode_READY
    129 #define PyUnicode_READY(op)                             \
    130     (assert(_PyUnicode_CHECK(op)),                      \
    131      (PyUnicode_IS_READY(op) ?                          \
    132       0 :                                               \
    133       _PyUnicode_Ready(op)))
    134 
    135 #define _PyUnicode_SHARE_UTF8(op)                       \
    136     (assert(_PyUnicode_CHECK(op)),                      \
    137      assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
    138      (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
    139 #define _PyUnicode_SHARE_WSTR(op)                       \
    140     (assert(_PyUnicode_CHECK(op)),                      \
    141      (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op)))
    142 
    143 /* true if the Unicode object has an allocated UTF-8 memory block
    144    (not shared with other data) */
    145 #define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    146     ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
    147       && _PyUnicode_UTF8(op)                            \
    148       && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
    149 
    150 /* true if the Unicode object has an allocated wstr memory block
    151    (not shared with other data) */
    152 #define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    153     ((_PyUnicode_WSTR(op) &&                            \
    154       (!PyUnicode_IS_READY(op) ||                       \
    155        _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
    156 
    157 /* Generic helper macro to convert characters of different types.
    158    from_type and to_type have to be valid type names, begin and end
    159    are pointers to the source characters which should be of type
    160    "from_type *".  to is a pointer of type "to_type *" and points to the
    161    buffer where the result characters are written to. */
    162 #define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    163     do {                                                \
    164         to_type *_to = (to_type *)(to);                \
    165         const from_type *_iter = (from_type *)(begin);  \
    166         const from_type *_end = (from_type *)(end);     \
    167         Py_ssize_t n = (_end) - (_iter);                \
    168         const from_type *_unrolled_end =                \
    169             _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
    170         while (_iter < (_unrolled_end)) {               \
    171             _to[0] = (to_type) _iter[0];                \
    172             _to[1] = (to_type) _iter[1];                \
    173             _to[2] = (to_type) _iter[2];                \
    174             _to[3] = (to_type) _iter[3];                \
    175             _iter += 4; _to += 4;                       \
    176         }                                               \
    177         while (_iter < (_end))                          \
    178             *_to++ = (to_type) *_iter++;                \
    179     } while (0)
    180 
    181 #ifdef MS_WINDOWS
    182    /* On Windows, overallocate by 50% is the best factor */
    183 #  define OVERALLOCATE_FACTOR 2
    184 #else
    185    /* On Linux, overallocate by 25% is the best factor */
    186 #  define OVERALLOCATE_FACTOR 4
    187 #endif
    188 
    189 /* This dictionary holds all interned unicode strings.  Note that references
    190    to strings in this dictionary are *not* counted in the string's ob_refcnt.
    191    When the interned string reaches a refcnt of 0 the string deallocation
    192    function will delete the reference from this dictionary.
    193 
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
    196 */
    197 static PyObject *interned = NULL;
    198 
    199 /* The empty Unicode object is shared to improve performance. */
    200 static PyObject *unicode_empty = NULL;
    201 
    202 #define _Py_INCREF_UNICODE_EMPTY()                      \
    203     do {                                                \
    204         if (unicode_empty != NULL)                      \
    205             Py_INCREF(unicode_empty);                   \
    206         else {                                          \
    207             unicode_empty = PyUnicode_New(0, 0);        \
    208             if (unicode_empty != NULL) {                \
    209                 Py_INCREF(unicode_empty);               \
    210                 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
    211             }                                           \
    212         }                                               \
    213     } while (0)
    214 
    215 #define _Py_RETURN_UNICODE_EMPTY()                      \
    216     do {                                                \
    217         _Py_INCREF_UNICODE_EMPTY();                     \
    218         return unicode_empty;                           \
    219     } while (0)
    220 
    221 #define FILL(kind, data, value, start, length) \
    222     do { \
    223         assert(0 <= start); \
    224         assert(kind != PyUnicode_WCHAR_KIND); \
    225         switch (kind) { \
    226         case PyUnicode_1BYTE_KIND: { \
    227             assert(value <= 0xff); \
    228             Py_UCS1 ch = (unsigned char)value; \
    229             Py_UCS1 *to = (Py_UCS1 *)data + start; \
    230             memset(to, ch, length); \
    231             break; \
    232         } \
    233         case PyUnicode_2BYTE_KIND: { \
    234             assert(value <= 0xffff); \
    235             Py_UCS2 ch = (Py_UCS2)value; \
    236             Py_UCS2 *to = (Py_UCS2 *)data + start; \
    237             const Py_UCS2 *end = to + length; \
    238             for (; to < end; ++to) *to = ch; \
    239             break; \
    240         } \
    241         case PyUnicode_4BYTE_KIND: { \
    242             assert(value <= MAX_UNICODE); \
    243             Py_UCS4 ch = value; \
    244             Py_UCS4 * to = (Py_UCS4 *)data + start; \
    245             const Py_UCS4 *end = to + length; \
    246             for (; to < end; ++to) *to = ch; \
    247             break; \
    248         } \
    249         default: Py_UNREACHABLE(); \
    250         } \
    251     } while (0)
    252 
    253 
    254 /* Forward declaration */
    255 static inline int
    256 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
    257 
    258 /* List of static strings. */
    259 static _Py_Identifier *static_strings = NULL;
    260 
    261 /* Single character Unicode strings in the Latin-1 range are being
    262    shared as well. */
    263 static PyObject *unicode_latin1[256] = {NULL};
    264 
    265 /* Fast detection of the most frequent whitespace characters */
    266 const unsigned char _Py_ascii_whitespace[] = {
    267     0, 0, 0, 0, 0, 0, 0, 0,
    268 /*     case 0x0009: * CHARACTER TABULATION */
    269 /*     case 0x000A: * LINE FEED */
    270 /*     case 0x000B: * LINE TABULATION */
    271 /*     case 0x000C: * FORM FEED */
    272 /*     case 0x000D: * CARRIAGE RETURN */
    273     0, 1, 1, 1, 1, 1, 0, 0,
    274     0, 0, 0, 0, 0, 0, 0, 0,
    275 /*     case 0x001C: * FILE SEPARATOR */
    276 /*     case 0x001D: * GROUP SEPARATOR */
    277 /*     case 0x001E: * RECORD SEPARATOR */
    278 /*     case 0x001F: * UNIT SEPARATOR */
    279     0, 0, 0, 0, 1, 1, 1, 1,
    280 /*     case 0x0020: * SPACE */
    281     1, 0, 0, 0, 0, 0, 0, 0,
    282     0, 0, 0, 0, 0, 0, 0, 0,
    283     0, 0, 0, 0, 0, 0, 0, 0,
    284     0, 0, 0, 0, 0, 0, 0, 0,
    285 
    286     0, 0, 0, 0, 0, 0, 0, 0,
    287     0, 0, 0, 0, 0, 0, 0, 0,
    288     0, 0, 0, 0, 0, 0, 0, 0,
    289     0, 0, 0, 0, 0, 0, 0, 0,
    290     0, 0, 0, 0, 0, 0, 0, 0,
    291     0, 0, 0, 0, 0, 0, 0, 0,
    292     0, 0, 0, 0, 0, 0, 0, 0,
    293     0, 0, 0, 0, 0, 0, 0, 0
    294 };
    295 
    296 /* forward */
    297 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
    298 static PyObject* get_latin1_char(unsigned char ch);
    299 static int unicode_modifiable(PyObject *unicode);
    300 
    301 
    302 static PyObject *
    303 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
    304 static PyObject *
    305 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
    306 static PyObject *
    307 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
    308 
    309 static PyObject *
    310 unicode_encode_call_errorhandler(const char *errors,
    311        PyObject **errorHandler,const char *encoding, const char *reason,
    312        PyObject *unicode, PyObject **exceptionObject,
    313        Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
    314 
    315 static void
    316 raise_encode_exception(PyObject **exceptionObject,
    317                        const char *encoding,
    318                        PyObject *unicode,
    319                        Py_ssize_t startpos, Py_ssize_t endpos,
    320                        const char *reason);
    321 
    322 /* Same for linebreaks */
    323 static const unsigned char ascii_linebreak[] = {
    324     0, 0, 0, 0, 0, 0, 0, 0,
    325 /*         0x000A, * LINE FEED */
    326 /*         0x000B, * LINE TABULATION */
    327 /*         0x000C, * FORM FEED */
    328 /*         0x000D, * CARRIAGE RETURN */
    329     0, 0, 1, 1, 1, 1, 0, 0,
    330     0, 0, 0, 0, 0, 0, 0, 0,
    331 /*         0x001C, * FILE SEPARATOR */
    332 /*         0x001D, * GROUP SEPARATOR */
    333 /*         0x001E, * RECORD SEPARATOR */
    334     0, 0, 0, 0, 1, 1, 1, 0,
    335     0, 0, 0, 0, 0, 0, 0, 0,
    336     0, 0, 0, 0, 0, 0, 0, 0,
    337     0, 0, 0, 0, 0, 0, 0, 0,
    338     0, 0, 0, 0, 0, 0, 0, 0,
    339 
    340     0, 0, 0, 0, 0, 0, 0, 0,
    341     0, 0, 0, 0, 0, 0, 0, 0,
    342     0, 0, 0, 0, 0, 0, 0, 0,
    343     0, 0, 0, 0, 0, 0, 0, 0,
    344     0, 0, 0, 0, 0, 0, 0, 0,
    345     0, 0, 0, 0, 0, 0, 0, 0,
    346     0, 0, 0, 0, 0, 0, 0, 0,
    347     0, 0, 0, 0, 0, 0, 0, 0
    348 };
    349 
    350 static int convert_uc(PyObject *obj, void *addr);
    351 
    352 #include "clinic/unicodeobject.c.h"
    353 
    354 typedef enum {
    355     _Py_ERROR_UNKNOWN=0,
    356     _Py_ERROR_STRICT,
    357     _Py_ERROR_SURROGATEESCAPE,
    358     _Py_ERROR_REPLACE,
    359     _Py_ERROR_IGNORE,
    360     _Py_ERROR_BACKSLASHREPLACE,
    361     _Py_ERROR_SURROGATEPASS,
    362     _Py_ERROR_XMLCHARREFREPLACE,
    363     _Py_ERROR_OTHER
    364 } _Py_error_handler;
    365 
    366 static _Py_error_handler
    367 get_error_handler(const char *errors)
    368 {
    369     if (errors == NULL || strcmp(errors, "strict") == 0) {
    370         return _Py_ERROR_STRICT;
    371     }
    372     if (strcmp(errors, "surrogateescape") == 0) {
    373         return _Py_ERROR_SURROGATEESCAPE;
    374     }
    375     if (strcmp(errors, "replace") == 0) {
    376         return _Py_ERROR_REPLACE;
    377     }
    378     if (strcmp(errors, "ignore") == 0) {
    379         return _Py_ERROR_IGNORE;
    380     }
    381     if (strcmp(errors, "backslashreplace") == 0) {
    382         return _Py_ERROR_BACKSLASHREPLACE;
    383     }
    384     if (strcmp(errors, "surrogatepass") == 0) {
    385         return _Py_ERROR_SURROGATEPASS;
    386     }
    387     if (strcmp(errors, "xmlcharrefreplace") == 0) {
    388         return _Py_ERROR_XMLCHARREFREPLACE;
    389     }
    390     return _Py_ERROR_OTHER;
    391 }
    392 
    393 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
    394    This function is kept for backward compatibility with the old API. */
    395 Py_UNICODE
    396 PyUnicode_GetMax(void)
    397 {
    398 #ifdef Py_UNICODE_WIDE
    399     return 0x10FFFF;
    400 #else
    401     /* This is actually an illegal character, so it should
    402        not be passed to unichr. */
    403     return 0xFFFF;
    404 #endif
    405 }
    406 
    407 #ifdef Py_DEBUG
    408 int
    409 _PyUnicode_CheckConsistency(PyObject *op, int check_content)
    410 {
    411     PyASCIIObject *ascii;
    412     unsigned int kind;
    413 
    414     assert(PyUnicode_Check(op));
    415 
    416     ascii = (PyASCIIObject *)op;
    417     kind = ascii->state.kind;
    418 
    419     if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
    420         assert(kind == PyUnicode_1BYTE_KIND);
    421         assert(ascii->state.ready == 1);
    422     }
    423     else {
    424         PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    425         void *data;
    426 
    427         if (ascii->state.compact == 1) {
    428             data = compact + 1;
    429             assert(kind == PyUnicode_1BYTE_KIND
    430                    || kind == PyUnicode_2BYTE_KIND
    431                    || kind == PyUnicode_4BYTE_KIND);
    432             assert(ascii->state.ascii == 0);
    433             assert(ascii->state.ready == 1);
    434             assert (compact->utf8 != data);
    435         }
    436         else {
    437             PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    438 
    439             data = unicode->data.any;
    440             if (kind == PyUnicode_WCHAR_KIND) {
    441                 assert(ascii->length == 0);
    442                 assert(ascii->hash == -1);
    443                 assert(ascii->state.compact == 0);
    444                 assert(ascii->state.ascii == 0);
    445                 assert(ascii->state.ready == 0);
    446                 assert(ascii->state.interned == SSTATE_NOT_INTERNED);
    447                 assert(ascii->wstr != NULL);
    448                 assert(data == NULL);
    449                 assert(compact->utf8 == NULL);
    450             }
    451             else {
    452                 assert(kind == PyUnicode_1BYTE_KIND
    453                        || kind == PyUnicode_2BYTE_KIND
    454                        || kind == PyUnicode_4BYTE_KIND);
    455                 assert(ascii->state.compact == 0);
    456                 assert(ascii->state.ready == 1);
    457                 assert(data != NULL);
    458                 if (ascii->state.ascii) {
    459                     assert (compact->utf8 == data);
    460                     assert (compact->utf8_length == ascii->length);
    461                 }
    462                 else
    463                     assert (compact->utf8 != data);
    464             }
    465         }
    466         if (kind != PyUnicode_WCHAR_KIND) {
    467             if (
    468 #if SIZEOF_WCHAR_T == 2
    469                 kind == PyUnicode_2BYTE_KIND
    470 #else
    471                 kind == PyUnicode_4BYTE_KIND
    472 #endif
    473                )
    474             {
    475                 assert(ascii->wstr == data);
    476                 assert(compact->wstr_length == ascii->length);
    477             } else
    478                 assert(ascii->wstr != data);
    479         }
    480 
    481         if (compact->utf8 == NULL)
    482             assert(compact->utf8_length == 0);
    483         if (ascii->wstr == NULL)
    484             assert(compact->wstr_length == 0);
    485     }
    486     /* check that the best kind is used */
    487     if (check_content && kind != PyUnicode_WCHAR_KIND)
    488     {
    489         Py_ssize_t i;
    490         Py_UCS4 maxchar = 0;
    491         void *data;
    492         Py_UCS4 ch;
    493 
    494         data = PyUnicode_DATA(ascii);
    495         for (i=0; i < ascii->length; i++)
    496         {
    497             ch = PyUnicode_READ(kind, data, i);
    498             if (ch > maxchar)
    499                 maxchar = ch;
    500         }
    501         if (kind == PyUnicode_1BYTE_KIND) {
    502             if (ascii->state.ascii == 0) {
    503                 assert(maxchar >= 128);
    504                 assert(maxchar <= 255);
    505             }
    506             else
    507                 assert(maxchar < 128);
    508         }
    509         else if (kind == PyUnicode_2BYTE_KIND) {
    510             assert(maxchar >= 0x100);
    511             assert(maxchar <= 0xFFFF);
    512         }
    513         else {
    514             assert(maxchar >= 0x10000);
    515             assert(maxchar <= MAX_UNICODE);
    516         }
    517         assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    518     }
    519     return 1;
    520 }
    521 #endif
    522 
    523 static PyObject*
    524 unicode_result_wchar(PyObject *unicode)
    525 {
    526 #ifndef Py_DEBUG
    527     Py_ssize_t len;
    528 
    529     len = _PyUnicode_WSTR_LENGTH(unicode);
    530     if (len == 0) {
    531         Py_DECREF(unicode);
    532         _Py_RETURN_UNICODE_EMPTY();
    533     }
    534 
    535     if (len == 1) {
    536         wchar_t ch = _PyUnicode_WSTR(unicode)[0];
    537         if ((Py_UCS4)ch < 256) {
    538             PyObject *latin1_char = get_latin1_char((unsigned char)ch);
    539             Py_DECREF(unicode);
    540             return latin1_char;
    541         }
    542     }
    543 
    544     if (_PyUnicode_Ready(unicode) < 0) {
    545         Py_DECREF(unicode);
    546         return NULL;
    547     }
    548 #else
    549     assert(Py_REFCNT(unicode) == 1);
    550 
    551     /* don't make the result ready in debug mode to ensure that the caller
    552        makes the string ready before using it */
    553     assert(_PyUnicode_CheckConsistency(unicode, 1));
    554 #endif
    555     return unicode;
    556 }
    557 
    558 static PyObject*
    559 unicode_result_ready(PyObject *unicode)
    560 {
    561     Py_ssize_t length;
    562 
    563     length = PyUnicode_GET_LENGTH(unicode);
    564     if (length == 0) {
    565         if (unicode != unicode_empty) {
    566             Py_DECREF(unicode);
    567             _Py_RETURN_UNICODE_EMPTY();
    568         }
    569         return unicode_empty;
    570     }
    571 
    572     if (length == 1) {
    573         void *data = PyUnicode_DATA(unicode);
    574         int kind = PyUnicode_KIND(unicode);
    575         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
    576         if (ch < 256) {
    577             PyObject *latin1_char = unicode_latin1[ch];
    578             if (latin1_char != NULL) {
    579                 if (unicode != latin1_char) {
    580                     Py_INCREF(latin1_char);
    581                     Py_DECREF(unicode);
    582                 }
    583                 return latin1_char;
    584             }
    585             else {
    586                 assert(_PyUnicode_CheckConsistency(unicode, 1));
    587                 Py_INCREF(unicode);
    588                 unicode_latin1[ch] = unicode;
    589                 return unicode;
    590             }
    591         }
    592     }
    593 
    594     assert(_PyUnicode_CheckConsistency(unicode, 1));
    595     return unicode;
    596 }
    597 
    598 static PyObject*
    599 unicode_result(PyObject *unicode)
    600 {
    601     assert(_PyUnicode_CHECK(unicode));
    602     if (PyUnicode_IS_READY(unicode))
    603         return unicode_result_ready(unicode);
    604     else
    605         return unicode_result_wchar(unicode);
    606 }
    607 
    608 static PyObject*
    609 unicode_result_unchanged(PyObject *unicode)
    610 {
    611     if (PyUnicode_CheckExact(unicode)) {
    612         if (PyUnicode_READY(unicode) == -1)
    613             return NULL;
    614         Py_INCREF(unicode);
    615         return unicode;
    616     }
    617     else
    618         /* Subtype -- return genuine unicode string with the same value. */
    619         return _PyUnicode_Copy(unicode);
    620 }
    621 
    622 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
    623    ASCII, Latin1, UTF-8, etc. */
    624 static char*
    625 backslashreplace(_PyBytesWriter *writer, char *str,
    626                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
    627 {
    628     Py_ssize_t size, i;
    629     Py_UCS4 ch;
    630     enum PyUnicode_Kind kind;
    631     void *data;
    632 
    633     assert(PyUnicode_IS_READY(unicode));
    634     kind = PyUnicode_KIND(unicode);
    635     data = PyUnicode_DATA(unicode);
    636 
    637     size = 0;
    638     /* determine replacement size */
    639     for (i = collstart; i < collend; ++i) {
    640         Py_ssize_t incr;
    641 
    642         ch = PyUnicode_READ(kind, data, i);
    643         if (ch < 0x100)
    644             incr = 2+2;
    645         else if (ch < 0x10000)
    646             incr = 2+4;
    647         else {
    648             assert(ch <= MAX_UNICODE);
    649             incr = 2+8;
    650         }
    651         if (size > PY_SSIZE_T_MAX - incr) {
    652             PyErr_SetString(PyExc_OverflowError,
    653                             "encoded result is too long for a Python string");
    654             return NULL;
    655         }
    656         size += incr;
    657     }
    658 
    659     str = _PyBytesWriter_Prepare(writer, str, size);
    660     if (str == NULL)
    661         return NULL;
    662 
    663     /* generate replacement */
    664     for (i = collstart; i < collend; ++i) {
    665         ch = PyUnicode_READ(kind, data, i);
    666         *str++ = '\\';
    667         if (ch >= 0x00010000) {
    668             *str++ = 'U';
    669             *str++ = Py_hexdigits[(ch>>28)&0xf];
    670             *str++ = Py_hexdigits[(ch>>24)&0xf];
    671             *str++ = Py_hexdigits[(ch>>20)&0xf];
    672             *str++ = Py_hexdigits[(ch>>16)&0xf];
    673             *str++ = Py_hexdigits[(ch>>12)&0xf];
    674             *str++ = Py_hexdigits[(ch>>8)&0xf];
    675         }
    676         else if (ch >= 0x100) {
    677             *str++ = 'u';
    678             *str++ = Py_hexdigits[(ch>>12)&0xf];
    679             *str++ = Py_hexdigits[(ch>>8)&0xf];
    680         }
    681         else
    682             *str++ = 'x';
    683         *str++ = Py_hexdigits[(ch>>4)&0xf];
    684         *str++ = Py_hexdigits[ch&0xf];
    685     }
    686     return str;
    687 }
    688 
    689 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
    690    ASCII, Latin1, UTF-8, etc. */
    691 static char*
    692 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
    693                   PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
    694 {
    695     Py_ssize_t size, i;
    696     Py_UCS4 ch;
    697     enum PyUnicode_Kind kind;
    698     void *data;
    699 
    700     assert(PyUnicode_IS_READY(unicode));
    701     kind = PyUnicode_KIND(unicode);
    702     data = PyUnicode_DATA(unicode);
    703 
    704     size = 0;
    705     /* determine replacement size */
    706     for (i = collstart; i < collend; ++i) {
    707         Py_ssize_t incr;
    708 
    709         ch = PyUnicode_READ(kind, data, i);
    710         if (ch < 10)
    711             incr = 2+1+1;
    712         else if (ch < 100)
    713             incr = 2+2+1;
    714         else if (ch < 1000)
    715             incr = 2+3+1;
    716         else if (ch < 10000)
    717             incr = 2+4+1;
    718         else if (ch < 100000)
    719             incr = 2+5+1;
    720         else if (ch < 1000000)
    721             incr = 2+6+1;
    722         else {
    723             assert(ch <= MAX_UNICODE);
    724             incr = 2+7+1;
    725         }
    726         if (size > PY_SSIZE_T_MAX - incr) {
    727             PyErr_SetString(PyExc_OverflowError,
    728                             "encoded result is too long for a Python string");
    729             return NULL;
    730         }
    731         size += incr;
    732     }
    733 
    734     str = _PyBytesWriter_Prepare(writer, str, size);
    735     if (str == NULL)
    736         return NULL;
    737 
    738     /* generate replacement */
    739     for (i = collstart; i < collend; ++i) {
    740         str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
    741     }
    742     return str;
    743 }
    744 
    745 /* --- Bloom Filters ----------------------------------------------------- */
    746 
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used; the low-order bits of
   each Unicode character (its value modulo BLOOM_WIDTH) select the bit. */
    750 
    751 /* the linebreak mask is set up by Unicode_Init below */
    752 
    753 #if LONG_BIT >= 128
    754 #define BLOOM_WIDTH 128
    755 #elif LONG_BIT >= 64
    756 #define BLOOM_WIDTH 64
    757 #elif LONG_BIT >= 32
    758 #define BLOOM_WIDTH 32
    759 #else
    760 #error "LONG_BIT is smaller than 32"
    761 #endif
    762 
    763 #define BLOOM_MASK unsigned long
    764 
    765 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
    766 
    767 #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
    768 
    769 #define BLOOM_LINEBREAK(ch)                                             \
    770     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
    771      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
    772 
    773 static inline BLOOM_MASK
    774 make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
    775 {
    776 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
    777     do {                                               \
    778         TYPE *data = (TYPE *)PTR;                      \
    779         TYPE *end = data + LEN;                        \
    780         Py_UCS4 ch;                                    \
    781         for (; data != end; data++) {                  \
    782             ch = *data;                                \
    783             MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
    784         }                                              \
    785         break;                                         \
    786     } while (0)
    787 
    788     /* calculate simple bloom-style bitmask for a given unicode string */
    789 
    790     BLOOM_MASK mask;
    791 
    792     mask = 0;
    793     switch (kind) {
    794     case PyUnicode_1BYTE_KIND:
    795         BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
    796         break;
    797     case PyUnicode_2BYTE_KIND:
    798         BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
    799         break;
    800     case PyUnicode_4BYTE_KIND:
    801         BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
    802         break;
    803     default:
    804         Py_UNREACHABLE();
    805     }
    806     return mask;
    807 
    808 #undef BLOOM_UPDATE
    809 }
    810 
    811 static int
    812 ensure_unicode(PyObject *obj)
    813 {
    814     if (!PyUnicode_Check(obj)) {
    815         PyErr_Format(PyExc_TypeError,
    816                      "must be str, not %.100s",
    817                      Py_TYPE(obj)->tp_name);
    818         return -1;
    819     }
    820     return PyUnicode_READY(obj);
    821 }
    822 
    823 /* Compilation of templated routines */
    824 
    825 #include "stringlib/asciilib.h"
    826 #include "stringlib/fastsearch.h"
    827 #include "stringlib/partition.h"
    828 #include "stringlib/split.h"
    829 #include "stringlib/count.h"
    830 #include "stringlib/find.h"
    831 #include "stringlib/find_max_char.h"
    832 #include "stringlib/undef.h"
    833 
    834 #include "stringlib/ucs1lib.h"
    835 #include "stringlib/fastsearch.h"
    836 #include "stringlib/partition.h"
    837 #include "stringlib/split.h"
    838 #include "stringlib/count.h"
    839 #include "stringlib/find.h"
    840 #include "stringlib/replace.h"
    841 #include "stringlib/find_max_char.h"
    842 #include "stringlib/undef.h"
    843 
    844 #include "stringlib/ucs2lib.h"
    845 #include "stringlib/fastsearch.h"
    846 #include "stringlib/partition.h"
    847 #include "stringlib/split.h"
    848 #include "stringlib/count.h"
    849 #include "stringlib/find.h"
    850 #include "stringlib/replace.h"
    851 #include "stringlib/find_max_char.h"
    852 #include "stringlib/undef.h"
    853 
    854 #include "stringlib/ucs4lib.h"
    855 #include "stringlib/fastsearch.h"
    856 #include "stringlib/partition.h"
    857 #include "stringlib/split.h"
    858 #include "stringlib/count.h"
    859 #include "stringlib/find.h"
    860 #include "stringlib/replace.h"
    861 #include "stringlib/find_max_char.h"
    862 #include "stringlib/undef.h"
    863 
    864 #include "stringlib/unicodedefs.h"
    865 #include "stringlib/fastsearch.h"
    866 #include "stringlib/count.h"
    867 #include "stringlib/find.h"
    868 #include "stringlib/undef.h"
    869 
    870 /* --- Unicode Object ----------------------------------------------------- */
    871 
    872 static inline Py_ssize_t
    873 findchar(const void *s, int kind,
    874          Py_ssize_t size, Py_UCS4 ch,
    875          int direction)
    876 {
    877     switch (kind) {
    878     case PyUnicode_1BYTE_KIND:
    879         if ((Py_UCS1) ch != ch)
    880             return -1;
    881         if (direction > 0)
    882             return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
    883         else
    884             return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
    885     case PyUnicode_2BYTE_KIND:
    886         if ((Py_UCS2) ch != ch)
    887             return -1;
    888         if (direction > 0)
    889             return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
    890         else
    891             return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
    892     case PyUnicode_4BYTE_KIND:
    893         if (direction > 0)
    894             return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
    895         else
    896             return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
    897     default:
    898         Py_UNREACHABLE();
    899     }
    900 }
    901 
    902 #ifdef Py_DEBUG
    903 /* Fill the data of a Unicode string with invalid characters to detect bugs
    904    earlier.
    905 
    906    _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
    907    ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
    908    invalid character in Unicode 6.0. */
    909 static void
    910 unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
    911 {
    912     int kind = PyUnicode_KIND(unicode);
    913     Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
    914     Py_ssize_t length = _PyUnicode_LENGTH(unicode);
    915     if (length <= old_length)
    916         return;
    917     memset(data + old_length * kind, 0xff, (length - old_length) * kind);
    918 }
    919 #endif
    920 
    921 static PyObject*
    922 resize_compact(PyObject *unicode, Py_ssize_t length)
    923 {
    924     Py_ssize_t char_size;
    925     Py_ssize_t struct_size;
    926     Py_ssize_t new_size;
    927     int share_wstr;
    928     PyObject *new_unicode;
    929 #ifdef Py_DEBUG
    930     Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
    931 #endif
    932 
    933     assert(unicode_modifiable(unicode));
    934     assert(PyUnicode_IS_READY(unicode));
    935     assert(PyUnicode_IS_COMPACT(unicode));
    936 
    937     char_size = PyUnicode_KIND(unicode);
    938     if (PyUnicode_IS_ASCII(unicode))
    939         struct_size = sizeof(PyASCIIObject);
    940     else
    941         struct_size = sizeof(PyCompactUnicodeObject);
    942     share_wstr = _PyUnicode_SHARE_WSTR(unicode);
    943 
    944     if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
    945         PyErr_NoMemory();
    946         return NULL;
    947     }
    948     new_size = (struct_size + (length + 1) * char_size);
    949 
    950     if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
    951         PyObject_DEL(_PyUnicode_UTF8(unicode));
    952         _PyUnicode_UTF8(unicode) = NULL;
    953         _PyUnicode_UTF8_LENGTH(unicode) = 0;
    954     }
    955     _Py_DEC_REFTOTAL;
    956     _Py_ForgetReference(unicode);
    957 
    958     new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
    959     if (new_unicode == NULL) {
    960         _Py_NewReference(unicode);
    961         PyErr_NoMemory();
    962         return NULL;
    963     }
    964     unicode = new_unicode;
    965     _Py_NewReference(unicode);
    966 
    967     _PyUnicode_LENGTH(unicode) = length;
    968     if (share_wstr) {
    969         _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
    970         if (!PyUnicode_IS_ASCII(unicode))
    971             _PyUnicode_WSTR_LENGTH(unicode) = length;
    972     }
    973     else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
    974         PyObject_DEL(_PyUnicode_WSTR(unicode));
    975         _PyUnicode_WSTR(unicode) = NULL;
    976         if (!PyUnicode_IS_ASCII(unicode))
    977             _PyUnicode_WSTR_LENGTH(unicode) = 0;
    978     }
    979 #ifdef Py_DEBUG
    980     unicode_fill_invalid(unicode, old_length);
    981 #endif
    982     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
    983                     length, 0);
    984     assert(_PyUnicode_CheckConsistency(unicode, 0));
    985     return unicode;
    986 }
    987 
    988 static int
    989 resize_inplace(PyObject *unicode, Py_ssize_t length)
    990 {
    991     wchar_t *wstr;
    992     Py_ssize_t new_size;
    993     assert(!PyUnicode_IS_COMPACT(unicode));
    994     assert(Py_REFCNT(unicode) == 1);
    995 
    996     if (PyUnicode_IS_READY(unicode)) {
    997         Py_ssize_t char_size;
    998         int share_wstr, share_utf8;
    999         void *data;
   1000 #ifdef Py_DEBUG
   1001         Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
   1002 #endif
   1003 
   1004         data = _PyUnicode_DATA_ANY(unicode);
   1005         char_size = PyUnicode_KIND(unicode);
   1006         share_wstr = _PyUnicode_SHARE_WSTR(unicode);
   1007         share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
   1008 
   1009         if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
   1010             PyErr_NoMemory();
   1011             return -1;
   1012         }
   1013         new_size = (length + 1) * char_size;
   1014 
   1015         if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
   1016         {
   1017             PyObject_DEL(_PyUnicode_UTF8(unicode));
   1018             _PyUnicode_UTF8(unicode) = NULL;
   1019             _PyUnicode_UTF8_LENGTH(unicode) = 0;
   1020         }
   1021 
   1022         data = (PyObject *)PyObject_REALLOC(data, new_size);
   1023         if (data == NULL) {
   1024             PyErr_NoMemory();
   1025             return -1;
   1026         }
   1027         _PyUnicode_DATA_ANY(unicode) = data;
   1028         if (share_wstr) {
   1029             _PyUnicode_WSTR(unicode) = data;
   1030             _PyUnicode_WSTR_LENGTH(unicode) = length;
   1031         }
   1032         if (share_utf8) {
   1033             _PyUnicode_UTF8(unicode) = data;
   1034             _PyUnicode_UTF8_LENGTH(unicode) = length;
   1035         }
   1036         _PyUnicode_LENGTH(unicode) = length;
   1037         PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
   1038 #ifdef Py_DEBUG
   1039         unicode_fill_invalid(unicode, old_length);
   1040 #endif
   1041         if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
   1042             assert(_PyUnicode_CheckConsistency(unicode, 0));
   1043             return 0;
   1044         }
   1045     }
   1046     assert(_PyUnicode_WSTR(unicode) != NULL);
   1047 
   1048     /* check for integer overflow */
   1049     if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
   1050         PyErr_NoMemory();
   1051         return -1;
   1052     }
   1053     new_size = sizeof(wchar_t) * (length + 1);
   1054     wstr =  _PyUnicode_WSTR(unicode);
   1055     wstr = PyObject_REALLOC(wstr, new_size);
   1056     if (!wstr) {
   1057         PyErr_NoMemory();
   1058         return -1;
   1059     }
   1060     _PyUnicode_WSTR(unicode) = wstr;
   1061     _PyUnicode_WSTR(unicode)[length] = 0;
   1062     _PyUnicode_WSTR_LENGTH(unicode) = length;
   1063     assert(_PyUnicode_CheckConsistency(unicode, 0));
   1064     return 0;
   1065 }
   1066 
   1067 static PyObject*
   1068 resize_copy(PyObject *unicode, Py_ssize_t length)
   1069 {
   1070     Py_ssize_t copy_length;
   1071     if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
   1072         PyObject *copy;
   1073 
   1074         assert(PyUnicode_IS_READY(unicode));
   1075 
   1076         copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
   1077         if (copy == NULL)
   1078             return NULL;
   1079 
   1080         copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
   1081         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
   1082         return copy;
   1083     }
   1084     else {
   1085         PyObject *w;
   1086 
   1087         w = (PyObject*)_PyUnicode_New(length);
   1088         if (w == NULL)
   1089             return NULL;
   1090         copy_length = _PyUnicode_WSTR_LENGTH(unicode);
   1091         copy_length = Py_MIN(copy_length, length);
   1092         memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
   1093                   copy_length * sizeof(wchar_t));
   1094         return w;
   1095     }
   1096 }
   1097 
   1098 /* We allocate one more byte to make sure the string is
   1099    Ux0000 terminated; some code (e.g. new_identifier)
   1100    relies on that.
   1101 
   1102    XXX This allocator could further be enhanced by assuring that the
   1103    free list never reduces its size below 1.
   1104 
   1105 */
   1106 
   1107 static PyUnicodeObject *
   1108 _PyUnicode_New(Py_ssize_t length)
   1109 {
   1110     PyUnicodeObject *unicode;
   1111     size_t new_size;
   1112 
   1113     /* Optimization for empty strings */
   1114     if (length == 0 && unicode_empty != NULL) {
   1115         Py_INCREF(unicode_empty);
   1116         return (PyUnicodeObject*)unicode_empty;
   1117     }
   1118 
   1119     /* Ensure we won't overflow the size. */
   1120     if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
   1121         return (PyUnicodeObject *)PyErr_NoMemory();
   1122     }
   1123     if (length < 0) {
   1124         PyErr_SetString(PyExc_SystemError,
   1125                         "Negative size passed to _PyUnicode_New");
   1126         return NULL;
   1127     }
   1128 
   1129     unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
   1130     if (unicode == NULL)
   1131         return NULL;
   1132     new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
   1133 
   1134     _PyUnicode_WSTR_LENGTH(unicode) = length;
   1135     _PyUnicode_HASH(unicode) = -1;
   1136     _PyUnicode_STATE(unicode).interned = 0;
   1137     _PyUnicode_STATE(unicode).kind = 0;
   1138     _PyUnicode_STATE(unicode).compact = 0;
   1139     _PyUnicode_STATE(unicode).ready = 0;
   1140     _PyUnicode_STATE(unicode).ascii = 0;
   1141     _PyUnicode_DATA_ANY(unicode) = NULL;
   1142     _PyUnicode_LENGTH(unicode) = 0;
   1143     _PyUnicode_UTF8(unicode) = NULL;
   1144     _PyUnicode_UTF8_LENGTH(unicode) = 0;
   1145 
   1146     _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
   1147     if (!_PyUnicode_WSTR(unicode)) {
   1148         Py_DECREF(unicode);
   1149         PyErr_NoMemory();
   1150         return NULL;
   1151     }
   1152 
   1153     /* Initialize the first element to guard against cases where
   1154      * the caller fails before initializing str -- unicode_resize()
   1155      * reads str[0], and the Keep-Alive optimization can keep memory
   1156      * allocated for str alive across a call to unicode_dealloc(unicode).
   1157      * We don't want unicode_resize to read uninitialized memory in
   1158      * that case.
   1159      */
   1160     _PyUnicode_WSTR(unicode)[0] = 0;
   1161     _PyUnicode_WSTR(unicode)[length] = 0;
   1162 
   1163     assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
   1164     return unicode;
   1165 }
   1166 
   1167 static const char*
   1168 unicode_kind_name(PyObject *unicode)
   1169 {
   1170     /* don't check consistency: unicode_kind_name() is called from
   1171        _PyUnicode_Dump() */
   1172     if (!PyUnicode_IS_COMPACT(unicode))
   1173     {
   1174         if (!PyUnicode_IS_READY(unicode))
   1175             return "wstr";
   1176         switch (PyUnicode_KIND(unicode))
   1177         {
   1178         case PyUnicode_1BYTE_KIND:
   1179             if (PyUnicode_IS_ASCII(unicode))
   1180                 return "legacy ascii";
   1181             else
   1182                 return "legacy latin1";
   1183         case PyUnicode_2BYTE_KIND:
   1184             return "legacy UCS2";
   1185         case PyUnicode_4BYTE_KIND:
   1186             return "legacy UCS4";
   1187         default:
   1188             return "<legacy invalid kind>";
   1189         }
   1190     }
   1191     assert(PyUnicode_IS_READY(unicode));
   1192     switch (PyUnicode_KIND(unicode)) {
   1193     case PyUnicode_1BYTE_KIND:
   1194         if (PyUnicode_IS_ASCII(unicode))
   1195             return "ascii";
   1196         else
   1197             return "latin1";
   1198     case PyUnicode_2BYTE_KIND:
   1199         return "UCS2";
   1200     case PyUnicode_4BYTE_KIND:
   1201         return "UCS4";
   1202     default:
   1203         return "<invalid compact kind>";
   1204     }
   1205 }
   1206 
   1207 #ifdef Py_DEBUG
   1208 /* Functions wrapping macros for use in debugger */
   1209 char *_PyUnicode_utf8(void *unicode){
   1210     return PyUnicode_UTF8(unicode);
   1211 }
   1212 
   1213 void *_PyUnicode_compact_data(void *unicode) {
   1214     return _PyUnicode_COMPACT_DATA(unicode);
   1215 }
   1216 void *_PyUnicode_data(void *unicode){
   1217     printf("obj %p\n", unicode);
   1218     printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
   1219     printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
   1220     printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
   1221     printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
   1222     printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
   1223     return PyUnicode_DATA(unicode);
   1224 }
   1225 
   1226 void
   1227 _PyUnicode_Dump(PyObject *op)
   1228 {
   1229     PyASCIIObject *ascii = (PyASCIIObject *)op;
   1230     PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
   1231     PyUnicodeObject *unicode = (PyUnicodeObject *)op;
   1232     void *data;
   1233 
   1234     if (ascii->state.compact)
   1235     {
   1236         if (ascii->state.ascii)
   1237             data = (ascii + 1);
   1238         else
   1239             data = (compact + 1);
   1240     }
   1241     else
   1242         data = unicode->data.any;
   1243     printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
   1244            unicode_kind_name(op), ascii->length);
   1245 
   1246     if (ascii->wstr == data)
   1247         printf("shared ");
   1248     printf("wstr=%p", ascii->wstr);
   1249 
   1250     if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
   1251         printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
   1252         if (!ascii->state.compact && compact->utf8 == unicode->data.any)
   1253             printf("shared ");
   1254         printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
   1255                compact->utf8, compact->utf8_length);
   1256     }
   1257     printf(", data=%p\n", data);
   1258 }
   1259 #endif
   1260 
   1261 PyObject *
   1262 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
   1263 {
   1264     PyObject *obj;
   1265     PyCompactUnicodeObject *unicode;
   1266     void *data;
   1267     enum PyUnicode_Kind kind;
   1268     int is_sharing, is_ascii;
   1269     Py_ssize_t char_size;
   1270     Py_ssize_t struct_size;
   1271 
   1272     /* Optimization for empty strings */
   1273     if (size == 0 && unicode_empty != NULL) {
   1274         Py_INCREF(unicode_empty);
   1275         return unicode_empty;
   1276     }
   1277 
   1278     is_ascii = 0;
   1279     is_sharing = 0;
   1280     struct_size = sizeof(PyCompactUnicodeObject);
   1281     if (maxchar < 128) {
   1282         kind = PyUnicode_1BYTE_KIND;
   1283         char_size = 1;
   1284         is_ascii = 1;
   1285         struct_size = sizeof(PyASCIIObject);
   1286     }
   1287     else if (maxchar < 256) {
   1288         kind = PyUnicode_1BYTE_KIND;
   1289         char_size = 1;
   1290     }
   1291     else if (maxchar < 65536) {
   1292         kind = PyUnicode_2BYTE_KIND;
   1293         char_size = 2;
   1294         if (sizeof(wchar_t) == 2)
   1295             is_sharing = 1;
   1296     }
   1297     else {
   1298         if (maxchar > MAX_UNICODE) {
   1299             PyErr_SetString(PyExc_SystemError,
   1300                             "invalid maximum character passed to PyUnicode_New");
   1301             return NULL;
   1302         }
   1303         kind = PyUnicode_4BYTE_KIND;
   1304         char_size = 4;
   1305         if (sizeof(wchar_t) == 4)
   1306             is_sharing = 1;
   1307     }
   1308 
   1309     /* Ensure we won't overflow the size. */
   1310     if (size < 0) {
   1311         PyErr_SetString(PyExc_SystemError,
   1312                         "Negative size passed to PyUnicode_New");
   1313         return NULL;
   1314     }
   1315     if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
   1316         return PyErr_NoMemory();
   1317 
   1318     /* Duplicated allocation code from _PyObject_New() instead of a call to
   1319      * PyObject_New() so we are able to allocate space for the object and
   1320      * it's data buffer.
   1321      */
   1322     obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
   1323     if (obj == NULL)
   1324         return PyErr_NoMemory();
   1325     obj = PyObject_INIT(obj, &PyUnicode_Type);
   1326     if (obj == NULL)
   1327         return NULL;
   1328 
   1329     unicode = (PyCompactUnicodeObject *)obj;
   1330     if (is_ascii)
   1331         data = ((PyASCIIObject*)obj) + 1;
   1332     else
   1333         data = unicode + 1;
   1334     _PyUnicode_LENGTH(unicode) = size;
   1335     _PyUnicode_HASH(unicode) = -1;
   1336     _PyUnicode_STATE(unicode).interned = 0;
   1337     _PyUnicode_STATE(unicode).kind = kind;
   1338     _PyUnicode_STATE(unicode).compact = 1;
   1339     _PyUnicode_STATE(unicode).ready = 1;
   1340     _PyUnicode_STATE(unicode).ascii = is_ascii;
   1341     if (is_ascii) {
   1342         ((char*)data)[size] = 0;
   1343         _PyUnicode_WSTR(unicode) = NULL;
   1344     }
   1345     else if (kind == PyUnicode_1BYTE_KIND) {
   1346         ((char*)data)[size] = 0;
   1347         _PyUnicode_WSTR(unicode) = NULL;
   1348         _PyUnicode_WSTR_LENGTH(unicode) = 0;
   1349         unicode->utf8 = NULL;
   1350         unicode->utf8_length = 0;
   1351     }
   1352     else {
   1353         unicode->utf8 = NULL;
   1354         unicode->utf8_length = 0;
   1355         if (kind == PyUnicode_2BYTE_KIND)
   1356             ((Py_UCS2*)data)[size] = 0;
   1357         else /* kind == PyUnicode_4BYTE_KIND */
   1358             ((Py_UCS4*)data)[size] = 0;
   1359         if (is_sharing) {
   1360             _PyUnicode_WSTR_LENGTH(unicode) = size;
   1361             _PyUnicode_WSTR(unicode) = (wchar_t *)data;
   1362         }
   1363         else {
   1364             _PyUnicode_WSTR_LENGTH(unicode) = 0;
   1365             _PyUnicode_WSTR(unicode) = NULL;
   1366         }
   1367     }
   1368 #ifdef Py_DEBUG
   1369     unicode_fill_invalid((PyObject*)unicode, 0);
   1370 #endif
   1371     assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
   1372     return obj;
   1373 }
   1374 
   1375 #if SIZEOF_WCHAR_T == 2
   1376 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   1377    will decode surrogate pairs, the other conversions are implemented as macros
   1378    for efficiency.
   1379 
   1380    This function assumes that unicode can hold one more code point than wstr
   1381    characters for a terminating null character. */
   1382 static void
   1383 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
   1384                               PyObject *unicode)
   1385 {
   1386     const wchar_t *iter;
   1387     Py_UCS4 *ucs4_out;
   1388 
   1389     assert(unicode != NULL);
   1390     assert(_PyUnicode_CHECK(unicode));
   1391     assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
   1392     ucs4_out = PyUnicode_4BYTE_DATA(unicode);
   1393 
   1394     for (iter = begin; iter < end; ) {
   1395         assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
   1396                            _PyUnicode_GET_LENGTH(unicode)));
   1397         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
   1398             && (iter+1) < end
   1399             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
   1400         {
   1401             *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
   1402             iter += 2;
   1403         }
   1404         else {
   1405             *ucs4_out++ = *iter;
   1406             iter++;
   1407         }
   1408     }
   1409     assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
   1410                         _PyUnicode_GET_LENGTH(unicode)));
   1411 
   1412 }
   1413 #endif
   1414 
   1415 static int
   1416 unicode_check_modifiable(PyObject *unicode)
   1417 {
   1418     if (!unicode_modifiable(unicode)) {
   1419         PyErr_SetString(PyExc_SystemError,
   1420                         "Cannot modify a string currently used");
   1421         return -1;
   1422     }
   1423     return 0;
   1424 }
   1425 
   1426 static int
   1427 _copy_characters(PyObject *to, Py_ssize_t to_start,
   1428                  PyObject *from, Py_ssize_t from_start,
   1429                  Py_ssize_t how_many, int check_maxchar)
   1430 {
   1431     unsigned int from_kind, to_kind;
   1432     void *from_data, *to_data;
   1433 
   1434     assert(0 <= how_many);
   1435     assert(0 <= from_start);
   1436     assert(0 <= to_start);
   1437     assert(PyUnicode_Check(from));
   1438     assert(PyUnicode_IS_READY(from));
   1439     assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
   1440 
   1441     assert(PyUnicode_Check(to));
   1442     assert(PyUnicode_IS_READY(to));
   1443     assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
   1444 
   1445     if (how_many == 0)
   1446         return 0;
   1447 
   1448     from_kind = PyUnicode_KIND(from);
   1449     from_data = PyUnicode_DATA(from);
   1450     to_kind = PyUnicode_KIND(to);
   1451     to_data = PyUnicode_DATA(to);
   1452 
   1453 #ifdef Py_DEBUG
   1454     if (!check_maxchar
   1455         && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
   1456     {
   1457         const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
   1458         Py_UCS4 ch;
   1459         Py_ssize_t i;
   1460         for (i=0; i < how_many; i++) {
   1461             ch = PyUnicode_READ(from_kind, from_data, from_start + i);
   1462             assert(ch <= to_maxchar);
   1463         }
   1464     }
   1465 #endif
   1466 
   1467     if (from_kind == to_kind) {
   1468         if (check_maxchar
   1469             && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
   1470         {
   1471             /* Writing Latin-1 characters into an ASCII string requires to
   1472                check that all written characters are pure ASCII */
   1473             Py_UCS4 max_char;
   1474             max_char = ucs1lib_find_max_char(from_data,
   1475                                              (Py_UCS1*)from_data + how_many);
   1476             if (max_char >= 128)
   1477                 return -1;
   1478         }
   1479         memcpy((char*)to_data + to_kind * to_start,
   1480                   (char*)from_data + from_kind * from_start,
   1481                   to_kind * how_many);
   1482     }
   1483     else if (from_kind == PyUnicode_1BYTE_KIND
   1484              && to_kind == PyUnicode_2BYTE_KIND)
   1485     {
   1486         _PyUnicode_CONVERT_BYTES(
   1487             Py_UCS1, Py_UCS2,
   1488             PyUnicode_1BYTE_DATA(from) + from_start,
   1489             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
   1490             PyUnicode_2BYTE_DATA(to) + to_start
   1491             );
   1492     }
   1493     else if (from_kind == PyUnicode_1BYTE_KIND
   1494              && to_kind == PyUnicode_4BYTE_KIND)
   1495     {
   1496         _PyUnicode_CONVERT_BYTES(
   1497             Py_UCS1, Py_UCS4,
   1498             PyUnicode_1BYTE_DATA(from) + from_start,
   1499             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
   1500             PyUnicode_4BYTE_DATA(to) + to_start
   1501             );
   1502     }
   1503     else if (from_kind == PyUnicode_2BYTE_KIND
   1504              && to_kind == PyUnicode_4BYTE_KIND)
   1505     {
   1506         _PyUnicode_CONVERT_BYTES(
   1507             Py_UCS2, Py_UCS4,
   1508             PyUnicode_2BYTE_DATA(from) + from_start,
   1509             PyUnicode_2BYTE_DATA(from) + from_start + how_many,
   1510             PyUnicode_4BYTE_DATA(to) + to_start
   1511             );
   1512     }
   1513     else {
   1514         assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
   1515 
   1516         if (!check_maxchar) {
   1517             if (from_kind == PyUnicode_2BYTE_KIND
   1518                 && to_kind == PyUnicode_1BYTE_KIND)
   1519             {
   1520                 _PyUnicode_CONVERT_BYTES(
   1521                     Py_UCS2, Py_UCS1,
   1522                     PyUnicode_2BYTE_DATA(from) + from_start,
   1523                     PyUnicode_2BYTE_DATA(from) + from_start + how_many,
   1524                     PyUnicode_1BYTE_DATA(to) + to_start
   1525                     );
   1526             }
   1527             else if (from_kind == PyUnicode_4BYTE_KIND
   1528                      && to_kind == PyUnicode_1BYTE_KIND)
   1529             {
   1530                 _PyUnicode_CONVERT_BYTES(
   1531                     Py_UCS4, Py_UCS1,
   1532                     PyUnicode_4BYTE_DATA(from) + from_start,
   1533                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
   1534                     PyUnicode_1BYTE_DATA(to) + to_start
   1535                     );
   1536             }
   1537             else if (from_kind == PyUnicode_4BYTE_KIND
   1538                      && to_kind == PyUnicode_2BYTE_KIND)
   1539             {
   1540                 _PyUnicode_CONVERT_BYTES(
   1541                     Py_UCS4, Py_UCS2,
   1542                     PyUnicode_4BYTE_DATA(from) + from_start,
   1543                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
   1544                     PyUnicode_2BYTE_DATA(to) + to_start
   1545                     );
   1546             }
   1547             else {
   1548                 Py_UNREACHABLE();
   1549             }
   1550         }
   1551         else {
   1552             const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
   1553             Py_UCS4 ch;
   1554             Py_ssize_t i;
   1555 
   1556             for (i=0; i < how_many; i++) {
   1557                 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
   1558                 if (ch > to_maxchar)
   1559                     return -1;
   1560                 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
   1561             }
   1562         }
   1563     }
   1564     return 0;
   1565 }
   1566 
   1567 void
   1568 _PyUnicode_FastCopyCharacters(
   1569     PyObject *to, Py_ssize_t to_start,
   1570     PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
   1571 {
   1572     (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
   1573 }
   1574 
   1575 Py_ssize_t
   1576 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
   1577                          PyObject *from, Py_ssize_t from_start,
   1578                          Py_ssize_t how_many)
   1579 {
   1580     int err;
   1581 
   1582     if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
   1583         PyErr_BadInternalCall();
   1584         return -1;
   1585     }
   1586 
   1587     if (PyUnicode_READY(from) == -1)
   1588         return -1;
   1589     if (PyUnicode_READY(to) == -1)
   1590         return -1;
   1591 
   1592     if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
   1593         PyErr_SetString(PyExc_IndexError, "string index out of range");
   1594         return -1;
   1595     }
   1596     if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
   1597         PyErr_SetString(PyExc_IndexError, "string index out of range");
   1598         return -1;
   1599     }
   1600     if (how_many < 0) {
   1601         PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
   1602         return -1;
   1603     }
   1604     how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
   1605     if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
   1606         PyErr_Format(PyExc_SystemError,
   1607                      "Cannot write %zi characters at %zi "
   1608                      "in a string of %zi characters",
   1609                      how_many, to_start, PyUnicode_GET_LENGTH(to));
   1610         return -1;
   1611     }
   1612 
   1613     if (how_many == 0)
   1614         return 0;
   1615 
   1616     if (unicode_check_modifiable(to))
   1617         return -1;
   1618 
   1619     err = _copy_characters(to, to_start, from, from_start, how_many, 1);
   1620     if (err) {
   1621         PyErr_Format(PyExc_SystemError,
   1622                      "Cannot copy %s characters "
   1623                      "into a string of %s characters",
   1624                      unicode_kind_name(from),
   1625                      unicode_kind_name(to));
   1626         return -1;
   1627     }
   1628     return how_many;
   1629 }
   1630 
   1631 /* Find the maximum code point and count the number of surrogate pairs so a
   1632    correct string length can be computed before converting a string to UCS4.
   1633    This function counts single surrogates as a character and not as a pair.
   1634 
   1635    Return 0 on success, or -1 on error. */
   1636 static int
   1637 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
   1638                         Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
   1639 {
   1640     const wchar_t *iter;
   1641     Py_UCS4 ch;
   1642 
   1643     assert(num_surrogates != NULL && maxchar != NULL);
   1644     *num_surrogates = 0;
   1645     *maxchar = 0;
   1646 
   1647     for (iter = begin; iter < end; ) {
   1648 #if SIZEOF_WCHAR_T == 2
   1649         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
   1650             && (iter+1) < end
   1651             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
   1652         {
   1653             ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
   1654             ++(*num_surrogates);
   1655             iter += 2;
   1656         }
   1657         else
   1658 #endif
   1659         {
   1660             ch = *iter;
   1661             iter++;
   1662         }
   1663         if (ch > *maxchar) {
   1664             *maxchar = ch;
   1665             if (*maxchar > MAX_UNICODE) {
   1666                 PyErr_Format(PyExc_ValueError,
   1667                              "character U+%x is not in range [U+0000; U+10ffff]",
   1668                              ch);
   1669                 return -1;
   1670             }
   1671         }
   1672     }
   1673     return 0;
   1674 }
   1675 
   1676 int
   1677 _PyUnicode_Ready(PyObject *unicode)
   1678 {
   1679     wchar_t *end;
   1680     Py_UCS4 maxchar = 0;
   1681     Py_ssize_t num_surrogates;
   1682 #if SIZEOF_WCHAR_T == 2
   1683     Py_ssize_t length_wo_surrogates;
   1684 #endif
   1685 
   1686     /* _PyUnicode_Ready() is only intended for old-style API usage where
   1687        strings were created using _PyObject_New() and where no canonical
   1688        representation (the str field) has been set yet aka strings
   1689        which are not yet ready. */
   1690     assert(_PyUnicode_CHECK(unicode));
   1691     assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
   1692     assert(_PyUnicode_WSTR(unicode) != NULL);
   1693     assert(_PyUnicode_DATA_ANY(unicode) == NULL);
   1694     assert(_PyUnicode_UTF8(unicode) == NULL);
   1695     /* Actually, it should neither be interned nor be anything else: */
   1696     assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
   1697 
   1698     end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
   1699     if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
   1700                                 &maxchar, &num_surrogates) == -1)
   1701         return -1;
   1702 
   1703     if (maxchar < 256) {
   1704         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
   1705         if (!_PyUnicode_DATA_ANY(unicode)) {
   1706             PyErr_NoMemory();
   1707             return -1;
   1708         }
   1709         _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
   1710                                 _PyUnicode_WSTR(unicode), end,
   1711                                 PyUnicode_1BYTE_DATA(unicode));
   1712         PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
   1713         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
   1714         _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
   1715         if (maxchar < 128) {
   1716             _PyUnicode_STATE(unicode).ascii = 1;
   1717             _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
   1718             _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
   1719         }
   1720         else {
   1721             _PyUnicode_STATE(unicode).ascii = 0;
   1722             _PyUnicode_UTF8(unicode) = NULL;
   1723             _PyUnicode_UTF8_LENGTH(unicode) = 0;
   1724         }
   1725         PyObject_FREE(_PyUnicode_WSTR(unicode));
   1726         _PyUnicode_WSTR(unicode) = NULL;
   1727         _PyUnicode_WSTR_LENGTH(unicode) = 0;
   1728     }
   1729     /* In this case we might have to convert down from 4-byte native
   1730        wchar_t to 2-byte unicode. */
   1731     else if (maxchar < 65536) {
   1732         assert(num_surrogates == 0 &&
   1733                "FindMaxCharAndNumSurrogatePairs() messed up");
   1734 
   1735 #if SIZEOF_WCHAR_T == 2
   1736         /* We can share representations and are done. */
   1737         _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
   1738         PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
   1739         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
   1740         _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
   1741         _PyUnicode_UTF8(unicode) = NULL;
   1742         _PyUnicode_UTF8_LENGTH(unicode) = 0;
   1743 #else
   1744         /* sizeof(wchar_t) == 4 */
   1745         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
   1746             2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
   1747         if (!_PyUnicode_DATA_ANY(unicode)) {
   1748             PyErr_NoMemory();
   1749             return -1;
   1750         }
   1751         _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
   1752                                 _PyUnicode_WSTR(unicode), end,
   1753                                 PyUnicode_2BYTE_DATA(unicode));
   1754         PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
   1755         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
   1756         _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
   1757         _PyUnicode_UTF8(unicode) = NULL;
   1758         _PyUnicode_UTF8_LENGTH(unicode) = 0;
   1759         PyObject_FREE(_PyUnicode_WSTR(unicode));
   1760         _PyUnicode_WSTR(unicode) = NULL;
   1761         _PyUnicode_WSTR_LENGTH(unicode) = 0;
   1762 #endif
   1763     }
   1764     /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
   1765     else {
   1766 #if SIZEOF_WCHAR_T == 2
   1767         /* in case the native representation is 2-bytes, we need to allocate a
   1768            new normalized 4-byte version. */
   1769         length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
   1770         if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
   1771             PyErr_NoMemory();
   1772             return -1;
   1773         }
   1774         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
   1775         if (!_PyUnicode_DATA_ANY(unicode)) {
   1776             PyErr_NoMemory();
   1777             return -1;
   1778         }
   1779         _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
   1780         _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
   1781         _PyUnicode_UTF8(unicode) = NULL;
   1782         _PyUnicode_UTF8_LENGTH(unicode) = 0;
   1783         /* unicode_convert_wchar_to_ucs4() requires a ready string */
   1784         _PyUnicode_STATE(unicode).ready = 1;
   1785         unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
   1786         PyObject_FREE(_PyUnicode_WSTR(unicode));
   1787         _PyUnicode_WSTR(unicode) = NULL;
   1788         _PyUnicode_WSTR_LENGTH(unicode) = 0;
   1789 #else
   1790         assert(num_surrogates == 0);
   1791 
   1792         _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
   1793         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
   1794         _PyUnicode_UTF8(unicode) = NULL;
   1795         _PyUnicode_UTF8_LENGTH(unicode) = 0;
   1796         _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
   1797 #endif
   1798         PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
   1799     }
   1800     _PyUnicode_STATE(unicode).ready = 1;
   1801     assert(_PyUnicode_CheckConsistency(unicode, 1));
   1802     return 0;
   1803 }
   1804 
   1805 static void
   1806 unicode_dealloc(PyObject *unicode)
   1807 {
   1808     switch (PyUnicode_CHECK_INTERNED(unicode)) {
   1809     case SSTATE_NOT_INTERNED:
   1810         break;
   1811 
   1812     case SSTATE_INTERNED_MORTAL:
   1813         /* revive dead object temporarily for DelItem */
   1814         Py_REFCNT(unicode) = 3;
   1815         if (PyDict_DelItem(interned, unicode) != 0)
   1816             Py_FatalError(
   1817                 "deletion of interned string failed");
   1818         break;
   1819 
   1820     case SSTATE_INTERNED_IMMORTAL:
   1821         Py_FatalError("Immortal interned string died.");
   1822         /* fall through */
   1823 
   1824     default:
   1825         Py_FatalError("Inconsistent interned string state.");
   1826     }
   1827 
   1828     if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
   1829         PyObject_DEL(_PyUnicode_WSTR(unicode));
   1830     if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
   1831         PyObject_DEL(_PyUnicode_UTF8(unicode));
   1832     if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
   1833         PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
   1834 
   1835     Py_TYPE(unicode)->tp_free(unicode);
   1836 }
   1837 
   1838 #ifdef Py_DEBUG
   1839 static int
   1840 unicode_is_singleton(PyObject *unicode)
   1841 {
   1842     PyASCIIObject *ascii = (PyASCIIObject *)unicode;
   1843     if (unicode == unicode_empty)
   1844         return 1;
   1845     if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
   1846     {
   1847         Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
   1848         if (ch < 256 && unicode_latin1[ch] == unicode)
   1849             return 1;
   1850     }
   1851     return 0;
   1852 }
   1853 #endif
   1854 
   1855 static int
   1856 unicode_modifiable(PyObject *unicode)
   1857 {
   1858     assert(_PyUnicode_CHECK(unicode));
   1859     if (Py_REFCNT(unicode) != 1)
   1860         return 0;
   1861     if (_PyUnicode_HASH(unicode) != -1)
   1862         return 0;
   1863     if (PyUnicode_CHECK_INTERNED(unicode))
   1864         return 0;
   1865     if (!PyUnicode_CheckExact(unicode))
   1866         return 0;
   1867 #ifdef Py_DEBUG
   1868     /* singleton refcount is greater than 1 */
   1869     assert(!unicode_is_singleton(unicode));
   1870 #endif
   1871     return 1;
   1872 }
   1873 
   1874 static int
   1875 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
   1876 {
   1877     PyObject *unicode;
   1878     Py_ssize_t old_length;
   1879 
   1880     assert(p_unicode != NULL);
   1881     unicode = *p_unicode;
   1882 
   1883     assert(unicode != NULL);
   1884     assert(PyUnicode_Check(unicode));
   1885     assert(0 <= length);
   1886 
   1887     if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
   1888         old_length = PyUnicode_WSTR_LENGTH(unicode);
   1889     else
   1890         old_length = PyUnicode_GET_LENGTH(unicode);
   1891     if (old_length == length)
   1892         return 0;
   1893 
   1894     if (length == 0) {
   1895         _Py_INCREF_UNICODE_EMPTY();
   1896         if (!unicode_empty)
   1897             return -1;
   1898         Py_SETREF(*p_unicode, unicode_empty);
   1899         return 0;
   1900     }
   1901 
   1902     if (!unicode_modifiable(unicode)) {
   1903         PyObject *copy = resize_copy(unicode, length);
   1904         if (copy == NULL)
   1905             return -1;
   1906         Py_SETREF(*p_unicode, copy);
   1907         return 0;
   1908     }
   1909 
   1910     if (PyUnicode_IS_COMPACT(unicode)) {
   1911         PyObject *new_unicode = resize_compact(unicode, length);
   1912         if (new_unicode == NULL)
   1913             return -1;
   1914         *p_unicode = new_unicode;
   1915         return 0;
   1916     }
   1917     return resize_inplace(unicode, length);
   1918 }
   1919 
   1920 int
   1921 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
   1922 {
   1923     PyObject *unicode;
   1924     if (p_unicode == NULL) {
   1925         PyErr_BadInternalCall();
   1926         return -1;
   1927     }
   1928     unicode = *p_unicode;
   1929     if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
   1930     {
   1931         PyErr_BadInternalCall();
   1932         return -1;
   1933     }
   1934     return unicode_resize(p_unicode, length);
   1935 }
   1936 
   1937 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
   1938 
   1939    WARNING: The function doesn't copy the terminating null character and
   1940    doesn't check the maximum character (may write a latin1 character in an
   1941    ASCII string). */
   1942 static void
   1943 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
   1944                    const char *str, Py_ssize_t len)
   1945 {
   1946     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
   1947     void *data = PyUnicode_DATA(unicode);
   1948     const char *end = str + len;
   1949 
   1950     switch (kind) {
   1951     case PyUnicode_1BYTE_KIND: {
   1952         assert(index + len <= PyUnicode_GET_LENGTH(unicode));
   1953 #ifdef Py_DEBUG
   1954         if (PyUnicode_IS_ASCII(unicode)) {
   1955             Py_UCS4 maxchar = ucs1lib_find_max_char(
   1956                 (const Py_UCS1*)str,
   1957                 (const Py_UCS1*)str + len);
   1958             assert(maxchar < 128);
   1959         }
   1960 #endif
   1961         memcpy((char *) data + index, str, len);
   1962         break;
   1963     }
   1964     case PyUnicode_2BYTE_KIND: {
   1965         Py_UCS2 *start = (Py_UCS2 *)data + index;
   1966         Py_UCS2 *ucs2 = start;
   1967         assert(index <= PyUnicode_GET_LENGTH(unicode));
   1968 
   1969         for (; str < end; ++ucs2, ++str)
   1970             *ucs2 = (Py_UCS2)*str;
   1971 
   1972         assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
   1973         break;
   1974     }
   1975     default: {
   1976         Py_UCS4 *start = (Py_UCS4 *)data + index;
   1977         Py_UCS4 *ucs4 = start;
   1978         assert(kind == PyUnicode_4BYTE_KIND);
   1979         assert(index <= PyUnicode_GET_LENGTH(unicode));
   1980 
   1981         for (; str < end; ++ucs4, ++str)
   1982             *ucs4 = (Py_UCS4)*str;
   1983 
   1984         assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
   1985     }
   1986     }
   1987 }
   1988 
   1989 static PyObject*
   1990 get_latin1_char(unsigned char ch)
   1991 {
   1992     PyObject *unicode = unicode_latin1[ch];
   1993     if (!unicode) {
   1994         unicode = PyUnicode_New(1, ch);
   1995         if (!unicode)
   1996             return NULL;
   1997         PyUnicode_1BYTE_DATA(unicode)[0] = ch;
   1998         assert(_PyUnicode_CheckConsistency(unicode, 1));
   1999         unicode_latin1[ch] = unicode;
   2000     }
   2001     Py_INCREF(unicode);
   2002     return unicode;
   2003 }
   2004 
   2005 static PyObject*
   2006 unicode_char(Py_UCS4 ch)
   2007 {
   2008     PyObject *unicode;
   2009 
   2010     assert(ch <= MAX_UNICODE);
   2011 
   2012     if (ch < 256)
   2013         return get_latin1_char(ch);
   2014 
   2015     unicode = PyUnicode_New(1, ch);
   2016     if (unicode == NULL)
   2017         return NULL;
   2018 
   2019     assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
   2020     if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
   2021         PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
   2022     } else {
   2023         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
   2024         PyUnicode_4BYTE_DATA(unicode)[0] = ch;
   2025     }
   2026     assert(_PyUnicode_CheckConsistency(unicode, 1));
   2027     return unicode;
   2028 }
   2029 
   2030 PyObject *
   2031 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
   2032 {
   2033     if (u == NULL)
   2034         return (PyObject*)_PyUnicode_New(size);
   2035 
   2036     if (size < 0) {
   2037         PyErr_BadInternalCall();
   2038         return NULL;
   2039     }
   2040 
   2041     return PyUnicode_FromWideChar(u, size);
   2042 }
   2043 
   2044 PyObject *
   2045 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
   2046 {
   2047     PyObject *unicode;
   2048     Py_UCS4 maxchar = 0;
   2049     Py_ssize_t num_surrogates;
   2050 
   2051     if (u == NULL && size != 0) {
   2052         PyErr_BadInternalCall();
   2053         return NULL;
   2054     }
   2055 
   2056     if (size == -1) {
   2057         size = wcslen(u);
   2058     }
   2059 
   2060     /* If the Unicode data is known at construction time, we can apply
   2061        some optimizations which share commonly used objects. */
   2062 
   2063     /* Optimization for empty strings */
   2064     if (size == 0)
   2065         _Py_RETURN_UNICODE_EMPTY();
   2066 
   2067     /* Single character Unicode objects in the Latin-1 range are
   2068        shared when using this constructor */
   2069     if (size == 1 && (Py_UCS4)*u < 256)
   2070         return get_latin1_char((unsigned char)*u);
   2071 
   2072     /* If not empty and not single character, copy the Unicode data
   2073        into the new object */
   2074     if (find_maxchar_surrogates(u, u + size,
   2075                                 &maxchar, &num_surrogates) == -1)
   2076         return NULL;
   2077 
   2078     unicode = PyUnicode_New(size - num_surrogates, maxchar);
   2079     if (!unicode)
   2080         return NULL;
   2081 
   2082     switch (PyUnicode_KIND(unicode)) {
   2083     case PyUnicode_1BYTE_KIND:
   2084         _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
   2085                                 u, u + size, PyUnicode_1BYTE_DATA(unicode));
   2086         break;
   2087     case PyUnicode_2BYTE_KIND:
   2088 #if Py_UNICODE_SIZE == 2
   2089         memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
   2090 #else
   2091         _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
   2092                                 u, u + size, PyUnicode_2BYTE_DATA(unicode));
   2093 #endif
   2094         break;
   2095     case PyUnicode_4BYTE_KIND:
   2096 #if SIZEOF_WCHAR_T == 2
   2097         /* This is the only case which has to process surrogates, thus
   2098            a simple copy loop is not enough and we need a function. */
   2099         unicode_convert_wchar_to_ucs4(u, u + size, unicode);
   2100 #else
   2101         assert(num_surrogates == 0);
   2102         memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
   2103 #endif
   2104         break;
   2105     default:
   2106         Py_UNREACHABLE();
   2107     }
   2108 
   2109     return unicode_result(unicode);
   2110 }
   2111 
   2112 PyObject *
   2113 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
   2114 {
   2115     if (size < 0) {
   2116         PyErr_SetString(PyExc_SystemError,
   2117                         "Negative size passed to PyUnicode_FromStringAndSize");
   2118         return NULL;
   2119     }
   2120     if (u != NULL)
   2121         return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
   2122     else
   2123         return (PyObject *)_PyUnicode_New(size);
   2124 }
   2125 
   2126 PyObject *
   2127 PyUnicode_FromString(const char *u)
   2128 {
   2129     size_t size = strlen(u);
   2130     if (size > PY_SSIZE_T_MAX) {
   2131         PyErr_SetString(PyExc_OverflowError, "input too long");
   2132         return NULL;
   2133     }
   2134     return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
   2135 }
   2136 
   2137 PyObject *
   2138 _PyUnicode_FromId(_Py_Identifier *id)
   2139 {
   2140     if (!id->object) {
   2141         id->object = PyUnicode_DecodeUTF8Stateful(id->string,
   2142                                                   strlen(id->string),
   2143                                                   NULL, NULL);
   2144         if (!id->object)
   2145             return NULL;
   2146         PyUnicode_InternInPlace(&id->object);
   2147         assert(!id->next);
   2148         id->next = static_strings;
   2149         static_strings = id;
   2150     }
   2151     return id->object;
   2152 }
   2153 
   2154 void
   2155 _PyUnicode_ClearStaticStrings()
   2156 {
   2157     _Py_Identifier *tmp, *s = static_strings;
   2158     while (s) {
   2159         Py_CLEAR(s->object);
   2160         tmp = s->next;
   2161         s->next = NULL;
   2162         s = tmp;
   2163     }
   2164     static_strings = NULL;
   2165 }
   2166 
   2167 /* Internal function, doesn't check maximum character */
   2168 
   2169 PyObject*
   2170 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
   2171 {
   2172     const unsigned char *s = (const unsigned char *)buffer;
   2173     PyObject *unicode;
   2174     if (size == 1) {
   2175 #ifdef Py_DEBUG
   2176         assert((unsigned char)s[0] < 128);
   2177 #endif
   2178         return get_latin1_char(s[0]);
   2179     }
   2180     unicode = PyUnicode_New(size, 127);
   2181     if (!unicode)
   2182         return NULL;
   2183     memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
   2184     assert(_PyUnicode_CheckConsistency(unicode, 1));
   2185     return unicode;
   2186 }
   2187 
   2188 static Py_UCS4
   2189 kind_maxchar_limit(unsigned int kind)
   2190 {
   2191     switch (kind) {
   2192     case PyUnicode_1BYTE_KIND:
   2193         return 0x80;
   2194     case PyUnicode_2BYTE_KIND:
   2195         return 0x100;
   2196     case PyUnicode_4BYTE_KIND:
   2197         return 0x10000;
   2198     default:
   2199         Py_UNREACHABLE();
   2200     }
   2201 }
   2202 
   2203 static PyObject*
   2204 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
   2205 {
   2206     PyObject *res;
   2207     unsigned char max_char;
   2208 
   2209     if (size == 0)
   2210         _Py_RETURN_UNICODE_EMPTY();
   2211     assert(size > 0);
   2212     if (size == 1)
   2213         return get_latin1_char(u[0]);
   2214 
   2215     max_char = ucs1lib_find_max_char(u, u + size);
   2216     res = PyUnicode_New(size, max_char);
   2217     if (!res)
   2218         return NULL;
   2219     memcpy(PyUnicode_1BYTE_DATA(res), u, size);
   2220     assert(_PyUnicode_CheckConsistency(res, 1));
   2221     return res;
   2222 }
   2223 
   2224 static PyObject*
   2225 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
   2226 {
   2227     PyObject *res;
   2228     Py_UCS2 max_char;
   2229 
   2230     if (size == 0)
   2231         _Py_RETURN_UNICODE_EMPTY();
   2232     assert(size > 0);
   2233     if (size == 1)
   2234         return unicode_char(u[0]);
   2235 
   2236     max_char = ucs2lib_find_max_char(u, u + size);
   2237     res = PyUnicode_New(size, max_char);
   2238     if (!res)
   2239         return NULL;
   2240     if (max_char >= 256)
   2241         memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
   2242     else {
   2243         _PyUnicode_CONVERT_BYTES(
   2244             Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
   2245     }
   2246     assert(_PyUnicode_CheckConsistency(res, 1));
   2247     return res;
   2248 }
   2249 
   2250 static PyObject*
   2251 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
   2252 {
   2253     PyObject *res;
   2254     Py_UCS4 max_char;
   2255 
   2256     if (size == 0)
   2257         _Py_RETURN_UNICODE_EMPTY();
   2258     assert(size > 0);
   2259     if (size == 1)
   2260         return unicode_char(u[0]);
   2261 
   2262     max_char = ucs4lib_find_max_char(u, u + size);
   2263     res = PyUnicode_New(size, max_char);
   2264     if (!res)
   2265         return NULL;
   2266     if (max_char < 256)
   2267         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
   2268                                  PyUnicode_1BYTE_DATA(res));
   2269     else if (max_char < 0x10000)
   2270         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
   2271                                  PyUnicode_2BYTE_DATA(res));
   2272     else
   2273         memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
   2274     assert(_PyUnicode_CheckConsistency(res, 1));
   2275     return res;
   2276 }
   2277 
   2278 PyObject*
   2279 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
   2280 {
   2281     if (size < 0) {
   2282         PyErr_SetString(PyExc_ValueError, "size must be positive");
   2283         return NULL;
   2284     }
   2285     switch (kind) {
   2286     case PyUnicode_1BYTE_KIND:
   2287         return _PyUnicode_FromUCS1(buffer, size);
   2288     case PyUnicode_2BYTE_KIND:
   2289         return _PyUnicode_FromUCS2(buffer, size);
   2290     case PyUnicode_4BYTE_KIND:
   2291         return _PyUnicode_FromUCS4(buffer, size);
   2292     default:
   2293         PyErr_SetString(PyExc_SystemError, "invalid kind");
   2294         return NULL;
   2295     }
   2296 }
   2297 
   2298 Py_UCS4
   2299 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
   2300 {
   2301     enum PyUnicode_Kind kind;
   2302     void *startptr, *endptr;
   2303 
   2304     assert(PyUnicode_IS_READY(unicode));
   2305     assert(0 <= start);
   2306     assert(end <= PyUnicode_GET_LENGTH(unicode));
   2307     assert(start <= end);
   2308 
   2309     if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
   2310         return PyUnicode_MAX_CHAR_VALUE(unicode);
   2311 
   2312     if (start == end)
   2313         return 127;
   2314 
   2315     if (PyUnicode_IS_ASCII(unicode))
   2316         return 127;
   2317 
   2318     kind = PyUnicode_KIND(unicode);
   2319     startptr = PyUnicode_DATA(unicode);
   2320     endptr = (char *)startptr + end * kind;
   2321     startptr = (char *)startptr + start * kind;
   2322     switch(kind) {
   2323     case PyUnicode_1BYTE_KIND:
   2324         return ucs1lib_find_max_char(startptr, endptr);
   2325     case PyUnicode_2BYTE_KIND:
   2326         return ucs2lib_find_max_char(startptr, endptr);
   2327     case PyUnicode_4BYTE_KIND:
   2328         return ucs4lib_find_max_char(startptr, endptr);
   2329     default:
   2330         Py_UNREACHABLE();
   2331     }
   2332 }
   2333 
   2334 /* Ensure that a string uses the most efficient storage, if it is not the
   2335    case: create a new string with of the right kind. Write NULL into *p_unicode
   2336    on error. */
   2337 static void
   2338 unicode_adjust_maxchar(PyObject **p_unicode)
   2339 {
   2340     PyObject *unicode, *copy;
   2341     Py_UCS4 max_char;
   2342     Py_ssize_t len;
   2343     unsigned int kind;
   2344 
   2345     assert(p_unicode != NULL);
   2346     unicode = *p_unicode;
   2347     assert(PyUnicode_IS_READY(unicode));
   2348     if (PyUnicode_IS_ASCII(unicode))
   2349         return;
   2350 
   2351     len = PyUnicode_GET_LENGTH(unicode);
   2352     kind = PyUnicode_KIND(unicode);
   2353     if (kind == PyUnicode_1BYTE_KIND) {
   2354         const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
   2355         max_char = ucs1lib_find_max_char(u, u + len);
   2356         if (max_char >= 128)
   2357             return;
   2358     }
   2359     else if (kind == PyUnicode_2BYTE_KIND) {
   2360         const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
   2361         max_char = ucs2lib_find_max_char(u, u + len);
   2362         if (max_char >= 256)
   2363             return;
   2364     }
   2365     else {
   2366         const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
   2367         assert(kind == PyUnicode_4BYTE_KIND);
   2368         max_char = ucs4lib_find_max_char(u, u + len);
   2369         if (max_char >= 0x10000)
   2370             return;
   2371     }
   2372     copy = PyUnicode_New(len, max_char);
   2373     if (copy != NULL)
   2374         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
   2375     Py_DECREF(unicode);
   2376     *p_unicode = copy;
   2377 }
   2378 
   2379 PyObject*
   2380 _PyUnicode_Copy(PyObject *unicode)
   2381 {
   2382     Py_ssize_t length;
   2383     PyObject *copy;
   2384 
   2385     if (!PyUnicode_Check(unicode)) {
   2386         PyErr_BadInternalCall();
   2387         return NULL;
   2388     }
   2389     if (PyUnicode_READY(unicode) == -1)
   2390         return NULL;
   2391 
   2392     length = PyUnicode_GET_LENGTH(unicode);
   2393     copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
   2394     if (!copy)
   2395         return NULL;
   2396     assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
   2397 
   2398     memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
   2399               length * PyUnicode_KIND(unicode));
   2400     assert(_PyUnicode_CheckConsistency(copy, 1));
   2401     return copy;
   2402 }
   2403 
   2404 
   2405 /* Widen Unicode objects to larger buffers. Don't write terminating null
   2406    character. Return NULL on error. */
   2407 
   2408 void*
   2409 _PyUnicode_AsKind(PyObject *s, unsigned int kind)
   2410 {
   2411     Py_ssize_t len;
   2412     void *result;
   2413     unsigned int skind;
   2414 
   2415     if (PyUnicode_READY(s) == -1)
   2416         return NULL;
   2417 
   2418     len = PyUnicode_GET_LENGTH(s);
   2419     skind = PyUnicode_KIND(s);
   2420     if (skind >= kind) {
   2421         PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
   2422         return NULL;
   2423     }
   2424     switch (kind) {
   2425     case PyUnicode_2BYTE_KIND:
   2426         result = PyMem_New(Py_UCS2, len);
   2427         if (!result)
   2428             return PyErr_NoMemory();
   2429         assert(skind == PyUnicode_1BYTE_KIND);
   2430         _PyUnicode_CONVERT_BYTES(
   2431             Py_UCS1, Py_UCS2,
   2432             PyUnicode_1BYTE_DATA(s),
   2433             PyUnicode_1BYTE_DATA(s) + len,
   2434             result);
   2435         return result;
   2436     case PyUnicode_4BYTE_KIND:
   2437         result = PyMem_New(Py_UCS4, len);
   2438         if (!result)
   2439             return PyErr_NoMemory();
   2440         if (skind == PyUnicode_2BYTE_KIND) {
   2441             _PyUnicode_CONVERT_BYTES(
   2442                 Py_UCS2, Py_UCS4,
   2443                 PyUnicode_2BYTE_DATA(s),
   2444                 PyUnicode_2BYTE_DATA(s) + len,
   2445                 result);
   2446         }
   2447         else {
   2448             assert(skind == PyUnicode_1BYTE_KIND);
   2449             _PyUnicode_CONVERT_BYTES(
   2450                 Py_UCS1, Py_UCS4,
   2451                 PyUnicode_1BYTE_DATA(s),
   2452                 PyUnicode_1BYTE_DATA(s) + len,
   2453                 result);
   2454         }
   2455         return result;
   2456     default:
   2457         break;
   2458     }
   2459     PyErr_SetString(PyExc_SystemError, "invalid kind");
   2460     return NULL;
   2461 }
   2462 
   2463 static Py_UCS4*
   2464 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
   2465         int copy_null)
   2466 {
   2467     int kind;
   2468     void *data;
   2469     Py_ssize_t len, targetlen;
   2470     if (PyUnicode_READY(string) == -1)
   2471         return NULL;
   2472     kind = PyUnicode_KIND(string);
   2473     data = PyUnicode_DATA(string);
   2474     len = PyUnicode_GET_LENGTH(string);
   2475     targetlen = len;
   2476     if (copy_null)
   2477         targetlen++;
   2478     if (!target) {
   2479         target = PyMem_New(Py_UCS4, targetlen);
   2480         if (!target) {
   2481             PyErr_NoMemory();
   2482             return NULL;
   2483         }
   2484     }
   2485     else {
   2486         if (targetsize < targetlen) {
   2487             PyErr_Format(PyExc_SystemError,
   2488                          "string is longer than the buffer");
   2489             if (copy_null && 0 < targetsize)
   2490                 target[0] = 0;
   2491             return NULL;
   2492         }
   2493     }
   2494     if (kind == PyUnicode_1BYTE_KIND) {
   2495         Py_UCS1 *start = (Py_UCS1 *) data;
   2496         _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
   2497     }
   2498     else if (kind == PyUnicode_2BYTE_KIND) {
   2499         Py_UCS2 *start = (Py_UCS2 *) data;
   2500         _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
   2501     }
   2502     else {
   2503         assert(kind == PyUnicode_4BYTE_KIND);
   2504         memcpy(target, data, len * sizeof(Py_UCS4));
   2505     }
   2506     if (copy_null)
   2507         target[len] = 0;
   2508     return target;
   2509 }
   2510 
   2511 Py_UCS4*
   2512 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
   2513                  int copy_null)
   2514 {
   2515     if (target == NULL || targetsize < 0) {
   2516         PyErr_BadInternalCall();
   2517         return NULL;
   2518     }
   2519     return as_ucs4(string, target, targetsize, copy_null);
   2520 }
   2521 
   2522 Py_UCS4*
   2523 PyUnicode_AsUCS4Copy(PyObject *string)
   2524 {
   2525     return as_ucs4(string, NULL, 0, 1);
   2526 }
   2527 
/* Maximum number of characters required for the output of %lld or %p;
   used below as the size of the scratch buffers that sprintf() fills.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   NOTE(review): the leading "2 +" presumably covers the sign and the
   terminating null written by sprintf() -- confirm. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
   2532 
   2533 static int
   2534 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
   2535                              Py_ssize_t width, Py_ssize_t precision)
   2536 {
   2537     Py_ssize_t length, fill, arglen;
   2538     Py_UCS4 maxchar;
   2539 
   2540     if (PyUnicode_READY(str) == -1)
   2541         return -1;
   2542 
   2543     length = PyUnicode_GET_LENGTH(str);
   2544     if ((precision == -1 || precision >= length)
   2545         && width <= length)
   2546         return _PyUnicodeWriter_WriteStr(writer, str);
   2547 
   2548     if (precision != -1)
   2549         length = Py_MIN(precision, length);
   2550 
   2551     arglen = Py_MAX(length, width);
   2552     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
   2553         maxchar = _PyUnicode_FindMaxChar(str, 0, length);
   2554     else
   2555         maxchar = writer->maxchar;
   2556 
   2557     if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
   2558         return -1;
   2559 
   2560     if (width > length) {
   2561         fill = width - length;
   2562         if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
   2563             return -1;
   2564         writer->pos += fill;
   2565     }
   2566 
   2567     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
   2568                                   str, 0, length);
   2569     writer->pos += length;
   2570     return 0;
   2571 }
   2572 
   2573 static int
   2574 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
   2575                               Py_ssize_t width, Py_ssize_t precision)
   2576 {
   2577     /* UTF-8 */
   2578     Py_ssize_t length;
   2579     PyObject *unicode;
   2580     int res;
   2581 
   2582     if (precision == -1) {
   2583         length = strlen(str);
   2584     }
   2585     else {
   2586         length = 0;
   2587         while (length < precision && str[length]) {
   2588             length++;
   2589         }
   2590     }
   2591     unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
   2592     if (unicode == NULL)
   2593         return -1;
   2594 
   2595     res = unicode_fromformat_write_str(writer, unicode, width, -1);
   2596     Py_DECREF(unicode);
   2597     return res;
   2598 }
   2599 
/* Process a single '%' conversion of a PyUnicode_FromFormatV() format.

   f points at the '%' character. The conversion is parsed (optional '0'
   flag, width, ".precision", 'l' / 'll' / 'z' size modifier, conversion
   character), its argument(s) are fetched from *vargs and the formatted
   text is written to writer.

   Return a pointer to the first format character after the conversion, or
   NULL on error. For an unrecognized conversion the remainder of the
   format string, starting at the '%', is copied to the output and the
   returned pointer addresses the terminating null character. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* Remember where the '%' is: the fallback for unknown conversions
       copies the format string from there. */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Reject the next digit if width*10 + digit would exceed
               PY_SSIZE_T_MAX. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* Same overflow guard as for the width. */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* Nothing follows this conversion in the format string: turn off
       over-allocation in the writer. */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* A single code point, passed as an int. */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* Format the bare number with the C library first; width and
           precision are applied afterwards. The va_arg type follows the
           size modifier parsed above. */
        if (*f == 'u') {
            if (longflag)
                len = sprintf(buffer, "%lu",
                        va_arg(*vargs, unsigned long));
            else if (longlongflag)
                len = sprintf(buffer, "%llu",
                        va_arg(*vargs, unsigned long long));
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, "%u",
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            /* No size modifier is recognized in front of 'x'. */
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
        }
        else {
            if (longflag)
                len = sprintf(buffer, "%li",
                        va_arg(*vargs, long));
            else if (longlongflag)
                len = sprintf(buffer, "%lli",
                        va_arg(*vargs, long long));
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, "%i",
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* From here on precision is the number of characters occupied by
           the (possibly zero-extended) number: at least len. */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* Pad on the left up to the width, with '0' if the '0' flag was
           given and with spaces otherwise. */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* Extend the number with leading zeros up to the precision. */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* Shift the digits (and the terminator) right by two to make
               room for the "0x" prefix. */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* A unicode object, written as is. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Consumes two arguments: a unicode object that may be NULL and a
           C string used when the object is NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* The result of PyObject_Str() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* The result of PyObject_Repr() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* The result of PyObject_ASCII() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* A literal percent sign; no argument is consumed. */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* Step over the conversion character. */
    f++;
    return f;
}
   2897 
   2898 PyObject *
   2899 PyUnicode_FromFormatV(const char *format, va_list vargs)
   2900 {
   2901     va_list vargs2;
   2902     const char *f;
   2903     _PyUnicodeWriter writer;
   2904 
   2905     _PyUnicodeWriter_Init(&writer);
   2906     writer.min_length = strlen(format) + 100;
   2907     writer.overallocate = 1;
   2908 
   2909     // Copy varags to be able to pass a reference to a subfunction.
   2910     va_copy(vargs2, vargs);
   2911 
   2912     for (f = format; *f; ) {
   2913         if (*f == '%') {
   2914             f = unicode_fromformat_arg(&writer, f, &vargs2);
   2915             if (f == NULL)
   2916                 goto fail;
   2917         }
   2918         else {
   2919             const char *p;
   2920             Py_ssize_t len;
   2921 
   2922             p = f;
   2923             do
   2924             {
   2925                 if ((unsigned char)*p > 127) {
   2926                     PyErr_Format(PyExc_ValueError,
   2927                         "PyUnicode_FromFormatV() expects an ASCII-encoded format "
   2928                         "string, got a non-ASCII byte: 0x%02x",
   2929                         (unsigned char)*p);
   2930                     goto fail;
   2931                 }
   2932                 p++;
   2933             }
   2934             while (*p != '\0' && *p != '%');
   2935             len = p - f;
   2936 
   2937             if (*p == '\0')
   2938                 writer.overallocate = 0;
   2939 
   2940             if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
   2941                 goto fail;
   2942 
   2943             f = p;
   2944         }
   2945     }
   2946     va_end(vargs2);
   2947     return _PyUnicodeWriter_Finish(&writer);
   2948 
   2949   fail:
   2950     va_end(vargs2);
   2951     _PyUnicodeWriter_Dealloc(&writer);
   2952     return NULL;
   2953 }
   2954 
   2955 PyObject *
   2956 PyUnicode_FromFormat(const char *format, ...)
   2957 {
   2958     PyObject* ret;
   2959     va_list vargs;
   2960 
   2961 #ifdef HAVE_STDARG_PROTOTYPES
   2962     va_start(vargs, format);
   2963 #else
   2964     va_start(vargs);
   2965 #endif
   2966     ret = PyUnicode_FromFormatV(format, vargs);
   2967     va_end(vargs);
   2968     return ret;
   2969 }
   2970 
   2971 #ifdef HAVE_WCHAR_H
   2972 
   2973 /* Convert a Unicode object to a wide character string.
   2974 
   2975    - If w is NULL: return the number of wide characters (including the null
   2976      character) required to convert the unicode object. Ignore size argument.
   2977 
   2978    - Otherwise: return the number of wide characters (excluding the null
   2979      character) written into w. Write at most size wide characters (including
   2980      the null character). */
   2981 Py_ssize_t
   2982 PyUnicode_AsWideChar(PyObject *unicode,
   2983                      wchar_t *w,
   2984                      Py_ssize_t size)
   2985 {
   2986     Py_ssize_t res;
   2987     const wchar_t *wstr;
   2988 
   2989     if (unicode == NULL) {
   2990         PyErr_BadInternalCall();
   2991         return -1;
   2992     }
   2993     wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
   2994     if (wstr == NULL)
   2995         return -1;
   2996 
   2997     if (w != NULL) {
   2998         if (size > res)
   2999             size = res + 1;
   3000         else
   3001             res = size;
   3002         memcpy(w, wstr, size * sizeof(wchar_t));
   3003         return res;
   3004     }
   3005     else
   3006         return res + 1;
   3007 }
   3008 
   3009 wchar_t*
   3010 PyUnicode_AsWideCharString(PyObject *unicode,
   3011                            Py_ssize_t *size)
   3012 {
   3013     const wchar_t *wstr;
   3014     wchar_t *buffer;
   3015     Py_ssize_t buflen;
   3016 
   3017     if (unicode == NULL) {
   3018         PyErr_BadInternalCall();
   3019         return NULL;
   3020     }
   3021 
   3022     wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
   3023     if (wstr == NULL) {
   3024         return NULL;
   3025     }
   3026     if (size == NULL && wcslen(wstr) != (size_t)buflen) {
   3027         PyErr_SetString(PyExc_ValueError,
   3028                         "embedded null character");
   3029         return NULL;
   3030     }
   3031 
   3032     buffer = PyMem_NEW(wchar_t, buflen + 1);
   3033     if (buffer == NULL) {
   3034         PyErr_NoMemory();
   3035         return NULL;
   3036     }
   3037     memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
   3038     if (size != NULL)
   3039         *size = buflen;
   3040     return buffer;
   3041 }
   3042 
   3043 #endif /* HAVE_WCHAR_H */
   3044 
   3045 PyObject *
   3046 PyUnicode_FromOrdinal(int ordinal)
   3047 {
   3048     if (ordinal < 0 || ordinal > MAX_UNICODE) {
   3049         PyErr_SetString(PyExc_ValueError,
   3050                         "chr() arg not in range(0x110000)");
   3051         return NULL;
   3052     }
   3053 
   3054     return unicode_char((Py_UCS4)ordinal);
   3055 }
   3056 
   3057 PyObject *
   3058 PyUnicode_FromObject(PyObject *obj)
   3059 {
   3060     /* XXX Perhaps we should make this API an alias of
   3061        PyObject_Str() instead ?! */
   3062     if (PyUnicode_CheckExact(obj)) {
   3063         if (PyUnicode_READY(obj) == -1)
   3064             return NULL;
   3065         Py_INCREF(obj);
   3066         return obj;
   3067     }
   3068     if (PyUnicode_Check(obj)) {
   3069         /* For a Unicode subtype that's not a Unicode object,
   3070            return a true Unicode object with the same data. */
   3071         return _PyUnicode_Copy(obj);
   3072     }
   3073     PyErr_Format(PyExc_TypeError,
   3074                  "Can't convert '%.100s' object to str implicitly",
   3075                  Py_TYPE(obj)->tp_name);
   3076     return NULL;
   3077 }
   3078 
   3079 PyObject *
   3080 PyUnicode_FromEncodedObject(PyObject *obj,
   3081                             const char *encoding,
   3082                             const char *errors)
   3083 {
   3084     Py_buffer buffer;
   3085     PyObject *v;
   3086 
   3087     if (obj == NULL) {
   3088         PyErr_BadInternalCall();
   3089         return NULL;
   3090     }
   3091 
   3092     /* Decoding bytes objects is the most common case and should be fast */
   3093     if (PyBytes_Check(obj)) {
   3094         if (PyBytes_GET_SIZE(obj) == 0)
   3095             _Py_RETURN_UNICODE_EMPTY();
   3096         v = PyUnicode_Decode(
   3097                 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
   3098                 encoding, errors);
   3099         return v;
   3100     }
   3101 
   3102     if (PyUnicode_Check(obj)) {
   3103         PyErr_SetString(PyExc_TypeError,
   3104                         "decoding str is not supported");
   3105         return NULL;
   3106     }
   3107 
   3108     /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
   3109     if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
   3110         PyErr_Format(PyExc_TypeError,
   3111                      "decoding to str: need a bytes-like object, %.80s found",
   3112                      Py_TYPE(obj)->tp_name);
   3113         return NULL;
   3114     }
   3115 
   3116     if (buffer.len == 0) {
   3117         PyBuffer_Release(&buffer);
   3118         _Py_RETURN_UNICODE_EMPTY();
   3119     }
   3120 
   3121     v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
   3122     PyBuffer_Release(&buffer);
   3123     return v;
   3124 }
   3125 
   3126 /* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   3127    also convert to lowercase. Return 1 on success, or 0 on error (encoding is
   3128    longer than lower_len-1). */
   3129 int
   3130 _Py_normalize_encoding(const char *encoding,
   3131                        char *lower,
   3132                        size_t lower_len)
   3133 {
   3134     const char *e;
   3135     char *l;
   3136     char *l_end;
   3137     int punct;
   3138 
   3139     assert(encoding != NULL);
   3140 
   3141     e = encoding;
   3142     l = lower;
   3143     l_end = &lower[lower_len - 1];
   3144     punct = 0;
   3145     while (1) {
   3146         char c = *e;
   3147         if (c == 0) {
   3148             break;
   3149         }
   3150 
   3151         if (Py_ISALNUM(c) || c == '.') {
   3152             if (punct && l != lower) {
   3153                 if (l == l_end) {
   3154                     return 0;
   3155                 }
   3156                 *l++ = '_';
   3157             }
   3158             punct = 0;
   3159 
   3160             if (l == l_end) {
   3161                 return 0;
   3162             }
   3163             *l++ = Py_TOLOWER(c);
   3164         }
   3165         else {
   3166             punct = 1;
   3167         }
   3168 
   3169         e++;
   3170     }
   3171     *l = '\0';
   3172     return 1;
   3173 }
   3174 
   3175 PyObject *
   3176 PyUnicode_Decode(const char *s,
   3177                  Py_ssize_t size,
   3178                  const char *encoding,
   3179                  const char *errors)
   3180 {
   3181     PyObject *buffer = NULL, *unicode;
   3182     Py_buffer info;
   3183     char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
   3184 
   3185     if (encoding == NULL) {
   3186         return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
   3187     }
   3188 
   3189     /* Shortcuts for common default encodings */
   3190     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
   3191         char *lower = buflower;
   3192 
   3193         /* Fast paths */
   3194         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
   3195             lower += 3;
   3196             if (*lower == '_') {
   3197                 /* Match "utf8" and "utf_8" */
   3198                 lower++;
   3199             }
   3200 
   3201             if (lower[0] == '8' && lower[1] == 0) {
   3202                 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
   3203             }
   3204             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
   3205                 return PyUnicode_DecodeUTF16(s, size, errors, 0);
   3206             }
   3207             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
   3208                 return PyUnicode_DecodeUTF32(s, size, errors, 0);
   3209             }
   3210         }
   3211         else {
   3212             if (strcmp(lower, "ascii") == 0
   3213                 || strcmp(lower, "us_ascii") == 0) {
   3214                 return PyUnicode_DecodeASCII(s, size, errors);
   3215             }
   3216     #ifdef MS_WINDOWS
   3217             else if (strcmp(lower, "mbcs") == 0) {
   3218                 return PyUnicode_DecodeMBCS(s, size, errors);
   3219             }
   3220     #endif
   3221             else if (strcmp(lower, "latin1") == 0
   3222                      || strcmp(lower, "latin_1") == 0
   3223                      || strcmp(lower, "iso_8859_1") == 0
   3224                      || strcmp(lower, "iso8859_1") == 0) {
   3225                 return PyUnicode_DecodeLatin1(s, size, errors);
   3226             }
   3227         }
   3228     }
   3229 
   3230     /* Decode via the codec registry */
   3231     buffer = NULL;
   3232     if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
   3233         goto onError;
   3234     buffer = PyMemoryView_FromBuffer(&info);
   3235     if (buffer == NULL)
   3236         goto onError;
   3237     unicode = _PyCodec_DecodeText(buffer, encoding, errors);
   3238     if (unicode == NULL)
   3239         goto onError;
   3240     if (!PyUnicode_Check(unicode)) {
   3241         PyErr_Format(PyExc_TypeError,
   3242                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
   3243                      "use codecs.decode() to decode to arbitrary types",
   3244                      encoding,
   3245                      Py_TYPE(unicode)->tp_name);
   3246         Py_DECREF(unicode);
   3247         goto onError;
   3248     }
   3249     Py_DECREF(buffer);
   3250     return unicode_result(unicode);
   3251 
   3252   onError:
   3253     Py_XDECREF(buffer);
   3254     return NULL;
   3255 }
   3256 
   3257 PyObject *
   3258 PyUnicode_AsDecodedObject(PyObject *unicode,
   3259                           const char *encoding,
   3260                           const char *errors)
   3261 {
   3262     if (!PyUnicode_Check(unicode)) {
   3263         PyErr_BadArgument();
   3264         return NULL;
   3265     }
   3266 
   3267     if (PyErr_WarnEx(PyExc_DeprecationWarning,
   3268                      "PyUnicode_AsDecodedObject() is deprecated; "
   3269                      "use PyCodec_Decode() to decode from str", 1) < 0)
   3270         return NULL;
   3271 
   3272     if (encoding == NULL)
   3273         encoding = PyUnicode_GetDefaultEncoding();
   3274 
   3275     /* Decode via the codec registry */
   3276     return PyCodec_Decode(unicode, encoding, errors);
   3277 }
   3278 
   3279 PyObject *
   3280 PyUnicode_AsDecodedUnicode(PyObject *unicode,
   3281                            const char *encoding,
   3282                            const char *errors)
   3283 {
   3284     PyObject *v;
   3285 
   3286     if (!PyUnicode_Check(unicode)) {
   3287         PyErr_BadArgument();
   3288         goto onError;
   3289     }
   3290 
   3291     if (PyErr_WarnEx(PyExc_DeprecationWarning,
   3292                      "PyUnicode_AsDecodedUnicode() is deprecated; "
   3293                      "use PyCodec_Decode() to decode from str to str", 1) < 0)
   3294         return NULL;
   3295 
   3296     if (encoding == NULL)
   3297         encoding = PyUnicode_GetDefaultEncoding();
   3298 
   3299     /* Decode via the codec registry */
   3300     v = PyCodec_Decode(unicode, encoding, errors);
   3301     if (v == NULL)
   3302         goto onError;
   3303     if (!PyUnicode_Check(v)) {
   3304         PyErr_Format(PyExc_TypeError,
   3305                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
   3306                      "use codecs.decode() to decode to arbitrary types",
   3307                      encoding,
   3308                      Py_TYPE(unicode)->tp_name);
   3309         Py_DECREF(v);
   3310         goto onError;
   3311     }
   3312     return unicode_result(v);
   3313 
   3314   onError:
   3315     return NULL;
   3316 }
   3317 
   3318 PyObject *
   3319 PyUnicode_Encode(const Py_UNICODE *s,
   3320                  Py_ssize_t size,
   3321                  const char *encoding,
   3322                  const char *errors)
   3323 {
   3324     PyObject *v, *unicode;
   3325 
   3326     unicode = PyUnicode_FromWideChar(s, size);
   3327     if (unicode == NULL)
   3328         return NULL;
   3329     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
   3330     Py_DECREF(unicode);
   3331     return v;
   3332 }
   3333 
   3334 PyObject *
   3335 PyUnicode_AsEncodedObject(PyObject *unicode,
   3336                           const char *encoding,
   3337                           const char *errors)
   3338 {
   3339     PyObject *v;
   3340 
   3341     if (!PyUnicode_Check(unicode)) {
   3342         PyErr_BadArgument();
   3343         goto onError;
   3344     }
   3345 
   3346     if (PyErr_WarnEx(PyExc_DeprecationWarning,
   3347                      "PyUnicode_AsEncodedObject() is deprecated; "
   3348                      "use PyUnicode_AsEncodedString() to encode from str to bytes "
   3349                      "or PyCodec_Encode() for generic encoding", 1) < 0)
   3350         return NULL;
   3351 
   3352     if (encoding == NULL)
   3353         encoding = PyUnicode_GetDefaultEncoding();
   3354 
   3355     /* Encode via the codec registry */
   3356     v = PyCodec_Encode(unicode, encoding, errors);
   3357     if (v == NULL)
   3358         goto onError;
   3359     return v;
   3360 
   3361   onError:
   3362     return NULL;
   3363 }
   3364 
   3365 static int
   3366 locale_error_handler(const char *errors, int *surrogateescape)
   3367 {
   3368     _Py_error_handler error_handler = get_error_handler(errors);
   3369     switch (error_handler)
   3370     {
   3371     case _Py_ERROR_STRICT:
   3372         *surrogateescape = 0;
   3373         return 0;
   3374     case _Py_ERROR_SURROGATEESCAPE:
   3375         *surrogateescape = 1;
   3376         return 0;
   3377     default:
   3378         PyErr_Format(PyExc_ValueError,
   3379                      "only 'strict' and 'surrogateescape' error handlers "
   3380                      "are supported, not '%s'",
   3381                      errors);
   3382         return -1;
   3383     }
   3384 }
   3385 
   3386 static PyObject *
   3387 unicode_encode_locale(PyObject *unicode, const char *errors,
   3388                       int current_locale)
   3389 {
   3390     int surrogateescape;
   3391     if (locale_error_handler(errors, &surrogateescape) < 0)
   3392         return NULL;
   3393 
   3394     Py_ssize_t wlen;
   3395     wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
   3396     if (wstr == NULL) {
   3397         return NULL;
   3398     }
   3399 
   3400     if ((size_t)wlen != wcslen(wstr)) {
   3401         PyErr_SetString(PyExc_ValueError, "embedded null character");
   3402         PyMem_Free(wstr);
   3403         return NULL;
   3404     }
   3405 
   3406     char *str;
   3407     size_t error_pos;
   3408     const char *reason;
   3409     int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
   3410                                  current_locale, surrogateescape);
   3411     PyMem_Free(wstr);
   3412 
   3413     if (res != 0) {
   3414         if (res == -2) {
   3415             PyObject *exc;
   3416             exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
   3417                     "locale", unicode,
   3418                     (Py_ssize_t)error_pos,
   3419                     (Py_ssize_t)(error_pos+1),
   3420                     reason);
   3421             if (exc != NULL) {
   3422                 PyCodec_StrictErrors(exc);
   3423                 Py_DECREF(exc);
   3424             }
   3425         }
   3426         else {
   3427             PyErr_NoMemory();
   3428         }
   3429         return NULL;
   3430     }
   3431 
   3432     PyObject *bytes = PyBytes_FromString(str);
   3433     PyMem_RawFree(str);
   3434     return bytes;
   3435 }
   3436 
   3437 PyObject *
   3438 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
   3439 {
   3440     return unicode_encode_locale(unicode, errors, 1);
   3441 }
   3442 
   3443 PyObject *
   3444 PyUnicode_EncodeFSDefault(PyObject *unicode)
   3445 {
   3446 #if defined(__APPLE__)
   3447     return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
   3448 #else
   3449     PyInterpreterState *interp = PyThreadState_GET()->interp;
   3450     /* Bootstrap check: if the filesystem codec is implemented in Python, we
   3451        cannot use it to encode and decode filenames before it is loaded. Load
   3452        the Python codec requires to encode at least its own filename. Use the C
   3453        version of the locale codec until the codec registry is initialized and
   3454        the Python codec is loaded.
   3455 
   3456        Py_FileSystemDefaultEncoding is shared between all interpreters, we
   3457        cannot only rely on it: check also interp->fscodec_initialized for
   3458        subinterpreters. */
   3459     if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
   3460         return PyUnicode_AsEncodedString(unicode,
   3461                                          Py_FileSystemDefaultEncoding,
   3462                                          Py_FileSystemDefaultEncodeErrors);
   3463     }
   3464     else {
   3465         return unicode_encode_locale(unicode,
   3466                                      Py_FileSystemDefaultEncodeErrors, 0);
   3467     }
   3468 #endif
   3469 }
   3470 
   3471 PyObject *
   3472 PyUnicode_AsEncodedString(PyObject *unicode,
   3473                           const char *encoding,
   3474                           const char *errors)
   3475 {
   3476     PyObject *v;
   3477     char buflower[11];   /* strlen("iso_8859_1\0") == 11, longest shortcut */
   3478 
   3479     if (!PyUnicode_Check(unicode)) {
   3480         PyErr_BadArgument();
   3481         return NULL;
   3482     }
   3483 
   3484     if (encoding == NULL) {
   3485         return _PyUnicode_AsUTF8String(unicode, errors);
   3486     }
   3487 
   3488     /* Shortcuts for common default encodings */
   3489     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
   3490         char *lower = buflower;
   3491 
   3492         /* Fast paths */
   3493         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
   3494             lower += 3;
   3495             if (*lower == '_') {
   3496                 /* Match "utf8" and "utf_8" */
   3497                 lower++;
   3498             }
   3499 
   3500             if (lower[0] == '8' && lower[1] == 0) {
   3501                 return _PyUnicode_AsUTF8String(unicode, errors);
   3502             }
   3503             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
   3504                 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
   3505             }
   3506             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
   3507                 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
   3508             }
   3509         }
   3510         else {
   3511             if (strcmp(lower, "ascii") == 0
   3512                 || strcmp(lower, "us_ascii") == 0) {
   3513                 return _PyUnicode_AsASCIIString(unicode, errors);
   3514             }
   3515 #ifdef MS_WINDOWS
   3516             else if (strcmp(lower, "mbcs") == 0) {
   3517                 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
   3518             }
   3519 #endif
   3520             else if (strcmp(lower, "latin1") == 0 ||
   3521                      strcmp(lower, "latin_1") == 0 ||
   3522                      strcmp(lower, "iso_8859_1") == 0 ||
   3523                      strcmp(lower, "iso8859_1") == 0) {
   3524                 return _PyUnicode_AsLatin1String(unicode, errors);
   3525             }
   3526         }
   3527     }
   3528 
   3529     /* Encode via the codec registry */
   3530     v = _PyCodec_EncodeText(unicode, encoding, errors);
   3531     if (v == NULL)
   3532         return NULL;
   3533 
   3534     /* The normal path */
   3535     if (PyBytes_Check(v))
   3536         return v;
   3537 
   3538     /* If the codec returns a buffer, raise a warning and convert to bytes */
   3539     if (PyByteArray_Check(v)) {
   3540         int error;
   3541         PyObject *b;
   3542 
   3543         error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
   3544             "encoder %s returned bytearray instead of bytes; "
   3545             "use codecs.encode() to encode to arbitrary types",
   3546             encoding);
   3547         if (error) {
   3548             Py_DECREF(v);
   3549             return NULL;
   3550         }
   3551 
   3552         b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
   3553                                       PyByteArray_GET_SIZE(v));
   3554         Py_DECREF(v);
   3555         return b;
   3556     }
   3557 
   3558     PyErr_Format(PyExc_TypeError,
   3559                  "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
   3560                  "use codecs.encode() to encode to arbitrary types",
   3561                  encoding,
   3562                  Py_TYPE(v)->tp_name);
   3563     Py_DECREF(v);
   3564     return NULL;
   3565 }
   3566 
   3567 PyObject *
   3568 PyUnicode_AsEncodedUnicode(PyObject *unicode,
   3569                            const char *encoding,
   3570                            const char *errors)
   3571 {
   3572     PyObject *v;
   3573 
   3574     if (!PyUnicode_Check(unicode)) {
   3575         PyErr_BadArgument();
   3576         goto onError;
   3577     }
   3578 
   3579     if (PyErr_WarnEx(PyExc_DeprecationWarning,
   3580                      "PyUnicode_AsEncodedUnicode() is deprecated; "
   3581                      "use PyCodec_Encode() to encode from str to str", 1) < 0)
   3582         return NULL;
   3583 
   3584     if (encoding == NULL)
   3585         encoding = PyUnicode_GetDefaultEncoding();
   3586 
   3587     /* Encode via the codec registry */
   3588     v = PyCodec_Encode(unicode, encoding, errors);
   3589     if (v == NULL)
   3590         goto onError;
   3591     if (!PyUnicode_Check(v)) {
   3592         PyErr_Format(PyExc_TypeError,
   3593                      "'%.400s' encoder returned '%.400s' instead of 'str'; "
   3594                      "use codecs.encode() to encode to arbitrary types",
   3595                      encoding,
   3596                      Py_TYPE(v)->tp_name);
   3597         Py_DECREF(v);
   3598         goto onError;
   3599     }
   3600     return v;
   3601 
   3602   onError:
   3603     return NULL;
   3604 }
   3605 
   3606 static PyObject*
   3607 unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
   3608                       int current_locale)
   3609 {
   3610     int surrogateescape;
   3611     if (locale_error_handler(errors, &surrogateescape) < 0)
   3612         return NULL;
   3613 
   3614     if (str[len] != '\0' || (size_t)len != strlen(str))  {
   3615         PyErr_SetString(PyExc_ValueError, "embedded null byte");
   3616         return NULL;
   3617     }
   3618 
   3619     wchar_t *wstr;
   3620     size_t wlen;
   3621     const char *reason;
   3622     int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
   3623                                  current_locale, surrogateescape);
   3624     if (res != 0) {
   3625         if (res == -2) {
   3626             PyObject *exc;
   3627             exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
   3628                                         "locale", str, len,
   3629                                         (Py_ssize_t)wlen,
   3630                                         (Py_ssize_t)(wlen + 1),
   3631                                         reason);
   3632             if (exc != NULL) {
   3633                 PyCodec_StrictErrors(exc);
   3634                 Py_DECREF(exc);
   3635             }
   3636         }
   3637         else {
   3638             PyErr_NoMemory();
   3639         }
   3640         return NULL;
   3641     }
   3642 
   3643     PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
   3644     PyMem_RawFree(wstr);
   3645     return unicode;
   3646 }
   3647 
   3648 PyObject*
   3649 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
   3650                               const char *errors)
   3651 {
   3652     return unicode_decode_locale(str, len, errors, 1);
   3653 }
   3654 
   3655 PyObject*
   3656 PyUnicode_DecodeLocale(const char *str, const char *errors)
   3657 {
   3658     Py_ssize_t size = (Py_ssize_t)strlen(str);
   3659     return unicode_decode_locale(str, size, errors, 1);
   3660 }
   3661 
   3662 
   3663 PyObject*
   3664 PyUnicode_DecodeFSDefault(const char *s) {
   3665     Py_ssize_t size = (Py_ssize_t)strlen(s);
   3666     return PyUnicode_DecodeFSDefaultAndSize(s, size);
   3667 }
   3668 
   3669 PyObject*
   3670 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
   3671 {
   3672 #if defined(__APPLE__)
   3673     return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
   3674 #else
   3675     PyInterpreterState *interp = PyThreadState_GET()->interp;
   3676     /* Bootstrap check: if the filesystem codec is implemented in Python, we
   3677        cannot use it to encode and decode filenames before it is loaded. Load
   3678        the Python codec requires to encode at least its own filename. Use the C
   3679        version of the locale codec until the codec registry is initialized and
   3680        the Python codec is loaded.
   3681 
   3682        Py_FileSystemDefaultEncoding is shared between all interpreters, we
   3683        cannot only rely on it: check also interp->fscodec_initialized for
   3684        subinterpreters. */
   3685     if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
   3686         return PyUnicode_Decode(s, size,
   3687                                 Py_FileSystemDefaultEncoding,
   3688                                 Py_FileSystemDefaultEncodeErrors);
   3689     }
   3690     else {
   3691         return unicode_decode_locale(s, size,
   3692                                      Py_FileSystemDefaultEncodeErrors, 0);
   3693     }
   3694 #endif
   3695 }
   3696 
   3697 
   3698 int
   3699 PyUnicode_FSConverter(PyObject* arg, void* addr)
   3700 {
   3701     PyObject *path = NULL;
   3702     PyObject *output = NULL;
   3703     Py_ssize_t size;
   3704     void *data;
   3705     if (arg == NULL) {
   3706         Py_DECREF(*(PyObject**)addr);
   3707         *(PyObject**)addr = NULL;
   3708         return 1;
   3709     }
   3710     path = PyOS_FSPath(arg);
   3711     if (path == NULL) {
   3712         return 0;
   3713     }
   3714     if (PyBytes_Check(path)) {
   3715         output = path;
   3716     }
   3717     else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
   3718         output = PyUnicode_EncodeFSDefault(path);
   3719         Py_DECREF(path);
   3720         if (!output) {
   3721             return 0;
   3722         }
   3723         assert(PyBytes_Check(output));
   3724     }
   3725 
   3726     size = PyBytes_GET_SIZE(output);
   3727     data = PyBytes_AS_STRING(output);
   3728     if ((size_t)size != strlen(data)) {
   3729         PyErr_SetString(PyExc_ValueError, "embedded null byte");
   3730         Py_DECREF(output);
   3731         return 0;
   3732     }
   3733     *(PyObject**)addr = output;
   3734     return Py_CLEANUP_SUPPORTED;
   3735 }
   3736 
   3737 
   3738 int
   3739 PyUnicode_FSDecoder(PyObject* arg, void* addr)
   3740 {
   3741     int is_buffer = 0;
   3742     PyObject *path = NULL;
   3743     PyObject *output = NULL;
   3744     if (arg == NULL) {
   3745         Py_DECREF(*(PyObject**)addr);
   3746         *(PyObject**)addr = NULL;
   3747         return 1;
   3748     }
   3749 
   3750     is_buffer = PyObject_CheckBuffer(arg);
   3751     if (!is_buffer) {
   3752         path = PyOS_FSPath(arg);
   3753         if (path == NULL) {
   3754             return 0;
   3755         }
   3756     }
   3757     else {
   3758         path = arg;
   3759         Py_INCREF(arg);
   3760     }
   3761 
   3762     if (PyUnicode_Check(path)) {
   3763         if (PyUnicode_READY(path) == -1) {
   3764             Py_DECREF(path);
   3765             return 0;
   3766         }
   3767         output = path;
   3768     }
   3769     else if (PyBytes_Check(path) || is_buffer) {
   3770         PyObject *path_bytes = NULL;
   3771 
   3772         if (!PyBytes_Check(path) &&
   3773             PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
   3774             "path should be string, bytes, or os.PathLike, not %.200s",
   3775             Py_TYPE(arg)->tp_name)) {
   3776                 Py_DECREF(path);
   3777             return 0;
   3778         }
   3779         path_bytes = PyBytes_FromObject(path);
   3780         Py_DECREF(path);
   3781         if (!path_bytes) {
   3782             return 0;
   3783         }
   3784         output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
   3785                                                   PyBytes_GET_SIZE(path_bytes));
   3786         Py_DECREF(path_bytes);
   3787         if (!output) {
   3788             return 0;
   3789         }
   3790     }
   3791     else {
   3792         PyErr_Format(PyExc_TypeError,
   3793                      "path should be string, bytes, or os.PathLike, not %.200s",
   3794                      Py_TYPE(arg)->tp_name);
   3795         Py_DECREF(path);
   3796         return 0;
   3797     }
   3798     if (PyUnicode_READY(output) == -1) {
   3799         Py_DECREF(output);
   3800         return 0;
   3801     }
   3802     if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
   3803                  PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
   3804         PyErr_SetString(PyExc_ValueError, "embedded null character");
   3805         Py_DECREF(output);
   3806         return 0;
   3807     }
   3808     *(PyObject**)addr = output;
   3809     return Py_CLEANUP_SUPPORTED;
   3810 }
   3811 
   3812 
   3813 const char *
   3814 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
   3815 {
   3816     PyObject *bytes;
   3817 
   3818     if (!PyUnicode_Check(unicode)) {
   3819         PyErr_BadArgument();
   3820         return NULL;
   3821     }
   3822     if (PyUnicode_READY(unicode) == -1)
   3823         return NULL;
   3824 
   3825     if (PyUnicode_UTF8(unicode) == NULL) {
   3826         assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
   3827         bytes = _PyUnicode_AsUTF8String(unicode, NULL);
   3828         if (bytes == NULL)
   3829             return NULL;
   3830         _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
   3831         if (_PyUnicode_UTF8(unicode) == NULL) {
   3832             PyErr_NoMemory();
   3833             Py_DECREF(bytes);
   3834             return NULL;
   3835         }
   3836         _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
   3837         memcpy(_PyUnicode_UTF8(unicode),
   3838                   PyBytes_AS_STRING(bytes),
   3839                   _PyUnicode_UTF8_LENGTH(unicode) + 1);
   3840         Py_DECREF(bytes);
   3841     }
   3842 
   3843     if (psize)
   3844         *psize = PyUnicode_UTF8_LENGTH(unicode);
   3845     return PyUnicode_UTF8(unicode);
   3846 }
   3847 
   3848 const char *
   3849 PyUnicode_AsUTF8(PyObject *unicode)
   3850 {
   3851     return PyUnicode_AsUTF8AndSize(unicode, NULL);
   3852 }
   3853 
   3854 Py_UNICODE *
   3855 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
   3856 {
   3857     const unsigned char *one_byte;
   3858 #if SIZEOF_WCHAR_T == 4
   3859     const Py_UCS2 *two_bytes;
   3860 #else
   3861     const Py_UCS4 *four_bytes;
   3862     const Py_UCS4 *ucs4_end;
   3863     Py_ssize_t num_surrogates;
   3864 #endif
   3865     wchar_t *w;
   3866     wchar_t *wchar_end;
   3867 
   3868     if (!PyUnicode_Check(unicode)) {
   3869         PyErr_BadArgument();
   3870         return NULL;
   3871     }
   3872     if (_PyUnicode_WSTR(unicode) == NULL) {
   3873         /* Non-ASCII compact unicode object */
   3874         assert(_PyUnicode_KIND(unicode) != 0);
   3875         assert(PyUnicode_IS_READY(unicode));
   3876 
   3877         if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
   3878 #if SIZEOF_WCHAR_T == 2
   3879             four_bytes = PyUnicode_4BYTE_DATA(unicode);
   3880             ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
   3881             num_surrogates = 0;
   3882 
   3883             for (; four_bytes < ucs4_end; ++four_bytes) {
   3884                 if (*four_bytes > 0xFFFF)
   3885                     ++num_surrogates;
   3886             }
   3887 
   3888             _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
   3889                     sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
   3890             if (!_PyUnicode_WSTR(unicode)) {
   3891                 PyErr_NoMemory();
   3892                 return NULL;
   3893             }
   3894             _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
   3895 
   3896             w = _PyUnicode_WSTR(unicode);
   3897             wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
   3898             four_bytes = PyUnicode_4BYTE_DATA(unicode);
   3899             for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
   3900                 if (*four_bytes > 0xFFFF) {
   3901                     assert(*four_bytes <= MAX_UNICODE);
   3902                     /* encode surrogate pair in this case */
   3903                     *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
   3904                     *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
   3905                 }
   3906                 else
   3907                     *w = *four_bytes;
   3908 
   3909                 if (w > wchar_end) {
   3910                     Py_UNREACHABLE();
   3911                 }
   3912             }
   3913             *w = 0;
   3914 #else
   3915             /* sizeof(wchar_t) == 4 */
   3916             Py_FatalError("Impossible unicode object state, wstr and str "
   3917                           "should share memory already.");
   3918             return NULL;
   3919 #endif
   3920         }
   3921         else {
   3922             if ((size_t)_PyUnicode_LENGTH(unicode) >
   3923                     PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
   3924                 PyErr_NoMemory();
   3925                 return NULL;
   3926             }
   3927             _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
   3928                                                   (_PyUnicode_LENGTH(unicode) + 1));
   3929             if (!_PyUnicode_WSTR(unicode)) {
   3930                 PyErr_NoMemory();
   3931                 return NULL;
   3932             }
   3933             if (!PyUnicode_IS_COMPACT_ASCII(unicode))
   3934                 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
   3935             w = _PyUnicode_WSTR(unicode);
   3936             wchar_end = w + _PyUnicode_LENGTH(unicode);
   3937 
   3938             if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
   3939                 one_byte = PyUnicode_1BYTE_DATA(unicode);
   3940                 for (; w < wchar_end; ++one_byte, ++w)
   3941                     *w = *one_byte;
   3942                 /* null-terminate the wstr */
   3943                 *w = 0;
   3944             }
   3945             else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
   3946 #if SIZEOF_WCHAR_T == 4
   3947                 two_bytes = PyUnicode_2BYTE_DATA(unicode);
   3948                 for (; w < wchar_end; ++two_bytes, ++w)
   3949                     *w = *two_bytes;
   3950                 /* null-terminate the wstr */
   3951                 *w = 0;
   3952 #else
   3953                 /* sizeof(wchar_t) == 2 */
   3954                 PyObject_FREE(_PyUnicode_WSTR(unicode));
   3955                 _PyUnicode_WSTR(unicode) = NULL;
   3956                 Py_FatalError("Impossible unicode object state, wstr "
   3957                               "and str should share memory already.");
   3958                 return NULL;
   3959 #endif
   3960             }
   3961             else {
   3962                 Py_UNREACHABLE();
   3963             }
   3964         }
   3965     }
   3966     if (size != NULL)
   3967         *size = PyUnicode_WSTR_LENGTH(unicode);
   3968     return _PyUnicode_WSTR(unicode);
   3969 }
   3970 
   3971 Py_UNICODE *
   3972 PyUnicode_AsUnicode(PyObject *unicode)
   3973 {
   3974     return PyUnicode_AsUnicodeAndSize(unicode, NULL);
   3975 }
   3976 
   3977 const Py_UNICODE *
   3978 _PyUnicode_AsUnicode(PyObject *unicode)
   3979 {
   3980     Py_ssize_t size;
   3981     const Py_UNICODE *wstr;
   3982 
   3983     wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
   3984     if (wstr && wcslen(wstr) != (size_t)size) {
   3985         PyErr_SetString(PyExc_ValueError, "embedded null character");
   3986         return NULL;
   3987     }
   3988     return wstr;
   3989 }
   3990 
   3991 
   3992 Py_ssize_t
   3993 PyUnicode_GetSize(PyObject *unicode)
   3994 {
   3995     if (!PyUnicode_Check(unicode)) {
   3996         PyErr_BadArgument();
   3997         goto onError;
   3998     }
   3999     if (_PyUnicode_WSTR(unicode) == NULL) {
   4000         if (PyUnicode_AsUnicode(unicode) == NULL)
   4001             goto onError;
   4002     }
   4003     return PyUnicode_WSTR_LENGTH(unicode);
   4004 
   4005   onError:
   4006     return -1;
   4007 }
   4008 
   4009 Py_ssize_t
   4010 PyUnicode_GetLength(PyObject *unicode)
   4011 {
   4012     if (!PyUnicode_Check(unicode)) {
   4013         PyErr_BadArgument();
   4014         return -1;
   4015     }
   4016     if (PyUnicode_READY(unicode) == -1)
   4017         return -1;
   4018     return PyUnicode_GET_LENGTH(unicode);
   4019 }
   4020 
   4021 Py_UCS4
   4022 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
   4023 {
   4024     void *data;
   4025     int kind;
   4026 
   4027     if (!PyUnicode_Check(unicode)) {
   4028         PyErr_BadArgument();
   4029         return (Py_UCS4)-1;
   4030     }
   4031     if (PyUnicode_READY(unicode) == -1) {
   4032         return (Py_UCS4)-1;
   4033     }
   4034     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
   4035         PyErr_SetString(PyExc_IndexError, "string index out of range");
   4036         return (Py_UCS4)-1;
   4037     }
   4038     data = PyUnicode_DATA(unicode);
   4039     kind = PyUnicode_KIND(unicode);
   4040     return PyUnicode_READ(kind, data, index);
   4041 }
   4042 
   4043 int
   4044 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
   4045 {
   4046     if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
   4047         PyErr_BadArgument();
   4048         return -1;
   4049     }
   4050     assert(PyUnicode_IS_READY(unicode));
   4051     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
   4052         PyErr_SetString(PyExc_IndexError, "string index out of range");
   4053         return -1;
   4054     }
   4055     if (unicode_check_modifiable(unicode))
   4056         return -1;
   4057     if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
   4058         PyErr_SetString(PyExc_ValueError, "character out of range");
   4059         return -1;
   4060     }
   4061     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
   4062                     index, ch);
   4063     return 0;
   4064 }
   4065 
   4066 const char *
   4067 PyUnicode_GetDefaultEncoding(void)
   4068 {
   4069     return "utf-8";
   4070 }
   4071 
   4072 /* create or adjust a UnicodeDecodeError */
   4073 static void
   4074 make_decode_exception(PyObject **exceptionObject,
   4075                       const char *encoding,
   4076                       const char *input, Py_ssize_t length,
   4077                       Py_ssize_t startpos, Py_ssize_t endpos,
   4078                       const char *reason)
   4079 {
   4080     if (*exceptionObject == NULL) {
   4081         *exceptionObject = PyUnicodeDecodeError_Create(
   4082             encoding, input, length, startpos, endpos, reason);
   4083     }
   4084     else {
   4085         if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
   4086             goto onError;
   4087         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
   4088             goto onError;
   4089         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
   4090             goto onError;
   4091     }
   4092     return;
   4093 
   4094 onError:
   4095     Py_CLEAR(*exceptionObject);
   4096 }
   4097 
   4098 #ifdef MS_WINDOWS
   4099 /* error handling callback helper:
   4100    build arguments, call the callback and check the arguments,
   4101    if no exception occurred, copy the replacement to the output
   4102    and adjust various state variables.
   4103    return 0 on success, -1 on error
   4104 */
   4105 
   4106 static int
   4107 unicode_decode_call_errorhandler_wchar(
   4108     const char *errors, PyObject **errorHandler,
   4109     const char *encoding, const char *reason,
   4110     const char **input, const char **inend, Py_ssize_t *startinpos,
   4111     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
   4112     PyObject **output, Py_ssize_t *outpos)
   4113 {
   4114     static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
   4115 
   4116     PyObject *restuple = NULL;
   4117     PyObject *repunicode = NULL;
   4118     Py_ssize_t outsize;
   4119     Py_ssize_t insize;
   4120     Py_ssize_t requiredsize;
   4121     Py_ssize_t newpos;
   4122     PyObject *inputobj = NULL;
   4123     wchar_t *repwstr;
   4124     Py_ssize_t repwlen;
   4125 
   4126     assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
   4127     outsize = _PyUnicode_WSTR_LENGTH(*output);
   4128 
   4129     if (*errorHandler == NULL) {
   4130         *errorHandler = PyCodec_LookupError(errors);
   4131         if (*errorHandler == NULL)
   4132             goto onError;
   4133     }
   4134 
   4135     make_decode_exception(exceptionObject,
   4136         encoding,
   4137         *input, *inend - *input,
   4138         *startinpos, *endinpos,
   4139         reason);
   4140     if (*exceptionObject == NULL)
   4141         goto onError;
   4142 
   4143     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
   4144     if (restuple == NULL)
   4145         goto onError;
   4146     if (!PyTuple_Check(restuple)) {
   4147         PyErr_SetString(PyExc_TypeError, &argparse[3]);
   4148         goto onError;
   4149     }
   4150     if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
   4151         goto onError;
   4152 
   4153     /* Copy back the bytes variables, which might have been modified by the
   4154        callback */
   4155     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
   4156     if (!inputobj)
   4157         goto onError;
   4158     *input = PyBytes_AS_STRING(inputobj);
   4159     insize = PyBytes_GET_SIZE(inputobj);
   4160     *inend = *input + insize;
   4161     /* we can DECREF safely, as the exception has another reference,
   4162        so the object won't go away. */
   4163     Py_DECREF(inputobj);
   4164 
   4165     if (newpos<0)
   4166         newpos = insize+newpos;
   4167     if (newpos<0 || newpos>insize) {
   4168         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
   4169         goto onError;
   4170     }
   4171 
   4172     repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
   4173     if (repwstr == NULL)
   4174         goto onError;
   4175     /* need more space? (at least enough for what we
   4176        have+the replacement+the rest of the string (starting
   4177        at the new input position), so we won't have to check space
   4178        when there are no errors in the rest of the string) */
   4179     requiredsize = *outpos;
   4180     if (requiredsize > PY_SSIZE_T_MAX - repwlen)
   4181         goto overflow;
   4182     requiredsize += repwlen;
   4183     if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
   4184         goto overflow;
   4185     requiredsize += insize - newpos;
   4186     if (requiredsize > outsize) {
   4187         if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
   4188             requiredsize = 2*outsize;
   4189         if (unicode_resize(output, requiredsize) < 0)
   4190             goto onError;
   4191     }
   4192     wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
   4193     *outpos += repwlen;
   4194     *endinpos = newpos;
   4195     *inptr = *input + newpos;
   4196 
   4197     /* we made it! */
   4198     Py_DECREF(restuple);
   4199     return 0;
   4200 
   4201   overflow:
   4202     PyErr_SetString(PyExc_OverflowError,
   4203                     "decoded result is too long for a Python string");
   4204 
   4205   onError:
   4206     Py_XDECREF(restuple);
   4207     return -1;
   4208 }
   4209 #endif   /* MS_WINDOWS */
   4210 
   4211 static int
   4212 unicode_decode_call_errorhandler_writer(
   4213     const char *errors, PyObject **errorHandler,
   4214     const char *encoding, const char *reason,
   4215     const char **input, const char **inend, Py_ssize_t *startinpos,
   4216     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
   4217     _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
   4218 {
            /* Report a decoding error to the error handler selected by `errors`
               and append the handler's replacement string to `writer`.

               errors / errorHandler: handler name and a cache for the looked-up
                   handler object; the lookup only happens while *errorHandler
                   is NULL.
               encoding, reason, *startinpos, *endinpos: passed to
                   make_decode_exception() together with the input buffer.
               input / inend: in-out pointers to the start and end of the input;
                   they are re-read from the exception after the callback ran.
               endinpos / inptr: on success receive the position (and the
                   matching pointer) returned by the handler.
               exceptionObject: in-out exception instance handed to the handler.

               Returns 0 on success and -1 on failure. */
   4219     static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
   4220 
   4221     PyObject *restuple = NULL;
   4222     PyObject *repunicode = NULL;
   4223     Py_ssize_t insize;
   4224     Py_ssize_t newpos;
   4225     Py_ssize_t replen;
   4226     Py_ssize_t remain;
   4227     PyObject *inputobj = NULL;
   4228     int need_to_grow = 0;
   4229     const char *new_inptr;
   4230 
            /* Look the handler up once; later calls reuse the cached object. */
   4231     if (*errorHandler == NULL) {
   4232         *errorHandler = PyCodec_LookupError(errors);
   4233         if (*errorHandler == NULL)
   4234             goto onError;
   4235     }
   4236 
   4237     make_decode_exception(exceptionObject,
   4238         encoding,
   4239         *input, *inend - *input,
   4240         *startinpos, *endinpos,
   4241         reason);
   4242     if (*exceptionObject == NULL)
   4243         goto onError;
   4244 
   4245     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
   4246     if (restuple == NULL)
   4247         goto onError;
   4248     if (!PyTuple_Check(restuple)) {
                /* &argparse[3] skips the "Un;" format prefix, leaving only the
                   message text. */
   4249         PyErr_SetString(PyExc_TypeError, &argparse[3]);
   4250         goto onError;
   4251     }
   4252     if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
   4253         goto onError;
   4254 
   4255     /* Copy back the bytes variables, which might have been modified by the
   4256        callback */
   4257     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
   4258     if (!inputobj)
   4259         goto onError;
            /* Number of bytes that followed the error range in the input as it
               was before the pointers are refreshed; compared against the new
               remainder further down. */
   4260     remain = *inend - *input - *endinpos;
   4261     *input = PyBytes_AS_STRING(inputobj);
   4262     insize = PyBytes_GET_SIZE(inputobj);
   4263     *inend = *input + insize;
   4264     /* we can DECREF safely, as the exception has another reference,
   4265        so the object won't go away. */
   4266     Py_DECREF(inputobj);
   4267 
            /* A negative position counts back from the end of the input. */
   4268     if (newpos<0)
   4269         newpos = insize+newpos;
   4270     if (newpos<0 || newpos>insize) {
   4271         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
   4272         goto onError;
   4273     }
   4274 
            /* Every replacement character beyond the first raises the writer's
               minimum length. */
   4275     replen = PyUnicode_GET_LENGTH(repunicode);
   4276     if (replen > 1) {
   4277         writer->min_length += replen - 1;
   4278         need_to_grow = 1;
   4279     }
   4280     new_inptr = *input + newpos;
   4281     if (*inend - new_inptr > remain) {
   4282         /* We don't know the decoding algorithm here so we make the worst
   4283            assumption that one byte decodes to one unicode character.
   4284            If unfortunately one byte could decode to more unicode characters,
   4285            the decoder may write out-of-bound then.  Is it possible for the
   4286            algorithms using this function? */
   4287         writer->min_length += *inend - new_inptr - remain;
   4288         need_to_grow = 1;
   4289     }
   4290     if (need_to_grow) {
                /* Reserve up to the new minimum length, wide enough for the
                   largest character of the replacement string. */
   4291         writer->overallocate = 1;
   4292         if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
   4293                             PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
   4294             goto onError;
   4295     }
   4296     if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
   4297         goto onError;
   4298 
            /* Tell the caller where the handler wants decoding to continue. */
   4299     *endinpos = newpos;
   4300     *inptr = new_inptr;
   4301 
   4302     /* we made it! */
   4303     Py_DECREF(restuple);
   4304     return 0;
   4305 
   4306   onError:
   4307     Py_XDECREF(restuple);
   4308     return -1;
   4309 }
   4310 
   4311 /* --- UTF-7 Codec -------------------------------------------------------- */
   4312 
   4313 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
   4314 
   4315 /* Three simple macros defining base-64. */
   4316 
   4317 /* Is c a base-64 character (A-Z, a-z, 0-9, '+' or '/')?
           The argument is evaluated several times, so it must not have side
           effects. */
   4318 
   4319 #define IS_BASE64(c) \
   4320     (((c) >= 'A' && (c) <= 'Z') ||     \
   4321      ((c) >= 'a' && (c) <= 'z') ||     \
   4322      ((c) >= '0' && (c) <= '9') ||     \
   4323      (c) == '+' || (c) == '/')
   4324 
   4325 /* given that c is a base-64 character, what is its base-64 value?
           A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61 and '+' to 62; every
           other value falls through to 63, so the result is only meaningful
           when IS_BASE64(c) holds.  The argument is evaluated several times. */
   4326 
   4327 #define FROM_BASE64(c)                                                  \
   4328     (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
   4329      ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
   4330      ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
   4331      (c) == '+' ? 62 : 63)
   4332 
   4333 /* What is the base-64 character of the bottom 6 bits of n?
           n is masked with 0x3f before indexing the 64-character alphabet,
           so any higher bits of n are ignored. */
   4334 
   4335 #define TO_BASE64(n)  \
   4336     ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
   4337 
   4338 /* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
   4339  * decoded as itself.  We are permissive on decoding; the only ASCII
   4340  * byte not decoding to itself is the + which begins a base64
   4341  * string.
         * The macro is true for any value up to 127 other than '+'; its
         * argument is evaluated twice. */
   4342 
   4343 #define DECODE_DIRECT(c)                                \
   4344     ((c) <= 127 && (c) != '+')
   4345 
   4346 /* The UTF-7 encoder treats ASCII characters differently according to
   4347  * whether they are Set D, Set O, Whitespace, or special (i.e. none of
   4348  * the above).  See RFC2152.  This array identifies these different
   4349  * sets:
   4350  * 0 : "Set D"
   4351  *     alphanumeric and '(),-./:?
   4352  * 1 : "Set O"
   4353  *     !"#$%&*;<=>@[]^_`{|}
   4354  * 2 : "whitespace"
   4355  *     ht nl cr sp
   4356  * 3 : special (must be base64 encoded)
   4357  *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
         *
         * The table has one entry per ASCII code, so it must only be indexed
         * with values 0..127.  Each row below covers 16 consecutive codes and
         * is preceded by a comment naming the characters in that row.
   4358  */
   4359 
   4360 static
   4361 char utf7_category[128] = {
   4362 /* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
   4363     3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
   4364 /* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
   4365     3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
   4366 /* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
   4367     2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
   4368 /*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
   4369     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
   4370 /*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
   4371     1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   4372 /*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
   4373     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
   4374 /*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
   4375     1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   4376 /*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
   4377     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
   4378 };
   4379 
   4380 /* ENCODE_DIRECT: this character should be encoded as itself.  The
   4381  * answer depends on whether we are encoding set O as itself, and also
   4382  * on whether we are encoding whitespace as itself.  RFC2152 makes it
   4383  * clear that the answers to these questions vary between
   4384  * applications, so this code needs to be flexible.
         *
         * directO and directWS are truth values: when non-zero, "Set O"
         * (category 1) respectively whitespace (category 2) characters count
         * as direct.  Category 0 characters are always direct; NUL and any
         * value >= 128 never are.  The range test comes first so that
         * utf7_category is only indexed with 1..127.  c is evaluated several
         * times. */
   4385 
   4386 #define ENCODE_DIRECT(c, directO, directWS)             \
   4387     ((c) < 128 && (c) > 0 &&                            \
   4388      ((utf7_category[(c)] == 0) ||                      \
   4389       (directWS && (utf7_category[(c)] == 2)) ||        \
   4390       (directO && (utf7_category[(c)] == 1))))
   4391 
   4392 PyObject *
   4393 PyUnicode_DecodeUTF7(const char *s,
   4394                      Py_ssize_t size,
   4395                      const char *errors)
   4396 {
            /* Decode `size` bytes of UTF-7 starting at `s`.  This simply
               forwards to PyUnicode_DecodeUTF7Stateful() with a NULL `consumed`
               pointer and returns whatever that call returns. */
   4397     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
   4398 }
   4399 
   4400 /* The decoder.  The only state we preserve is our read position,
   4401  * i.e. how many characters we have consumed.  So if we end in the
   4402  * middle of a shift sequence we have to back off the read position
   4403  * and the output to the beginning of the sequence, otherwise we lose
   4404  * all the shift state (seen bits, number of bits seen, high
   4405  * surrogate).
         *
         * s, size: the UTF-7 bytes to decode.
         * errors: error handler name, forwarded to
         *     unicode_decode_call_errorhandler_writer().
         * consumed: if non-NULL, receives the number of input bytes that were
         *     decoded; an unfinished trailing shift sequence is then left
         *     undecoded instead of being reported as an error.
         *
         * Returns the decoded string object, or NULL on failure. */
   4406 
   4407 PyObject *
   4408 PyUnicode_DecodeUTF7Stateful(const char *s,
   4409                              Py_ssize_t size,
   4410                              const char *errors,
   4411                              Py_ssize_t *consumed)
   4412 {
   4413     const char *starts = s;
   4414     Py_ssize_t startinpos;
   4415     Py_ssize_t endinpos;
   4416     const char *e;
   4417     _PyUnicodeWriter writer;
   4418     const char *errmsg = "";
   4419     int inShift = 0;
   4420     Py_ssize_t shiftOutStart;
   4421     unsigned int base64bits = 0;
   4422     unsigned long base64buffer = 0;
   4423     Py_UCS4 surrogate = 0;
   4424     PyObject *errorHandler = NULL;
   4425     PyObject *exc = NULL;
   4426 
            /* Empty input: nothing consumed, return the empty string. */
   4427     if (size == 0) {
   4428         if (consumed)
   4429             *consumed = 0;
   4430         _Py_RETURN_UNICODE_EMPTY();
   4431     }
   4432 
   4433     /* Start off assuming it's all ASCII. Widen later as necessary. */
   4434     _PyUnicodeWriter_Init(&writer);
   4435     writer.min_length = size;
   4436 
   4437     shiftOutStart = 0;
   4438     e = s + size;
   4439 
   4440     while (s < e) {
   4441         Py_UCS4 ch;
                /* Re-entry point used after the end-of-input error handler moved
                   `s` back inside the buffer. */
   4442       restart:
   4443         ch = (unsigned char) *s;
   4444 
   4445         if (inShift) { /* in a base-64 section */
   4446             if (IS_BASE64(ch)) { /* consume a base-64 character */
                        /* Append the 6 new bits to the pending bit buffer. */
   4447                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
   4448                 base64bits += 6;
   4449                 s++;
   4450                 if (base64bits >= 16) {
   4451                     /* we have enough bits for a UTF-16 value */
   4452                     Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
   4453                     base64bits -= 16;
   4454                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
   4455                     assert(outCh <= 0xffff);
   4456                     if (surrogate) {
   4457                         /* expecting a second surrogate */
   4458                         if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
   4459                             Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
   4460                             if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
   4461                                 goto onError;
   4462                             surrogate = 0;
   4463                             continue;
   4464                         }
   4465                         else {
                                    /* Not a valid pair: emit the pending high
                                       surrogate on its own and handle outCh
                                       below. */
   4466                             if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
   4467                                 goto onError;
   4468                             surrogate = 0;
   4469                         }
   4470                     }
   4471                     if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
   4472                         /* first surrogate */
   4473                         surrogate = outCh;
   4474                     }
   4475                     else {
   4476                         if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
   4477                             goto onError;
   4478                     }
   4479                 }
   4480             }
   4481             else { /* now leaving a base-64 section */
   4482                 inShift = 0;
   4483                 if (base64bits > 0) { /* left-over bits */
   4484                     if (base64bits >= 6) {
   4485                         /* We've seen at least one base-64 character */
   4486                         s++;
   4487                         errmsg = "partial character in shift sequence";
   4488                         goto utf7Error;
   4489                     }
   4490                     else {
   4491                         /* Some bits remain; they should be zero */
   4492                         if (base64buffer != 0) {
   4493                             s++;
   4494                             errmsg = "non-zero padding bits in shift sequence";
   4495                             goto utf7Error;
   4496                         }
   4497                     }
   4498                 }
                        /* A pending high surrogate is only written out when the
                           terminating character is one that decodes directly;
                           in every case it is cleared afterwards. */
   4499                 if (surrogate && DECODE_DIRECT(ch)) {
   4500                     if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
   4501                         goto onError;
   4502                 }
   4503                 surrogate = 0;
   4504                 if (ch == '-') {
   4505                     /* '-' is absorbed; other terminating
   4506                        characters are preserved */
   4507                     s++;
   4508                 }
   4509             }
   4510         }
   4511         else if ( ch == '+' ) {
                    /* Remember where the sequence starts: used for error
                       reporting and for *consumed when the input ends inside
                       the shift sequence. */
   4512             startinpos = s-starts;
   4513             s++; /* consume '+' */
   4514             if (s < e && *s == '-') { /* '+-' encodes '+' */
   4515                 s++;
   4516                 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
   4517                     goto onError;
   4518             }
   4519             else { /* begin base64-encoded section */
   4520                 inShift = 1;
   4521                 surrogate = 0;
   4522                 shiftOutStart = writer.pos;
   4523                 base64bits = 0;
   4524                 base64buffer = 0;
   4525             }
   4526         }
   4527         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
   4528             s++;
   4529             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
   4530                 goto onError;
   4531         }
   4532         else {
                    /* A byte above 127 outside of a shift sequence. */
   4533             startinpos = s-starts;
   4534             s++;
   4535             errmsg = "unexpected special character";
   4536             goto utf7Error;
   4537         }
   4538         continue;
                /* Common error exit of the loop body: the handler call updates
                   starts, e and s, and the loop then continues from s. */
   4539 utf7Error:
   4540         endinpos = s-starts;
   4541         if (unicode_decode_call_errorhandler_writer(
   4542                 errors, &errorHandler,
   4543                 "utf7", errmsg,
   4544                 &starts, &e, &startinpos, &endinpos, &exc, &s,
   4545                 &writer))
   4546             goto onError;
   4547     }
   4548 
   4549     /* end of string */
   4550 
   4551     if (inShift && !consumed) { /* in shift sequence, no more to follow */
   4552         /* if we're in an inconsistent state, that's an error */
   4553         inShift = 0;
   4554         if (surrogate ||
   4555                 (base64bits >= 6) ||
   4556                 (base64bits > 0 && base64buffer != 0)) {
   4557             endinpos = size;
   4558             if (unicode_decode_call_errorhandler_writer(
   4559                     errors, &errorHandler,
   4560                     "utf7", "unterminated shift sequence",
   4561                     &starts, &e, &startinpos, &endinpos, &exc, &s,
   4562                     &writer))
   4563                 goto onError;
                    /* The handler may have pointed s back into the input; if so,
                       resume decoding inside the loop. */
   4564             if (s < e)
   4565                 goto restart;
   4566         }
   4567     }
   4568 
   4569     /* return state */
   4570     if (consumed) {
   4571         if (inShift) {
                    /* Report only the input up to the '+' that opened the
                       unfinished shift sequence as consumed. */
   4572             *consumed = startinpos;
   4573             if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                        /* Output was already produced inside the unfinished
                           sequence and the writer holds non-ASCII data: build
                           the result from the first shiftOutStart characters
                           only.  NOTE(review): presumably done so the result is
                           not shaped by characters that are being discarded --
                           confirm. */
   4574                 PyObject *result = PyUnicode_FromKindAndData(
   4575                         writer.kind, writer.data, shiftOutStart);
   4576                 Py_XDECREF(errorHandler);
   4577                 Py_XDECREF(exc);
   4578                 _PyUnicodeWriter_Dealloc(&writer);
   4579                 return result;
   4580             }
   4581             writer.pos = shiftOutStart; /* back off output */
   4582         }
   4583         else {
   4584             *consumed = s-starts;
   4585         }
   4586     }
   4587 
   4588     Py_XDECREF(errorHandler);
   4589     Py_XDECREF(exc);
   4590     return _PyUnicodeWriter_Finish(&writer);
   4591 
   4592   onError:
   4593     Py_XDECREF(errorHandler);
   4594     Py_XDECREF(exc);
   4595     _PyUnicodeWriter_Dealloc(&writer);
   4596     return NULL;
   4597 }
   4598 
   4599 
   4600 PyObject *
   4601 _PyUnicode_EncodeUTF7(PyObject *str,
   4602                       int base64SetO,
   4603                       int base64WhiteSpace,
   4604                       const char *errors)
   4605 {
            /* Encode the string object `str` as UTF-7 and return a bytes object,
               or NULL on failure.

               base64SetO / base64WhiteSpace: when non-zero, "Set O" characters
                   respectively whitespace are base64-encoded instead of being
                   written directly (their negations are what ENCODE_DIRECT
                   receives).
               errors: not referenced anywhere in this function body. */
   4606     int kind;
   4607     void *data;
   4608     Py_ssize_t len;
   4609     PyObject *v;
   4610     int inShift = 0;
   4611     Py_ssize_t i;
   4612     unsigned int base64bits = 0;
   4613     unsigned long base64buffer = 0;
   4614     char * out;
   4615     char * start;
   4616 
   4617     if (PyUnicode_READY(str) == -1)
   4618         return NULL;
   4619     kind = PyUnicode_KIND(str);
   4620     data = PyUnicode_DATA(str);
   4621     len = PyUnicode_GET_LENGTH(str);
   4622 
   4623     if (len == 0)
   4624         return PyBytes_FromStringAndSize(NULL, 0);
   4625 
   4626     /* It might be possible to tighten this worst case */
            /* Guard the len * 8 multiplication below against overflow. */
   4627     if (len > PY_SSIZE_T_MAX / 8)
   4628         return PyErr_NoMemory();
   4629     v = PyBytes_FromStringAndSize(NULL, len * 8);
   4630     if (v == NULL)
   4631         return NULL;
   4632 
   4633     start = out = PyBytes_AS_STRING(v);
   4634     for (i = 0; i < len; ++i) {
   4635         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   4636 
   4637         if (inShift) {
   4638             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
   4639                 /* shifting out */
   4640                 if (base64bits) { /* output remaining bits */
   4641                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
   4642                     base64buffer = 0;
   4643                     base64bits = 0;
   4644                 }
   4645                 inShift = 0;
   4646                 /* Characters not in the BASE64 set implicitly unshift the sequence
   4647                    so no '-' is required, except if the character is itself a '-' */
   4648                 if (IS_BASE64(ch) || ch == '-') {
   4649                     *out++ = '-';
   4650                 }
   4651                 *out++ = (char) ch;
   4652             }
   4653             else {
   4654                 goto encode_char;
   4655             }
   4656         }
   4657         else { /* not in a shift sequence */
   4658             if (ch == '+') {
                        /* A literal '+' is written as "+-". */
   4659                 *out++ = '+';
   4660                         *out++ = '-';
   4661             }
   4662             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
   4663                 *out++ = (char) ch;
   4664             }
   4665             else {
                        /* Open a shift sequence and encode this character in
                           it. */
   4666                 *out++ = '+';
   4667                 inShift = 1;
   4668                 goto encode_char;
   4669             }
   4670         }
   4671         continue;
                /* Push ch into the bit buffer as one or two 16-bit units and
                   emit a base-64 character for every complete group of 6
                   bits. */
   4672 encode_char:
   4673         if (ch >= 0x10000) {
   4674             assert(ch <= MAX_UNICODE);
   4675 
   4676             /* code first surrogate */
   4677             base64bits += 16;
   4678             base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
   4679             while (base64bits >= 6) {
   4680                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
   4681                 base64bits -= 6;
   4682             }
   4683             /* prepare second surrogate */
   4684             ch = Py_UNICODE_LOW_SURROGATE(ch);
   4685         }
   4686         base64bits += 16;
   4687         base64buffer = (base64buffer << 16) | ch;
   4688         while (base64bits >= 6) {
   4689             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
   4690             base64bits -= 6;
   4691         }
   4692     }
            /* Flush leftover bits (padded on the right with zero bits) and
               close a shift sequence that is still open. */
   4693     if (base64bits)
   4694         *out++= TO_BASE64(base64buffer << (6-base64bits) );
   4695     if (inShift)
   4696         *out++ = '-';
            /* Shrink the worst-case buffer to the number of bytes written. */
   4697     if (_PyBytes_Resize(&v, out - start) < 0)
   4698         return NULL;
   4699     return v;
   4700 }
   4701 PyObject *
   4702 PyUnicode_EncodeUTF7(const Py_UNICODE *s,
   4703                      Py_ssize_t size,
   4704                      int base64SetO,
   4705                      int base64WhiteSpace,
   4706                      const char *errors)
   4707 {
            /* Buffer-based entry point: builds a temporary string object from
               the `size` Py_UNICODE units at `s`, encodes it with
               _PyUnicode_EncodeUTF7() and releases the temporary.  Returns the
               encoder's result, or NULL if the temporary cannot be created. */
   4708     PyObject *result;
   4709     PyObject *tmp = PyUnicode_FromWideChar(s, size);
   4710     if (tmp == NULL)
   4711         return NULL;
   4712     result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
   4713                                    base64WhiteSpace, errors);
   4714     Py_DECREF(tmp);
   4715     return result;
   4716 }
   4717 
   4718 #undef IS_BASE64
   4719 #undef FROM_BASE64
   4720 #undef TO_BASE64
   4721 #undef DECODE_DIRECT
   4722 #undef ENCODE_DIRECT
   4723 
   4724 /* --- UTF-8 Codec -------------------------------------------------------- */
   4725 
   4726 PyObject *
   4727 PyUnicode_DecodeUTF8(const char *s,
   4728                      Py_ssize_t size,
   4729                      const char *errors)
   4730 {
            /* Decode `size` bytes of UTF-8 starting at `s`.  This simply
               forwards to PyUnicode_DecodeUTF8Stateful() with a NULL `consumed`
               pointer and returns whatever that call returns. */
   4731     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
   4732 }
   4733 
   4734 #include "stringlib/asciilib.h"
   4735 #include "stringlib/codecs.h"
   4736 #include "stringlib/undef.h"
   4737 
   4738 #include "stringlib/ucs1lib.h"
   4739 #include "stringlib/codecs.h"
   4740 #include "stringlib/undef.h"
   4741 
   4742 #include "stringlib/ucs2lib.h"
   4743 #include "stringlib/codecs.h"
   4744 #include "stringlib/undef.h"
   4745 
   4746 #include "stringlib/ucs4lib.h"
   4747 #include "stringlib/codecs.h"
   4748 #include "stringlib/undef.h"
   4749 
   4750 /* Mask to quickly check whether a C 'long' contains a
   4751    non-ASCII, UTF8-encoded char.
           The mask has the top bit (0x80) of every byte set, so a non-zero
           result of `value & ASCII_CHAR_MASK` means that at least one byte of
           the word is >= 0x80.  Only 4- and 8-byte longs are supported; any
           other size stops the build. */
   4752 #if (SIZEOF_LONG == 8)
   4753 # define ASCII_CHAR_MASK 0x8080808080808080UL
   4754 #elif (SIZEOF_LONG == 4)
   4755 # define ASCII_CHAR_MASK 0x80808080UL
   4756 #else
   4757 # error C 'long' size should be either 4 or 8!
   4758 #endif
   4759 
   4760 static Py_ssize_t
   4761 ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
   4762 {
            /* Copy the leading run of ASCII bytes (values below 0x80) from
               [start, end) into dest and return the length of that run.
               Copying stops at the first byte that has its high bit set. */
   4763     const char *p = start;
   4764     const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
   4765 
   4766     /*
   4767      * Issue #17237: m68k is a bit different from most architectures in
   4768      * that objects do not use "natural alignment" - for example, int and
   4769      * long are only aligned at 2-byte boundaries.  Therefore the assert()
   4770      * won't work; also, tests have shown that skipping the "optimised
   4771      * version" will even speed up m68k.
   4772      */
   4773 #if !defined(__m68k__)
   4774 #if SIZEOF_LONG <= SIZEOF_VOID_P
   4775     assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
   4776     if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
   4777         /* Fast path, see in STRINGLIB(utf8_decode) for
   4778            an explanation. */
   4779         /* Help allocation */
   4780         const char *_p = p;
   4781         Py_UCS1 * q = dest;
                /* Source and destination are both long-aligned here: test and
                   copy one whole long at a time while every byte is ASCII. */
   4782         while (_p < aligned_end) {
   4783             unsigned long value = *(const unsigned long *) _p;
   4784             if (value & ASCII_CHAR_MASK)
   4785                 break;
   4786             *((unsigned long *)q) = value;
   4787             _p += SIZEOF_LONG;
   4788             q += SIZEOF_LONG;
   4789         }
   4790         p = _p;
                /* Finish byte by byte: the unaligned tail, or the word in which
                   a non-ASCII byte was detected. */
   4791         while (p < end) {
   4792             if ((unsigned char)*p & 0x80)
   4793                 break;
   4794             *q++ = *p++;
   4795         }
   4796         return p - start;
   4797     }
   4798 #endif
   4799 #endif
            /* Generic path: this loop only scans for the end of the ASCII run;
               the bytes are copied with a single memcpy() afterwards. */
   4800     while (p < end) {
   4801         /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
   4802            for an explanation. */
   4803         if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
   4804             /* Help allocation */
   4805             const char *_p = p;
   4806             while (_p < aligned_end) {
   4807                 unsigned long value = *(unsigned long *) _p;
   4808                 if (value & ASCII_CHAR_MASK)
   4809                     break;
   4810                 _p += SIZEOF_LONG;
   4811             }
   4812             p = _p;
   4813             if (_p == end)
   4814                 break;
   4815         }
   4816         if ((unsigned char)*p & 0x80)
   4817             break;
   4818         ++p;
   4819     }
   4820     memcpy(dest, start, p - start);
   4821     return p - start;
   4822 }
   4823 
   4824 PyObject *
   4825 PyUnicode_DecodeUTF8Stateful(const char *s,
   4826                              Py_ssize_t size,
   4827                              const char *errors,
   4828                              Py_ssize_t *consumed)
   4829 {
            /* Decode `size` bytes of UTF-8 starting at `s`.

               errors: error handler name; "ignore", "replace" and
                   "surrogateescape" are handled inline below, every other
                   handler goes through
                   unicode_decode_call_errorhandler_writer().
               consumed: if non-NULL, receives the number of input bytes that
                   were decoded; incomplete data at the end of the input then
                   stops decoding instead of raising an error.

               Returns the decoded string object, or NULL on failure. */
   4830     _PyUnicodeWriter writer;
   4831     const char *starts = s;
   4832     const char *end = s + size;
   4833 
   4834     Py_ssize_t startinpos;
   4835     Py_ssize_t endinpos;
   4836     const char *errmsg = "";
   4837     PyObject *error_handler_obj = NULL;
   4838     PyObject *exc = NULL;
   4839     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
   4840 
   4841     if (size == 0) {
   4842         if (consumed)
   4843             *consumed = 0;
   4844         _Py_RETURN_UNICODE_EMPTY();
   4845     }
   4846 
   4847     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
   4848     if (size == 1 && (unsigned char)s[0] < 128) {
   4849         if (consumed)
   4850             *consumed = 1;
   4851         return get_latin1_char((unsigned char)s[0]);
   4852     }
   4853 
            /* Reserve one output character per input byte, starting with an
               ASCII-only buffer. */
   4854     _PyUnicodeWriter_Init(&writer);
   4855     writer.min_length = size;
   4856     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
   4857         goto onError;
   4858 
            /* Copy the leading ASCII run in one go; the loop below deals with
               whatever follows it. */
   4859     writer.pos = ascii_decode(s, end, writer.data);
   4860     s += writer.pos;
   4861     while (s < end) {
   4862         Py_UCS4 ch;
   4863         int kind = writer.kind;
   4864 
                /* Pick the decoder variant that matches the writer's current
                   buffer kind. */
   4865         if (kind == PyUnicode_1BYTE_KIND) {
   4866             if (PyUnicode_IS_ASCII(writer.buffer))
   4867                 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
   4868             else
   4869                 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
   4870         } else if (kind == PyUnicode_2BYTE_KIND) {
   4871             ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
   4872         } else {
   4873             assert(kind == PyUnicode_4BYTE_KIND);
   4874             ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
   4875         }
   4876 
                /* Return values 0-4 are status codes; any other value is a
                   character, which is written through the writer. */
   4877         switch (ch) {
   4878         case 0:
                    /* Input exhausted, or a stateful caller that accepts a
                       truncated tail: stop without reporting an error. */
   4879             if (s == end || consumed)
   4880                 goto End;
   4881             errmsg = "unexpected end of data";
   4882             startinpos = s - starts;
   4883             endinpos = end - starts;
   4884             break;
   4885         case 1:
   4886             errmsg = "invalid start byte";
   4887             startinpos = s - starts;
   4888             endinpos = startinpos + 1;
   4889             break;
   4890         case 2:
   4891         case 3:
   4892         case 4:
                    /* The error range covers ch - 1 bytes starting at s. */
   4893             errmsg = "invalid continuation byte";
   4894             startinpos = s - starts;
   4895             endinpos = startinpos + ch - 1;
   4896             break;
   4897         default:
   4898             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
   4899                 goto onError;
   4900             continue;
   4901         }
   4902 
                /* Resolve the handler name only once, on the first error. */
   4903         if (error_handler == _Py_ERROR_UNKNOWN)
   4904             error_handler = get_error_handler(errors);
   4905 
   4906         switch (error_handler) {
   4907         case _Py_ERROR_IGNORE:
                    /* Skip the offending bytes without producing output. */
   4908             s += (endinpos - startinpos);
   4909             break;
   4910 
   4911         case _Py_ERROR_REPLACE:
                    /* Emit one U+FFFD for the whole error range. */
   4912             if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
   4913                 goto onError;
   4914             s += (endinpos - startinpos);
   4915             break;
   4916 
   4917         case _Py_ERROR_SURROGATEESCAPE:
   4918         {
   4919             Py_ssize_t i;
   4920 
                    /* Each offending byte b is stored as the code point
                       0xdc00 + b, which needs at least a 2-byte buffer. */
   4921             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
   4922                 goto onError;
   4923             for (i=startinpos; i<endinpos; i++) {
   4924                 ch = (Py_UCS4)(unsigned char)(starts[i]);
   4925                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
   4926                                 ch + 0xdc00);
   4927                 writer.pos++;
   4928             }
   4929             s += (endinpos - startinpos);
   4930             break;
   4931         }
   4932 
   4933         default:
                    /* Any other handler: use the generic callback path, which
                       also updates starts, end and s. */
   4934             if (unicode_decode_call_errorhandler_writer(
   4935                     errors, &error_handler_obj,
   4936                     "utf-8", errmsg,
   4937                     &starts, &end, &startinpos, &endinpos, &exc, &s,
   4938                     &writer))
   4939                 goto onError;
   4940         }
   4941     }
   4942 
   4943 End:
   4944     if (consumed)
   4945         *consumed = s - starts;
   4946 
   4947     Py_XDECREF(error_handler_obj);
   4948     Py_XDECREF(exc);
   4949     return _PyUnicodeWriter_Finish(&writer);
   4950 
   4951 onError:
   4952     Py_XDECREF(error_handler_obj);
   4953     Py_XDECREF(exc);
   4954     _PyUnicodeWriter_Dealloc(&writer);
   4955     return NULL;
   4956 }
   4957 
   4958 
   4959 /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
   4960    non-zero, use strict error handler otherwise.
   4961 
   4962    On success, write a pointer to a newly allocated wide character string into
   4963    *wstr (use PyMem_RawFree() to free the memory) and write the output length
   4964    (in number of wchar_t units) into *wlen (if wlen is set).
   4965 
   4966    On memory allocation failure, return -1.
   4967 
   4968    On decoding error (if surrogateescape is zero), return -2. If wlen is
   4969    non-NULL, write the start of the illegal byte sequence into *wlen. If reason
   4970    is not NULL, write the decoding error message into *reason. */
   4971 int
   4972 _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
   4973                  const char **reason, int surrogateescape)
   4974 {
   4975     const char *orig_s = s;
   4976     const char *e;
   4977     wchar_t *unicode;
   4978     Py_ssize_t outpos;
   4979 
   4980     /* Note: size will always be longer than the resulting Unicode
   4981        character count */
            /* Reject sizes for which (size + 1) * sizeof(wchar_t) would not fit
               in Py_ssize_t.  The limit is compared against `size` itself
               rather than `size + 1`, because that addition overflows (undefined
               behaviour) when size == PY_SSIZE_T_MAX. */
   4982     if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
   4983         return -1;
   4984     }
   4985 
            /* One wchar_t per input byte plus the terminating null character. */
   4986     unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
   4987     if (!unicode) {
   4988         return -1;
   4989     }
   4990 
   4991     /* Unpack UTF-8 encoded data */
   4992     e = s + size;
   4993     outpos = 0;
   4994     while (s < e) {
   4995         Py_UCS4 ch;
   4996 #if SIZEOF_WCHAR_T == 4
   4997         ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
   4998 #else
   4999         ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
   5000 #endif
                /* A result above 0xFF is a character that the call above did not
                   store itself; smaller results are treated as status codes in
                   the else branch. */
   5001         if (ch > 0xFF) {
   5002 #if SIZEOF_WCHAR_T == 4
   5003             Py_UNREACHABLE();
   5004 #else
   5005             assert(ch > 0xFFFF && ch <= MAX_UNICODE);
   5006             /* write a surrogate pair */
   5007             unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
   5008             unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
   5009 #endif
   5010         }
   5011         else {
                    /* Status 0 with the whole input read: decoding is done. */
   5012             if (!ch && s == e)
   5013                 break;
   5014             if (!surrogateescape) {
                        /* Strict mode: release the buffer and report the reason
                           and the offset of the offending sequence. */
   5015                 PyMem_RawFree(unicode );
   5016                 if (reason != NULL) {
   5017                     switch (ch) {
   5018                     case 0:
   5019                         *reason = "unexpected end of data";
   5020                         break;
   5021                     case 1:
   5022                         *reason = "invalid start byte";
   5023                         break;
   5024                     /* 2, 3, 4 */
   5025                     default:
   5026                         *reason = "invalid continuation byte";
   5027                         break;
   5028                     }
   5029                 }
   5030                 if (wlen != NULL) {
   5031                     *wlen = s - orig_s;
   5032                 }
   5033                 return -2;
   5034             }
   5035             /* surrogateescape */
                    /* Escape a single byte b as 0xDC00 + b, then retry decoding
                       from the next byte. */
   5036             unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
   5037         }
   5038     }
   5039     unicode[outpos] = L'\0';
   5040     if (wlen) {
   5041         *wlen = outpos;
   5042     }
   5043     *wstr = unicode;
   5044     return 0;
   5045 }
   5046 
   5047 wchar_t*
   5048 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
   5049 {
   5050     wchar_t *wstr;
   5051     int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
   5052     if (res != 0) {
   5053         return NULL;
   5054     }
   5055     return wstr;
   5056 }
   5057 
   5058 
   5059 /* UTF-8 encoder using the surrogateescape error handler .
   5060 
   5061    On success, return 0 and write the newly allocated character string (use
   5062    PyMem_Free() to free the memory) into *str.
   5063 
   5064    On encoding failure, return -2 and write the position of the invalid
   5065    surrogate character into *error_pos (if error_pos is set) and the decoding
   5066    error message into *reason (if reason is set).
   5067 
   5068    On memory allocation failure, return -1. */
   5069 int
   5070 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
   5071                  const char **reason, int raw_malloc, int surrogateescape)
   5072 {
   5073     const Py_ssize_t max_char_size = 4;
   5074     Py_ssize_t len = wcslen(text);
   5075 
   5076     assert(len >= 0);
   5077 
   5078     if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
   5079         return -1;
   5080     }
   5081     char *bytes;
   5082     if (raw_malloc) {
   5083         bytes = PyMem_RawMalloc((len + 1) * max_char_size);
   5084     }
   5085     else {
   5086         bytes = PyMem_Malloc((len + 1) * max_char_size);
   5087     }
   5088     if (bytes == NULL) {
   5089         return -1;
   5090     }
   5091 
   5092     char *p = bytes;
   5093     Py_ssize_t i;
   5094     for (i = 0; i < len; i++) {
   5095         Py_UCS4 ch = text[i];
   5096 
   5097         if (ch < 0x80) {
   5098             /* Encode ASCII */
   5099             *p++ = (char) ch;
   5100 
   5101         }
   5102         else if (ch < 0x0800) {
   5103             /* Encode Latin-1 */
   5104             *p++ = (char)(0xc0 | (ch >> 6));
   5105             *p++ = (char)(0x80 | (ch & 0x3f));
   5106         }
   5107         else if (Py_UNICODE_IS_SURROGATE(ch)) {
   5108             /* surrogateescape error handler */
   5109             if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
   5110                 if (error_pos != NULL) {
   5111                     *error_pos = (size_t)i;
   5112                 }
   5113                 if (reason != NULL) {
   5114                     *reason = "encoding error";
   5115                 }
   5116                 if (raw_malloc) {
   5117                     PyMem_RawFree(bytes);
   5118                 }
   5119                 else {
   5120                     PyMem_Free(bytes);
   5121                 }
   5122                 return -2;
   5123             }
   5124             *p++ = (char)(ch & 0xff);
   5125         }
   5126         else if (ch < 0x10000) {
   5127             *p++ = (char)(0xe0 | (ch >> 12));
   5128             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
   5129             *p++ = (char)(0x80 | (ch & 0x3f));
   5130         }
   5131         else {  /* ch >= 0x10000 */
   5132             assert(ch <= MAX_UNICODE);
   5133             /* Encode UCS4 Unicode ordinals */
   5134             *p++ = (char)(0xf0 | (ch >> 18));
   5135             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
   5136             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
   5137             *p++ = (char)(0x80 | (ch & 0x3f));
   5138         }
   5139     }
   5140     *p++ = '\0';
   5141 
   5142     size_t final_size = (p - bytes);
   5143     char *bytes2;
   5144     if (raw_malloc) {
   5145         bytes2 = PyMem_RawRealloc(bytes, final_size);
   5146     }
   5147     else {
   5148         bytes2 = PyMem_Realloc(bytes, final_size);
   5149     }
   5150     if (bytes2 == NULL) {
   5151         if (error_pos != NULL) {
   5152             *error_pos = (size_t)-1;
   5153         }
   5154         if (raw_malloc) {
   5155             PyMem_RawFree(bytes);
   5156         }
   5157         else {
   5158             PyMem_Free(bytes);
   5159         }
   5160         return -1;
   5161     }
   5162     *str = bytes2;
   5163     return 0;
   5164 }
   5165 
   5166 
   5167 /* Primary internal function which creates utf8 encoded bytes objects.
   5168 
   5169    Allocation strategy:  if the string is short, convert into a stack buffer
   5170    and allocate exactly as much space needed at the end.  Else allocate the
   5171    maximum possible needed (4 result bytes per Unicode character), and return
   5172    the excess memory at the end.
   5173 */
   5174 PyObject *
   5175 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
   5176 {
   5177     enum PyUnicode_Kind kind;
   5178     void *data;
   5179     Py_ssize_t size;
   5180 
   5181     if (!PyUnicode_Check(unicode)) {
   5182         PyErr_BadArgument();
   5183         return NULL;
   5184     }
   5185 
   5186     if (PyUnicode_READY(unicode) == -1)
   5187         return NULL;
   5188 
   5189     if (PyUnicode_UTF8(unicode))
   5190         return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
   5191                                          PyUnicode_UTF8_LENGTH(unicode));
   5192 
   5193     kind = PyUnicode_KIND(unicode);
   5194     data = PyUnicode_DATA(unicode);
   5195     size = PyUnicode_GET_LENGTH(unicode);
   5196 
   5197     switch (kind) {
   5198     default:
   5199         Py_UNREACHABLE();
   5200     case PyUnicode_1BYTE_KIND:
   5201         /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
   5202         assert(!PyUnicode_IS_ASCII(unicode));
   5203         return ucs1lib_utf8_encoder(unicode, data, size, errors);
   5204     case PyUnicode_2BYTE_KIND:
   5205         return ucs2lib_utf8_encoder(unicode, data, size, errors);
   5206     case PyUnicode_4BYTE_KIND:
   5207         return ucs4lib_utf8_encoder(unicode, data, size, errors);
   5208     }
   5209 }
   5210 
   5211 PyObject *
   5212 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
   5213                      Py_ssize_t size,
   5214                      const char *errors)
   5215 {
   5216     PyObject *v, *unicode;
   5217 
   5218     unicode = PyUnicode_FromWideChar(s, size);
   5219     if (unicode == NULL)
   5220         return NULL;
   5221     v = _PyUnicode_AsUTF8String(unicode, errors);
   5222     Py_DECREF(unicode);
   5223     return v;
   5224 }
   5225 
   5226 PyObject *
   5227 PyUnicode_AsUTF8String(PyObject *unicode)
   5228 {
   5229     return _PyUnicode_AsUTF8String(unicode, NULL);
   5230 }
   5231 
   5232 /* --- UTF-32 Codec ------------------------------------------------------- */
   5233 
   5234 PyObject *
   5235 PyUnicode_DecodeUTF32(const char *s,
   5236                       Py_ssize_t size,
   5237                       const char *errors,
   5238                       int *byteorder)
   5239 {
   5240     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
   5241 }
   5242 
/* Decode 'size' bytes of UTF-32 at 's' into a new unicode object.

   byteorder: optional in/out parameter.  On input, *byteorder < 0 requests
       little-endian, > 0 big-endian and 0 (or a NULL pointer) native order
       with BOM detection.  When detection runs (order 0 and at least 4
       bytes of input), the resulting order is written back to *byteorder.
   errors:    error handler name, passed through to
       unicode_decode_call_errorhandler_writer().
   consumed:  if non-NULL, trailing bytes that do not form a complete
       4-byte unit are not treated as an error; decoding stops there and the
       number of bytes processed is stored in *consumed.

   Returns the result of _PyUnicodeWriter_Finish() on success, or NULL on
   failure. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;   /* current read position / end of input */
    int le, bo = 0;       /* assume native ordering by default */
    const char *encoding;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 4) {
        /* the first four bytes, read as a little-endian value */
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
        if (bom == 0x0000FEFF) {
            /* little-endian BOM */
            bo = -1;
            q += 4;
        }
        else if (bom == 0xFFFE0000) {
            /* big-endian BOM */
            bo = 1;
            q += 4;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing (left) to decode: return the empty string. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* bo == 0 here means "no BOM found": fall back to the host order. */
#ifdef WORDS_BIGENDIAN
    le = bo < 0;
#else
    le = bo <= 0;
#endif
    encoding = le ? "utf-32-le" : "utf-32-be";

    /* One output character per complete or partial 4-byte input unit. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 3) / 4;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        /* largest code point the writer's current buffer can hold */
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);

        if (e - q >= 4) {
            /* Fast path: copy units straight into the writer's buffer for
               as long as they fit (ch <= maxch) and are not surrogates.
               On break, ch holds the unit that stopped the loop and q
               still points at it. */
            enum PyUnicode_Kind kind = writer.kind;
            void *data = writer.data;
            const unsigned char *last = e - 4;
            Py_ssize_t pos = writer.pos;
            if (le) {
                do {
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            else {
                do {
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            writer.pos = pos;
        }

        /* Work out why the fast path stopped (or never ran). */
        if (Py_UNICODE_IS_SURROGATE(ch)) {
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        else if (ch <= maxch) {
            /* Every complete unit was written.  Finish if the input is
               exhausted, or if the caller asked for 'consumed' and will
               deal with the leftover bytes itself. */
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
        }
        else {
            if (ch < 0x110000) {
                /* Valid code point that did not fit below maxch: write it
                   through the writer API instead of the raw buffer, then
                   resume the fast path (maxch is re-read at the top of the
                   loop). */
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                    goto onError;
                q += 4;
                continue;
            }
            errmsg = "code point not in range(0x110000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }

        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        /* starts, e and q are passed by address, so the call may update
           them. */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                encoding, errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &writer))
            goto onError;
    }

    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
   5387 
   5388 PyObject *
   5389 _PyUnicode_EncodeUTF32(PyObject *str,
   5390                        const char *errors,
   5391                        int byteorder)
   5392 {
   5393     enum PyUnicode_Kind kind;
   5394     const void *data;
   5395     Py_ssize_t len;
   5396     PyObject *v;
   5397     uint32_t *out;
   5398 #if PY_LITTLE_ENDIAN
   5399     int native_ordering = byteorder <= 0;
   5400 #else
   5401     int native_ordering = byteorder >= 0;
   5402 #endif
   5403     const char *encoding;
   5404     Py_ssize_t nsize, pos;
   5405     PyObject *errorHandler = NULL;
   5406     PyObject *exc = NULL;
   5407     PyObject *rep = NULL;
   5408 
   5409     if (!PyUnicode_Check(str)) {
   5410         PyErr_BadArgument();
   5411         return NULL;
   5412     }
   5413     if (PyUnicode_READY(str) == -1)
   5414         return NULL;
   5415     kind = PyUnicode_KIND(str);
   5416     data = PyUnicode_DATA(str);
   5417     len = PyUnicode_GET_LENGTH(str);
   5418 
   5419     if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
   5420         return PyErr_NoMemory();
   5421     nsize = len + (byteorder == 0);
   5422     v = PyBytes_FromStringAndSize(NULL, nsize * 4);
   5423     if (v == NULL)
   5424         return NULL;
   5425 
   5426     /* output buffer is 4-bytes aligned */
   5427     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
   5428     out = (uint32_t *)PyBytes_AS_STRING(v);
   5429     if (byteorder == 0)
   5430         *out++ = 0xFEFF;
   5431     if (len == 0)
   5432         goto done;
   5433 
   5434     if (byteorder == -1)
   5435         encoding = "utf-32-le";
   5436     else if (byteorder == 1)
   5437         encoding = "utf-32-be";
   5438     else
   5439         encoding = "utf-32";
   5440 
   5441     if (kind == PyUnicode_1BYTE_KIND) {
   5442         ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
   5443         goto done;
   5444     }
   5445 
   5446     pos = 0;
   5447     while (pos < len) {
   5448         Py_ssize_t repsize, moreunits;
   5449 
   5450         if (kind == PyUnicode_2BYTE_KIND) {
   5451             pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
   5452                                         &out, native_ordering);
   5453         }
   5454         else {
   5455             assert(kind == PyUnicode_4BYTE_KIND);
   5456             pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
   5457                                         &out, native_ordering);
   5458         }
   5459         if (pos == len)
   5460             break;
   5461 
   5462         rep = unicode_encode_call_errorhandler(
   5463                 errors, &errorHandler,
   5464                 encoding, "surrogates not allowed",
   5465                 str, &exc, pos, pos + 1, &pos);
   5466         if (!rep)
   5467             goto error;
   5468 
   5469         if (PyBytes_Check(rep)) {
   5470             repsize = PyBytes_GET_SIZE(rep);
   5471             if (repsize & 3) {
   5472                 raise_encode_exception(&exc, encoding,
   5473                                        str, pos - 1, pos,
   5474                                        "surrogates not allowed");
   5475                 goto error;
   5476             }
   5477             moreunits = repsize / 4;
   5478         }
   5479         else {
   5480             assert(PyUnicode_Check(rep));
   5481             if (PyUnicode_READY(rep) < 0)
   5482                 goto error;
   5483             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
   5484             if (!PyUnicode_IS_ASCII(rep)) {
   5485                 raise_encode_exception(&exc, encoding,
   5486                                        str, pos - 1, pos,
   5487                                        "surrogates not allowed");
   5488                 goto error;
   5489             }
   5490         }
   5491 
   5492         /* four bytes are reserved for each surrogate */
   5493         if (moreunits > 1) {
   5494             Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
   5495             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
   5496                 /* integer overflow */
   5497                 PyErr_NoMemory();
   5498                 goto error;
   5499             }
   5500             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
   5501                 goto error;
   5502             out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
   5503         }
   5504 
   5505         if (PyBytes_Check(rep)) {
   5506             memcpy(out, PyBytes_AS_STRING(rep), repsize);
   5507             out += moreunits;
   5508         } else /* rep is unicode */ {
   5509             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
   5510             ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
   5511                                  &out, native_ordering);
   5512         }
   5513 
   5514         Py_CLEAR(rep);
   5515     }
   5516 
   5517     /* Cut back to size actually needed. This is necessary for, for example,
   5518        encoding of a string containing isolated surrogates and the 'ignore'
   5519        handler is used. */
   5520     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
   5521     if (nsize != PyBytes_GET_SIZE(v))
   5522       _PyBytes_Resize(&v, nsize);
   5523     Py_XDECREF(errorHandler);
   5524     Py_XDECREF(exc);
   5525   done:
   5526     return v;
   5527   error:
   5528     Py_XDECREF(rep);
   5529     Py_XDECREF(errorHandler);
   5530     Py_XDECREF(exc);
   5531     Py_XDECREF(v);
   5532     return NULL;
   5533 }
   5534 
   5535 PyObject *
   5536 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
   5537                       Py_ssize_t size,
   5538                       const char *errors,
   5539                       int byteorder)
   5540 {
   5541     PyObject *result;
   5542     PyObject *tmp = PyUnicode_FromWideChar(s, size);
   5543     if (tmp == NULL)
   5544         return NULL;
   5545     result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
   5546     Py_DECREF(tmp);
   5547     return result;
   5548 }
   5549 
   5550 PyObject *
   5551 PyUnicode_AsUTF32String(PyObject *unicode)
   5552 {
   5553     return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
   5554 }
   5555 
   5556 /* --- UTF-16 Codec ------------------------------------------------------- */
   5557 
   5558 PyObject *
   5559 PyUnicode_DecodeUTF16(const char *s,
   5560                       Py_ssize_t size,
   5561                       const char *errors,
   5562                       int *byteorder)
   5563 {
   5564     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
   5565 }
   5566 
/* Decode 'size' bytes of UTF-16 at 's' into a new unicode object.

   byteorder: optional in/out parameter.  On input, *byteorder < 0 requests
       little-endian, > 0 big-endian and 0 (or a NULL pointer) native order
       with BOM detection.  When detection runs (order 0 and at least 2
       bytes of input), the resulting order is written back to *byteorder.
   errors:    error handler name, passed through to
       unicode_decode_call_errorhandler_writer().
   consumed:  if non-NULL, incomplete data at the end of the input is not
       treated as an error; decoding stops there and the number of bytes
       processed is stored in *consumed.

   Returns the result of _PyUnicodeWriter_Finish() on success, or NULL on
   failure. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;   /* current read position / end of input */
    int bo = 0;       /* assume native ordering by default */
    int native_ordering;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding;

    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 2) {
        /* the first two bytes, read as a little-endian value */
        const Py_UCS4 bom = (q[1] << 8) | q[0];
        if (bom == 0xFEFF) {
            /* little-endian BOM */
            q += 2;
            bo = -1;
        }
        else if (bom == 0xFFFE) {
            /* big-endian BOM */
            q += 2;
            bo = 1;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing (left) to decode: return the empty string. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* bo == 0 here means "no BOM found": fall back to the host order. */
#if PY_LITTLE_ENDIAN
    native_ordering = bo <= 0;
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
#else
    native_ordering = bo >= 0;
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
#endif

    /* Note: size will always be longer than the resulting Unicode
       character count normally.  Error handler will take care of
       resizing when needed. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 1) / 2;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        if (e - q >= 2) {
            /* Pick the decode helper that matches the writer's current
               buffer kind.  The helper advances q and writer.pos and
               returns either a status code (0..3, see the switch below) or
               a code point that must be written through the writer API. */
            int kind = writer.kind;
            if (kind == PyUnicode_1BYTE_KIND) {
                if (PyUnicode_IS_ASCII(writer.buffer))
                    ch = asciilib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
                else
                    ch = ucs1lib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
            } else if (kind == PyUnicode_2BYTE_KIND) {
                ch = ucs2lib_utf16_decode(&q, e,
                        (Py_UCS2*)writer.data, &writer.pos,
                        native_ordering);
            } else {
                assert(kind == PyUnicode_4BYTE_KIND);
                ch = ucs4lib_utf16_decode(&q, e,
                        (Py_UCS4*)writer.data, &writer.pos,
                        native_ordering);
            }
        }

        switch (ch)
        {
        case 0:
            /* remaining byte at the end? (size should be even) */
            if (q == e || consumed)
                goto End;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        case 1:
            /* step back over the last two bytes so that they are reported
               as unconsumed / as the start of the error */
            q -= 2;
            if (consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
        case 2:
            /* the error covers the two bytes just before q */
            errmsg = "illegal encoding";
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = startinpos + 2;
            break;
        case 3:
            /* the error covers the first two of the four bytes before q */
            errmsg = "illegal UTF-16 surrogate";
            startinpos = ((const char *)q) - 4 - starts;
            endinpos = startinpos + 2;
            break;
        default:
            /* a code point the helper did not store itself */
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        /* starts, e and q are passed by address, so the call may update
           them. */
        if (unicode_decode_call_errorhandler_writer(
                errors,
                &errorHandler,
                encoding, errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
   5721 
   5722 PyObject *
   5723 _PyUnicode_EncodeUTF16(PyObject *str,
   5724                        const char *errors,
   5725                        int byteorder)
   5726 {
   5727     enum PyUnicode_Kind kind;
   5728     const void *data;
   5729     Py_ssize_t len;
   5730     PyObject *v;
   5731     unsigned short *out;
   5732     Py_ssize_t pairs;
   5733 #if PY_BIG_ENDIAN
   5734     int native_ordering = byteorder >= 0;
   5735 #else
   5736     int native_ordering = byteorder <= 0;
   5737 #endif
   5738     const char *encoding;
   5739     Py_ssize_t nsize, pos;
   5740     PyObject *errorHandler = NULL;
   5741     PyObject *exc = NULL;
   5742     PyObject *rep = NULL;
   5743 
   5744     if (!PyUnicode_Check(str)) {
   5745         PyErr_BadArgument();
   5746         return NULL;
   5747     }
   5748     if (PyUnicode_READY(str) == -1)
   5749         return NULL;
   5750     kind = PyUnicode_KIND(str);
   5751     data = PyUnicode_DATA(str);
   5752     len = PyUnicode_GET_LENGTH(str);
   5753 
   5754     pairs = 0;
   5755     if (kind == PyUnicode_4BYTE_KIND) {
   5756         const Py_UCS4 *in = (const Py_UCS4 *)data;
   5757         const Py_UCS4 *end = in + len;
   5758         while (in < end) {
   5759             if (*in++ >= 0x10000) {
   5760                 pairs++;
   5761             }
   5762         }
   5763     }
   5764     if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
   5765         return PyErr_NoMemory();
   5766     }
   5767     nsize = len + pairs + (byteorder == 0);
   5768     v = PyBytes_FromStringAndSize(NULL, nsize * 2);
   5769     if (v == NULL) {
   5770         return NULL;
   5771     }
   5772 
   5773     /* output buffer is 2-bytes aligned */
   5774     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
   5775     out = (unsigned short *)PyBytes_AS_STRING(v);
   5776     if (byteorder == 0) {
   5777         *out++ = 0xFEFF;
   5778     }
   5779     if (len == 0) {
   5780         goto done;
   5781     }
   5782 
   5783     if (kind == PyUnicode_1BYTE_KIND) {
   5784         ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
   5785         goto done;
   5786     }
   5787 
   5788     if (byteorder < 0) {
   5789         encoding = "utf-16-le";
   5790     }
   5791     else if (byteorder > 0) {
   5792         encoding = "utf-16-be";
   5793     }
   5794     else {
   5795         encoding = "utf-16";
   5796     }
   5797 
   5798     pos = 0;
   5799     while (pos < len) {
   5800         Py_ssize_t repsize, moreunits;
   5801 
   5802         if (kind == PyUnicode_2BYTE_KIND) {
   5803             pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
   5804                                         &out, native_ordering);
   5805         }
   5806         else {
   5807             assert(kind == PyUnicode_4BYTE_KIND);
   5808             pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
   5809                                         &out, native_ordering);
   5810         }
   5811         if (pos == len)
   5812             break;
   5813 
   5814         rep = unicode_encode_call_errorhandler(
   5815                 errors, &errorHandler,
   5816                 encoding, "surrogates not allowed",
   5817                 str, &exc, pos, pos + 1, &pos);
   5818         if (!rep)
   5819             goto error;
   5820 
   5821         if (PyBytes_Check(rep)) {
   5822             repsize = PyBytes_GET_SIZE(rep);
   5823             if (repsize & 1) {
   5824                 raise_encode_exception(&exc, encoding,
   5825                                        str, pos - 1, pos,
   5826                                        "surrogates not allowed");
   5827                 goto error;
   5828             }
   5829             moreunits = repsize / 2;
   5830         }
   5831         else {
   5832             assert(PyUnicode_Check(rep));
   5833             if (PyUnicode_READY(rep) < 0)
   5834                 goto error;
   5835             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
   5836             if (!PyUnicode_IS_ASCII(rep)) {
   5837                 raise_encode_exception(&exc, encoding,
   5838                                        str, pos - 1, pos,
   5839                                        "surrogates not allowed");
   5840                 goto error;
   5841             }
   5842         }
   5843 
   5844         /* two bytes are reserved for each surrogate */
   5845         if (moreunits > 1) {
   5846             Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
   5847             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
   5848                 /* integer overflow */
   5849                 PyErr_NoMemory();
   5850                 goto error;
   5851             }
   5852             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
   5853                 goto error;
   5854             out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
   5855         }
   5856 
   5857         if (PyBytes_Check(rep)) {
   5858             memcpy(out, PyBytes_AS_STRING(rep), repsize);
   5859             out += moreunits;
   5860         } else /* rep is unicode */ {
   5861             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
   5862             ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
   5863                                  &out, native_ordering);
   5864         }
   5865 
   5866         Py_CLEAR(rep);
   5867     }
   5868 
   5869     /* Cut back to size actually needed. This is necessary for, for example,
   5870     encoding of a string containing isolated surrogates and the 'ignore' handler
   5871     is used. */
   5872     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
   5873     if (nsize != PyBytes_GET_SIZE(v))
   5874       _PyBytes_Resize(&v, nsize);
   5875     Py_XDECREF(errorHandler);
   5876     Py_XDECREF(exc);
   5877   done:
   5878     return v;
   5879   error:
   5880     Py_XDECREF(rep);
   5881     Py_XDECREF(errorHandler);
   5882     Py_XDECREF(exc);
   5883     Py_XDECREF(v);
   5884     return NULL;
   5885 #undef STORECHAR
   5886 }
   5887 
   5888 PyObject *
   5889 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
   5890                       Py_ssize_t size,
   5891                       const char *errors,
   5892                       int byteorder)
   5893 {
   5894     PyObject *result;
   5895     PyObject *tmp = PyUnicode_FromWideChar(s, size);
   5896     if (tmp == NULL)
   5897         return NULL;
   5898     result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
   5899     Py_DECREF(tmp);
   5900     return result;
   5901 }
   5902 
   5903 PyObject *
   5904 PyUnicode_AsUTF16String(PyObject *unicode)
   5905 {
   5906     return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
   5907 }
   5908 
   5909 /* --- Unicode Escape Codec ----------------------------------------------- */
   5910 
/* Cached pointer to the unicodedata name-lookup C API.  It starts out NULL
   and is filled in with PyCapsule_Import() the first time a \N{...} escape
   is decoded. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
   5912 
/* Decode `size` bytes at `s` from the "unicodeescape" encoding.

   Bytes other than a backslash are written through as the code point with
   the same value.  Recognized escapes are backslash-newline (produces no
   output), \\ \' \" \b \f \t \n \r \v \a, one to three octal digits,
   \xXX, \uXXXX, \UXXXXXXXX and \N{name}.  Any other escape is copied
   verbatim (backslash included); *first_invalid_escape is set to point at
   the character following the backslash of the first such escape, and is
   NULL if there was none.

   Malformed escapes are passed to the error handler named by `errors`.

   Returns the result of _PyUnicodeWriter_Finish(), or NULL on failure. */
PyObject *
_PyUnicode_DecodeUnicodeEscape(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               const char **first_invalid_escape)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    // so we can remember if we've seen an invalid escape char or not
    *first_invalid_escape = NULL;

    if (size == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value.
       (but if the error callback returns a long replacement string
       we'll have to allocate more space) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        Py_ssize_t startinpos;
        Py_ssize_t endinpos;
        const char *message;

/* Store an ASCII character directly into the preallocated buffer. */
#define WRITE_ASCII_CHAR(ch)                                                  \
            do {                                                              \
                assert(ch <= 127);                                            \
                assert(writer.pos < writer.size);                             \
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
            } while(0)

/* Store a character directly when it does not exceed writer.maxchar,
   otherwise go through _PyUnicodeWriter_WriteCharInline(). */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals */
        if (c != '\\') {
            WRITE_CHAR(c);
            continue;
        }

        /* Offset of the backslash that introduced this escape. */
        startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            message = "\\ at end of string";
            goto error;
        }
        c = (unsigned char) *s++;

        assert(writer.pos < writer.size);
        switch (c) {

            /* Single-character escapes; backslash-newline produces nothing */
        case '\n': continue;
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
        case '\'': WRITE_ASCII_CHAR('\''); continue;
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
        /* FF */
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
        case 't': WRITE_ASCII_CHAR('\t'); continue;
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
        /* VT */
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
        /* BEL, not classic C */
        case 'a': WRITE_ASCII_CHAR('\007'); continue;

            /* \OOO (octal) escapes: one digit already read, up to two more */
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
            ch = c - '0';
            if (s < end && '0' <= *s && *s <= '7') {
                ch = (ch<<3) + *s++ - '0';
                if (s < end && '0' <= *s && *s <= '7') {
                    ch = (ch<<3) + *s++ - '0';
                }
            }
            WRITE_CHAR(ch);
            continue;

            /* hex escapes: each case sets the digit count and the message
               used if too few digits follow, then shares the loop below */
            /* \xXX */
        case 'x':
            count = 2;
            message = "truncated \\xXX escape";
            goto hexescape;

            /* \uXXXX */
        case 'u':
            count = 4;
            message = "truncated \\uXXXX escape";
            goto hexescape;

            /* \UXXXXXXXX */
        case 'U':
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        hexescape:
            /* Accumulate hex digits; stop early at end of input or at the
               first byte that is not a hex digit. */
            for (ch = 0; count && s < end; ++s, --count) {
                c = (unsigned char)*s;
                ch <<= 4;
                if (c >= '0' && c <= '9') {
                    ch += c - '0';
                }
                else if (c >= 'a' && c <= 'f') {
                    ch += c - ('a' - 10);
                }
                else if (c >= 'A' && c <= 'F') {
                    ch += c - ('A' - 10);
                }
                else {
                    break;
                }
            }
            /* A non-zero count means fewer digits than required were found */
            if (count) {
                goto error;
            }

            /* when we get here, ch is a 32-bit unicode character */
            if (ch > MAX_UNICODE) {
                message = "illegal Unicode character";
                goto error;
            }

            WRITE_CHAR(ch);
            continue;

            /* \N{name} */
        case 'N':
            if (ucnhash_CAPI == NULL) {
                /* load the unicode data module */
                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
                                                PyUnicodeData_CAPSULE_NAME, 1);
                if (ucnhash_CAPI == NULL) {
                    PyErr_SetString(
                        PyExc_UnicodeError,
                        "\\N escapes not supported (can't load unicodedata module)"
                        );
                    goto onError;
                }
            }

            message = "malformed \\N character escape";
            if (s < end && *s == '{') {
                const char *start = ++s;
                size_t namelen;
                /* look for the closing brace */
                while (s < end && *s != '}')
                    s++;
                namelen = s - start;
                /* Require a non-empty name and a closing brace */
                if (namelen && s < end) {
                    /* found a name.  look it up in the unicode database */
                    s++;
                    ch = 0xffffffff; /* in case 'getcode' messes up */
                    if (namelen <= INT_MAX &&
                        ucnhash_CAPI->getcode(NULL, start, (int)namelen,
                                              &ch, 0)) {
                        assert(ch <= MAX_UNICODE);
                        WRITE_CHAR(ch);
                        continue;
                    }
                    message = "unknown Unicode character name";
                }
            }
            goto error;

        default:
            /* Unrecognized escape: remember the first one and copy both
               the backslash and the following character to the output. */
            if (*first_invalid_escape == NULL) {
                *first_invalid_escape = s-1; /* Back up one char, since we've
                                                already incremented s. */
            }
            WRITE_ASCII_CHAR('\\');
            WRITE_CHAR(c);
            continue;
        }

        /* Reached only by goto: report input range [startinpos, endinpos)
           with `message` to the error handler.
           NOTE(review): the handler receives &s and &writer, so it
           presumably repositions s and appends its replacement -- confirm
           in unicode_decode_call_errorhandler_writer. */
      error:
        endinpos = s-starts;
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_ASCII_CHAR
#undef WRITE_CHAR
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    /* Failure: discard the partial output and drop the references held */
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
   6137 
   6138 PyObject *
   6139 PyUnicode_DecodeUnicodeEscape(const char *s,
   6140                               Py_ssize_t size,
   6141                               const char *errors)
   6142 {
   6143     const char *first_invalid_escape;
   6144     PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
   6145                                                       &first_invalid_escape);
   6146     if (result == NULL)
   6147         return NULL;
   6148     if (first_invalid_escape != NULL) {
   6149         if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
   6150                              "invalid escape sequence '\\%c'",
   6151                              (unsigned char)*first_invalid_escape) < 0) {
   6152             Py_DECREF(result);
   6153             return NULL;
   6154         }
   6155     }
   6156     return result;
   6157 }
   6158 
   6159 /* Return a Unicode-Escape string version of the Unicode object. */
   6160 
   6161 PyObject *
   6162 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
   6163 {
   6164     Py_ssize_t i, len;
   6165     PyObject *repr;
   6166     char *p;
   6167     enum PyUnicode_Kind kind;
   6168     void *data;
   6169     Py_ssize_t expandsize;
   6170 
   6171     /* Initial allocation is based on the longest-possible character
   6172        escape.
   6173 
   6174        For UCS1 strings it's '\xxx', 4 bytes per source character.
   6175        For UCS2 strings it's '\uxxxx', 6 bytes per source character.
   6176        For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
   6177     */
   6178 
   6179     if (!PyUnicode_Check(unicode)) {
   6180         PyErr_BadArgument();
   6181         return NULL;
   6182     }
   6183     if (PyUnicode_READY(unicode) == -1) {
   6184         return NULL;
   6185     }
   6186 
   6187     len = PyUnicode_GET_LENGTH(unicode);
   6188     if (len == 0) {
   6189         return PyBytes_FromStringAndSize(NULL, 0);
   6190     }
   6191 
   6192     kind = PyUnicode_KIND(unicode);
   6193     data = PyUnicode_DATA(unicode);
   6194     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
   6195        bytes, and 1 byte characters 4. */
   6196     expandsize = kind * 2 + 2;
   6197     if (len > PY_SSIZE_T_MAX / expandsize) {
   6198         return PyErr_NoMemory();
   6199     }
   6200     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
   6201     if (repr == NULL) {
   6202         return NULL;
   6203     }
   6204 
   6205     p = PyBytes_AS_STRING(repr);
   6206     for (i = 0; i < len; i++) {
   6207         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   6208 
   6209         /* U+0000-U+00ff range */
   6210         if (ch < 0x100) {
   6211             if (ch >= ' ' && ch < 127) {
   6212                 if (ch != '\\') {
   6213                     /* Copy printable US ASCII as-is */
   6214                     *p++ = (char) ch;
   6215                 }
   6216                 /* Escape backslashes */
   6217                 else {
   6218                     *p++ = '\\';
   6219                     *p++ = '\\';
   6220                 }
   6221             }
   6222 
   6223             /* Map special whitespace to '\t', \n', '\r' */
   6224             else if (ch == '\t') {
   6225                 *p++ = '\\';
   6226                 *p++ = 't';
   6227             }
   6228             else if (ch == '\n') {
   6229                 *p++ = '\\';
   6230                 *p++ = 'n';
   6231             }
   6232             else if (ch == '\r') {
   6233                 *p++ = '\\';
   6234                 *p++ = 'r';
   6235             }
   6236 
   6237             /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
   6238             else {
   6239                 *p++ = '\\';
   6240                 *p++ = 'x';
   6241                 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
   6242                 *p++ = Py_hexdigits[ch & 0x000F];
   6243             }
   6244         }
   6245         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
   6246         else if (ch < 0x10000) {
   6247             *p++ = '\\';
   6248             *p++ = 'u';
   6249             *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
   6250             *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
   6251             *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
   6252             *p++ = Py_hexdigits[ch & 0x000F];
   6253         }
   6254         /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
   6255         else {
   6256 
   6257             /* Make sure that the first two digits are zero */
   6258             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
   6259             *p++ = '\\';
   6260             *p++ = 'U';
   6261             *p++ = '0';
   6262             *p++ = '0';
   6263             *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
   6264             *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
   6265             *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
   6266             *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
   6267             *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
   6268             *p++ = Py_hexdigits[ch & 0x0000000F];
   6269         }
   6270     }
   6271 
   6272     assert(p - PyBytes_AS_STRING(repr) > 0);
   6273     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
   6274         return NULL;
   6275     }
   6276     return repr;
   6277 }
   6278 
   6279 PyObject *
   6280 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
   6281                               Py_ssize_t size)
   6282 {
   6283     PyObject *result;
   6284     PyObject *tmp = PyUnicode_FromWideChar(s, size);
   6285     if (tmp == NULL) {
   6286         return NULL;
   6287     }
   6288 
   6289     result = PyUnicode_AsUnicodeEscapeString(tmp);
   6290     Py_DECREF(tmp);
   6291     return result;
   6292 }
   6293 
   6294 /* --- Raw Unicode Escape Codec ------------------------------------------- */
   6295 
   6296 PyObject *
   6297 PyUnicode_DecodeRawUnicodeEscape(const char *s,
   6298                                  Py_ssize_t size,
   6299                                  const char *errors)
   6300 {
   6301     const char *starts = s;
   6302     _PyUnicodeWriter writer;
   6303     const char *end;
   6304     PyObject *errorHandler = NULL;
   6305     PyObject *exc = NULL;
   6306 
   6307     if (size == 0) {
   6308         _Py_RETURN_UNICODE_EMPTY();
   6309     }
   6310 
   6311     /* Escaped strings will always be longer than the resulting
   6312        Unicode string, so we start with size here and then reduce the
   6313        length after conversion to the true value. (But decoding error
   6314        handler might have to resize the string) */
   6315     _PyUnicodeWriter_Init(&writer);
   6316      writer.min_length = size;
   6317     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
   6318         goto onError;
   6319     }
   6320 
   6321     end = s + size;
   6322     while (s < end) {
   6323         unsigned char c = (unsigned char) *s++;
   6324         Py_UCS4 ch;
   6325         int count;
   6326         Py_ssize_t startinpos;
   6327         Py_ssize_t endinpos;
   6328         const char *message;
   6329 
   6330 #define WRITE_CHAR(ch)                                                        \
   6331             do {                                                              \
   6332                 if (ch <= writer.maxchar) {                                   \
   6333                     assert(writer.pos < writer.size);                         \
   6334                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
   6335                 }                                                             \
   6336                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
   6337                     goto onError;                                             \
   6338                 }                                                             \
   6339             } while(0)
   6340 
   6341         /* Non-escape characters are interpreted as Unicode ordinals */
   6342         if (c != '\\' || s >= end) {
   6343             WRITE_CHAR(c);
   6344             continue;
   6345         }
   6346 
   6347         c = (unsigned char) *s++;
   6348         if (c == 'u') {
   6349             count = 4;
   6350             message = "truncated \\uXXXX escape";
   6351         }
   6352         else if (c == 'U') {
   6353             count = 8;
   6354             message = "truncated \\UXXXXXXXX escape";
   6355         }
   6356         else {
   6357             assert(writer.pos < writer.size);
   6358             PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
   6359             WRITE_CHAR(c);
   6360             continue;
   6361         }
   6362         startinpos = s - starts - 2;
   6363 
   6364         /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
   6365         for (ch = 0; count && s < end; ++s, --count) {
   6366             c = (unsigned char)*s;
   6367             ch <<= 4;
   6368             if (c >= '0' && c <= '9') {
   6369                 ch += c - '0';
   6370             }
   6371             else if (c >= 'a' && c <= 'f') {
   6372                 ch += c - ('a' - 10);
   6373             }
   6374             else if (c >= 'A' && c <= 'F') {
   6375                 ch += c - ('A' - 10);
   6376             }
   6377             else {
   6378                 break;
   6379             }
   6380         }
   6381         if (!count) {
   6382             if (ch <= MAX_UNICODE) {
   6383                 WRITE_CHAR(ch);
   6384                 continue;
   6385             }
   6386             message = "\\Uxxxxxxxx out of range";
   6387         }
   6388 
   6389         endinpos = s-starts;
   6390         writer.min_length = end - s + writer.pos;
   6391         if (unicode_decode_call_errorhandler_writer(
   6392                 errors, &errorHandler,
   6393                 "rawunicodeescape", message,
   6394                 &starts, &end, &startinpos, &endinpos, &exc, &s,
   6395                 &writer)) {
   6396             goto onError;
   6397         }
   6398         assert(end - s <= writer.size - writer.pos);
   6399 
   6400 #undef WRITE_CHAR
   6401     }
   6402     Py_XDECREF(errorHandler);
   6403     Py_XDECREF(exc);
   6404     return _PyUnicodeWriter_Finish(&writer);
   6405 
   6406   onError:
   6407     _PyUnicodeWriter_Dealloc(&writer);
   6408     Py_XDECREF(errorHandler);
   6409     Py_XDECREF(exc);
   6410     return NULL;
   6411 
   6412 }
   6413 
   6414 
   6415 PyObject *
   6416 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
   6417 {
   6418     PyObject *repr;
   6419     char *p;
   6420     Py_ssize_t expandsize, pos;
   6421     int kind;
   6422     void *data;
   6423     Py_ssize_t len;
   6424 
   6425     if (!PyUnicode_Check(unicode)) {
   6426         PyErr_BadArgument();
   6427         return NULL;
   6428     }
   6429     if (PyUnicode_READY(unicode) == -1) {
   6430         return NULL;
   6431     }
   6432     kind = PyUnicode_KIND(unicode);
   6433     data = PyUnicode_DATA(unicode);
   6434     len = PyUnicode_GET_LENGTH(unicode);
   6435     if (kind == PyUnicode_1BYTE_KIND) {
   6436         return PyBytes_FromStringAndSize(data, len);
   6437     }
   6438 
   6439     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
   6440        bytes, and 1 byte characters 4. */
   6441     expandsize = kind * 2 + 2;
   6442 
   6443     if (len > PY_SSIZE_T_MAX / expandsize) {
   6444         return PyErr_NoMemory();
   6445     }
   6446     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
   6447     if (repr == NULL) {
   6448         return NULL;
   6449     }
   6450     if (len == 0) {
   6451         return repr;
   6452     }
   6453 
   6454     p = PyBytes_AS_STRING(repr);
   6455     for (pos = 0; pos < len; pos++) {
   6456         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
   6457 
   6458         /* U+0000-U+00ff range: Copy 8-bit characters as-is */
   6459         if (ch < 0x100) {
   6460             *p++ = (char) ch;
   6461         }
   6462         /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
   6463         else if (ch < 0x10000) {
   6464             *p++ = '\\';
   6465             *p++ = 'u';
   6466             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
   6467             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
   6468             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
   6469             *p++ = Py_hexdigits[ch & 15];
   6470         }
   6471         /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
   6472         else {
   6473             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
   6474             *p++ = '\\';
   6475             *p++ = 'U';
   6476             *p++ = '0';
   6477             *p++ = '0';
   6478             *p++ = Py_hexdigits[(ch >> 20) & 0xf];
   6479             *p++ = Py_hexdigits[(ch >> 16) & 0xf];
   6480             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
   6481             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
   6482             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
   6483             *p++ = Py_hexdigits[ch & 15];
   6484         }
   6485     }
   6486 
   6487     assert(p > PyBytes_AS_STRING(repr));
   6488     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
   6489         return NULL;
   6490     }
   6491     return repr;
   6492 }
   6493 
   6494 PyObject *
   6495 PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
   6496                                  Py_ssize_t size)
   6497 {
   6498     PyObject *result;
   6499     PyObject *tmp = PyUnicode_FromWideChar(s, size);
   6500     if (tmp == NULL)
   6501         return NULL;
   6502     result = PyUnicode_AsRawUnicodeEscapeString(tmp);
   6503     Py_DECREF(tmp);
   6504     return result;
   6505 }
   6506 
   6507 /* --- Unicode Internal Codec ------------------------------------------- */
   6508 
/* Decode `size` bytes at `s` as raw Py_UNICODE units (the deprecated
   "unicode_internal" codec).

   A DeprecationWarning is issued first; NULL is returned if PyErr_WarnEx()
   reports an error.  A negative `size` is treated as a bad internal call.
   Trailing input shorter than one Py_UNICODE and, when Py_UNICODE_WIDE is
   defined, values above 0x10FFFF are passed to the error handler named by
   `errors`.  When Py_UNICODE_WIDE is not defined, a high surrogate
   immediately followed by a low surrogate is joined into one code point.

   Returns the result of _PyUnicodeWriter_Finish(), or NULL on failure. */
PyObject *
_PyUnicode_DecodeUnicodeInternal(const char *s,
                                 Py_ssize_t size,
                                 const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const char *end;
    const char *reason;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (PyErr_WarnEx(PyExc_DeprecationWarning,
                     "unicode_internal codec has been deprecated",
                     1))
        return NULL;

    if (size < 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

    _PyUnicodeWriter_Init(&writer);
    /* Guard the unit count before it is used as the minimum length below */
    if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
        PyErr_NoMemory();
        goto onError;
    }
    /* Number of Py_UNICODE units in the input, rounded up */
    writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;

    end = s + size;
    while (s < end) {
        Py_UNICODE uch;
        Py_UCS4 ch;
        /* Fewer bytes left than one full unit */
        if (end - s < Py_UNICODE_SIZE) {
            endinpos = end-starts;
            reason = "truncated input";
            goto error;
        }
        /* We copy the raw representation one byte at a time because the
           pointer may be unaligned (see test_codeccallbacks). */
        ((char *) &uch)[0] = s[0];
        ((char *) &uch)[1] = s[1];
#ifdef Py_UNICODE_WIDE
        ((char *) &uch)[2] = s[2];
        ((char *) &uch)[3] = s[3];
#endif
        ch = uch;
#ifdef Py_UNICODE_WIDE
        /* We have to sanity check the raw data, otherwise doom looms for
           some malformed UCS-4 data. */
        if (ch > 0x10ffff) {
            endinpos = s - starts + Py_UNICODE_SIZE;
            reason = "illegal code point (> 0x10FFFF)";
            goto error;
        }
#endif
        s += Py_UNICODE_SIZE;
#ifndef Py_UNICODE_WIDE
        /* Peek at the next unit: if it completes a surrogate pair, consume
           it too and write the joined code point instead. */
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
        {
            Py_UNICODE uch2;
            ((char *) &uch2)[0] = s[0];
            ((char *) &uch2)[1] = s[1];
            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
            {
                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
                s += Py_UNICODE_SIZE;
            }
        }
#endif

        if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
            goto onError;
        continue;

        /* Reached only by goto.  s has not been advanced past the
           offending unit, so it marks the start of the bad range;
           endinpos and reason were set at the goto site. */
  error:
        startinpos = s - starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicode_internal", reason,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    /* Failure: discard the partial output and drop the references held */
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
   6608 
   6609 /* --- Latin-1 Codec ------------------------------------------------------ */
   6610 
   6611 PyObject *
   6612 PyUnicode_DecodeLatin1(const char *s,
   6613                        Py_ssize_t size,
   6614                        const char *errors)
   6615 {
   6616     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
   6617     return _PyUnicode_FromUCS1((unsigned char*)s, size);
   6618 }
   6619 
   6620 /* create or adjust a UnicodeEncodeError */
   6621 static void
   6622 make_encode_exception(PyObject **exceptionObject,
   6623                       const char *encoding,
   6624                       PyObject *unicode,
   6625                       Py_ssize_t startpos, Py_ssize_t endpos,
   6626                       const char *reason)
   6627 {
   6628     if (*exceptionObject == NULL) {
   6629         *exceptionObject = PyObject_CallFunction(
   6630             PyExc_UnicodeEncodeError, "sOnns",
   6631             encoding, unicode, startpos, endpos, reason);
   6632     }
   6633     else {
   6634         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
   6635             goto onError;
   6636         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
   6637             goto onError;
   6638         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
   6639             goto onError;
   6640         return;
   6641       onError:
   6642         Py_CLEAR(*exceptionObject);
   6643     }
   6644 }
   6645 
   6646 /* raises a UnicodeEncodeError */
   6647 static void
   6648 raise_encode_exception(PyObject **exceptionObject,
   6649                        const char *encoding,
   6650                        PyObject *unicode,
   6651                        Py_ssize_t startpos, Py_ssize_t endpos,
   6652                        const char *reason)
   6653 {
   6654     make_encode_exception(exceptionObject,
   6655                           encoding, unicode, startpos, endpos, reason);
   6656     if (*exceptionObject != NULL)
   6657         PyCodec_StrictErrors(*exceptionObject);
   6658 }
   6659 
   6660 /* error handling callback helper:
   6661    build arguments, call the callback and check the arguments,
   6662    put the result into newpos and return the replacement string, which
   6663    has to be freed by the caller */
   6664 static PyObject *
   6665 unicode_encode_call_errorhandler(const char *errors,
   6666                                  PyObject **errorHandler,
   6667                                  const char *encoding, const char *reason,
   6668                                  PyObject *unicode, PyObject **exceptionObject,
   6669                                  Py_ssize_t startpos, Py_ssize_t endpos,
   6670                                  Py_ssize_t *newpos)
   6671 {
   6672     static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
   6673     Py_ssize_t len;
   6674     PyObject *restuple;
   6675     PyObject *resunicode;
   6676 
   6677     if (*errorHandler == NULL) {
   6678         *errorHandler = PyCodec_LookupError(errors);
   6679         if (*errorHandler == NULL)
   6680             return NULL;
   6681     }
   6682 
   6683     if (PyUnicode_READY(unicode) == -1)
   6684         return NULL;
   6685     len = PyUnicode_GET_LENGTH(unicode);
   6686 
   6687     make_encode_exception(exceptionObject,
   6688                           encoding, unicode, startpos, endpos, reason);
   6689     if (*exceptionObject == NULL)
   6690         return NULL;
   6691 
   6692     restuple = PyObject_CallFunctionObjArgs(
   6693         *errorHandler, *exceptionObject, NULL);
   6694     if (restuple == NULL)
   6695         return NULL;
   6696     if (!PyTuple_Check(restuple)) {
   6697         PyErr_SetString(PyExc_TypeError, &argparse[3]);
   6698         Py_DECREF(restuple);
   6699         return NULL;
   6700     }
   6701     if (!PyArg_ParseTuple(restuple, argparse,
   6702                           &resunicode, newpos)) {
   6703         Py_DECREF(restuple);
   6704         return NULL;
   6705     }
   6706     if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
   6707         PyErr_SetString(PyExc_TypeError, &argparse[3]);
   6708         Py_DECREF(restuple);
   6709         return NULL;
   6710     }
   6711     if (*newpos<0)
   6712         *newpos = len + *newpos;
   6713     if (*newpos<0 || *newpos>len) {
   6714         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
   6715         Py_DECREF(restuple);
   6716         return NULL;
   6717     }
   6718     Py_INCREF(resunicode);
   6719     Py_DECREF(restuple);
   6720     return resunicode;
   6721 }
   6722 
   6723 static PyObject *
   6724 unicode_encode_ucs1(PyObject *unicode,
   6725                     const char *errors,
   6726                     const Py_UCS4 limit)
   6727 {
   6728     /* input state */
   6729     Py_ssize_t pos=0, size;
   6730     int kind;
   6731     void *data;
   6732     /* pointer into the output */
   6733     char *str;
   6734     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
   6735     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
   6736     PyObject *error_handler_obj = NULL;
   6737     PyObject *exc = NULL;
   6738     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
   6739     PyObject *rep = NULL;
   6740     /* output object */
   6741     _PyBytesWriter writer;
   6742 
   6743     if (PyUnicode_READY(unicode) == -1)
   6744         return NULL;
   6745     size = PyUnicode_GET_LENGTH(unicode);
   6746     kind = PyUnicode_KIND(unicode);
   6747     data = PyUnicode_DATA(unicode);
   6748     /* allocate enough for a simple encoding without
   6749        replacements, if we need more, we'll resize */
   6750     if (size == 0)
   6751         return PyBytes_FromStringAndSize(NULL, 0);
   6752 
   6753     _PyBytesWriter_Init(&writer);
   6754     str = _PyBytesWriter_Alloc(&writer, size);
   6755     if (str == NULL)
   6756         return NULL;
   6757 
   6758     while (pos < size) {
   6759         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
   6760 
   6761         /* can we encode this? */
   6762         if (ch < limit) {
   6763             /* no overflow check, because we know that the space is enough */
   6764             *str++ = (char)ch;
   6765             ++pos;
   6766         }
   6767         else {
   6768             Py_ssize_t newpos, i;
   6769             /* startpos for collecting unencodable chars */
   6770             Py_ssize_t collstart = pos;
   6771             Py_ssize_t collend = collstart + 1;
   6772             /* find all unecodable characters */
   6773 
   6774             while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
   6775                 ++collend;
   6776 
   6777             /* Only overallocate the buffer if it's not the last write */
   6778             writer.overallocate = (collend < size);
   6779 
   6780             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
   6781             if (error_handler == _Py_ERROR_UNKNOWN)
   6782                 error_handler = get_error_handler(errors);
   6783 
   6784             switch (error_handler) {
   6785             case _Py_ERROR_STRICT:
   6786                 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
   6787                 goto onError;
   6788 
   6789             case _Py_ERROR_REPLACE:
   6790                 memset(str, '?', collend - collstart);
   6791                 str += (collend - collstart);
   6792                 /* fall through */
   6793             case _Py_ERROR_IGNORE:
   6794                 pos = collend;
   6795                 break;
   6796 
   6797             case _Py_ERROR_BACKSLASHREPLACE:
   6798                 /* subtract preallocated bytes */
   6799                 writer.min_size -= (collend - collstart);
   6800                 str = backslashreplace(&writer, str,
   6801                                        unicode, collstart, collend);
   6802                 if (str == NULL)
   6803                     goto onError;
   6804                 pos = collend;
   6805                 break;
   6806 
   6807             case _Py_ERROR_XMLCHARREFREPLACE:
   6808                 /* subtract preallocated bytes */
   6809                 writer.min_size -= (collend - collstart);
   6810                 str = xmlcharrefreplace(&writer, str,
   6811                                         unicode, collstart, collend);
   6812                 if (str == NULL)
   6813                     goto onError;
   6814                 pos = collend;
   6815                 break;
   6816 
   6817             case _Py_ERROR_SURROGATEESCAPE:
   6818                 for (i = collstart; i < collend; ++i) {
   6819                     ch = PyUnicode_READ(kind, data, i);
   6820                     if (ch < 0xdc80 || 0xdcff < ch) {
   6821                         /* Not a UTF-8b surrogate */
   6822                         break;
   6823                     }
   6824                     *str++ = (char)(ch - 0xdc00);
   6825                     ++pos;
   6826                 }
   6827                 if (i >= collend)
   6828                     break;
   6829                 collstart = pos;
   6830                 assert(collstart != collend);
   6831                 /* fall through */
   6832 
   6833             default:
   6834                 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
   6835                                                        encoding, reason, unicode, &exc,
   6836                                                        collstart, collend, &newpos);
   6837                 if (rep == NULL)
   6838                     goto onError;
   6839 
   6840                 /* subtract preallocated bytes */
   6841                 writer.min_size -= newpos - collstart;
   6842 
   6843                 if (PyBytes_Check(rep)) {
   6844                     /* Directly copy bytes result to output. */
   6845                     str = _PyBytesWriter_WriteBytes(&writer, str,
   6846                                                     PyBytes_AS_STRING(rep),
   6847                                                     PyBytes_GET_SIZE(rep));
   6848                 }
   6849                 else {
   6850                     assert(PyUnicode_Check(rep));
   6851 
   6852                     if (PyUnicode_READY(rep) < 0)
   6853                         goto onError;
   6854 
   6855                     if (limit == 256 ?
   6856                         PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
   6857                         !PyUnicode_IS_ASCII(rep))
   6858                     {
   6859                         /* Not all characters are smaller than limit */
   6860                         raise_encode_exception(&exc, encoding, unicode,
   6861                                                collstart, collend, reason);
   6862                         goto onError;
   6863                     }
   6864                     assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
   6865                     str = _PyBytesWriter_WriteBytes(&writer, str,
   6866                                                     PyUnicode_DATA(rep),
   6867                                                     PyUnicode_GET_LENGTH(rep));
   6868                 }
   6869                 if (str == NULL)
   6870                     goto onError;
   6871 
   6872                 pos = newpos;
   6873                 Py_CLEAR(rep);
   6874             }
   6875 
   6876             /* If overallocation was disabled, ensure that it was the last
   6877                write. Otherwise, we missed an optimization */
   6878             assert(writer.overallocate || pos == size);
   6879         }
   6880     }
   6881 
   6882     Py_XDECREF(error_handler_obj);
   6883     Py_XDECREF(exc);
   6884     return _PyBytesWriter_Finish(&writer, str);
   6885 
   6886   onError:
   6887     Py_XDECREF(rep);
   6888     _PyBytesWriter_Dealloc(&writer);
   6889     Py_XDECREF(error_handler_obj);
   6890     Py_XDECREF(exc);
   6891     return NULL;
   6892 }
   6893 
   6894 /* Deprecated */
   6895 PyObject *
   6896 PyUnicode_EncodeLatin1(const Py_UNICODE *p,
   6897                        Py_ssize_t size,
   6898                        const char *errors)
   6899 {
   6900     PyObject *result;
   6901     PyObject *unicode = PyUnicode_FromWideChar(p, size);
   6902     if (unicode == NULL)
   6903         return NULL;
   6904     result = unicode_encode_ucs1(unicode, errors, 256);
   6905     Py_DECREF(unicode);
   6906     return result;
   6907 }
   6908 
   6909 PyObject *
   6910 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
   6911 {
   6912     if (!PyUnicode_Check(unicode)) {
   6913         PyErr_BadArgument();
   6914         return NULL;
   6915     }
   6916     if (PyUnicode_READY(unicode) == -1)
   6917         return NULL;
   6918     /* Fast path: if it is a one-byte string, construct
   6919        bytes object directly. */
   6920     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
   6921         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
   6922                                          PyUnicode_GET_LENGTH(unicode));
   6923     /* Non-Latin-1 characters present. Defer to above function to
   6924        raise the exception. */
   6925     return unicode_encode_ucs1(unicode, errors, 256);
   6926 }
   6927 
   6928 PyObject*
   6929 PyUnicode_AsLatin1String(PyObject *unicode)
   6930 {
   6931     return _PyUnicode_AsLatin1String(unicode, NULL);
   6932 }
   6933 
   6934 /* --- 7-bit ASCII Codec -------------------------------------------------- */
   6935 
   6936 PyObject *
   6937 PyUnicode_DecodeASCII(const char *s,
   6938                       Py_ssize_t size,
   6939                       const char *errors)
   6940 {
   6941     const char *starts = s;
   6942     _PyUnicodeWriter writer;
   6943     int kind;
   6944     void *data;
   6945     Py_ssize_t startinpos;
   6946     Py_ssize_t endinpos;
   6947     Py_ssize_t outpos;
   6948     const char *e;
   6949     PyObject *error_handler_obj = NULL;
   6950     PyObject *exc = NULL;
   6951     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
   6952 
   6953     if (size == 0)
   6954         _Py_RETURN_UNICODE_EMPTY();
   6955 
   6956     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
   6957     if (size == 1 && (unsigned char)s[0] < 128)
   6958         return get_latin1_char((unsigned char)s[0]);
   6959 
   6960     _PyUnicodeWriter_Init(&writer);
   6961     writer.min_length = size;
   6962     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
   6963         return NULL;
   6964 
   6965     e = s + size;
   6966     data = writer.data;
   6967     outpos = ascii_decode(s, e, (Py_UCS1 *)data);
   6968     writer.pos = outpos;
   6969     if (writer.pos == size)
   6970         return _PyUnicodeWriter_Finish(&writer);
   6971 
   6972     s += writer.pos;
   6973     kind = writer.kind;
   6974     while (s < e) {
   6975         unsigned char c = (unsigned char)*s;
   6976         if (c < 128) {
   6977             PyUnicode_WRITE(kind, data, writer.pos, c);
   6978             writer.pos++;
   6979             ++s;
   6980             continue;
   6981         }
   6982 
   6983         /* byte outsize range 0x00..0x7f: call the error handler */
   6984 
   6985         if (error_handler == _Py_ERROR_UNKNOWN)
   6986             error_handler = get_error_handler(errors);
   6987 
   6988         switch (error_handler)
   6989         {
   6990         case _Py_ERROR_REPLACE:
   6991         case _Py_ERROR_SURROGATEESCAPE:
   6992             /* Fast-path: the error handler only writes one character,
   6993                but we may switch to UCS2 at the first write */
   6994             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
   6995                 goto onError;
   6996             kind = writer.kind;
   6997             data = writer.data;
   6998 
   6999             if (error_handler == _Py_ERROR_REPLACE)
   7000                 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
   7001             else
   7002                 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
   7003             writer.pos++;
   7004             ++s;
   7005             break;
   7006 
   7007         case _Py_ERROR_IGNORE:
   7008             ++s;
   7009             break;
   7010 
   7011         default:
   7012             startinpos = s-starts;
   7013             endinpos = startinpos + 1;
   7014             if (unicode_decode_call_errorhandler_writer(
   7015                     errors, &error_handler_obj,
   7016                     "ascii", "ordinal not in range(128)",
   7017                     &starts, &e, &startinpos, &endinpos, &exc, &s,
   7018                     &writer))
   7019                 goto onError;
   7020             kind = writer.kind;
   7021             data = writer.data;
   7022         }
   7023     }
   7024     Py_XDECREF(error_handler_obj);
   7025     Py_XDECREF(exc);
   7026     return _PyUnicodeWriter_Finish(&writer);
   7027 
   7028   onError:
   7029     _PyUnicodeWriter_Dealloc(&writer);
   7030     Py_XDECREF(error_handler_obj);
   7031     Py_XDECREF(exc);
   7032     return NULL;
   7033 }
   7034 
   7035 /* Deprecated */
   7036 PyObject *
   7037 PyUnicode_EncodeASCII(const Py_UNICODE *p,
   7038                       Py_ssize_t size,
   7039                       const char *errors)
   7040 {
   7041     PyObject *result;
   7042     PyObject *unicode = PyUnicode_FromWideChar(p, size);
   7043     if (unicode == NULL)
   7044         return NULL;
   7045     result = unicode_encode_ucs1(unicode, errors, 128);
   7046     Py_DECREF(unicode);
   7047     return result;
   7048 }
   7049 
   7050 PyObject *
   7051 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
   7052 {
   7053     if (!PyUnicode_Check(unicode)) {
   7054         PyErr_BadArgument();
   7055         return NULL;
   7056     }
   7057     if (PyUnicode_READY(unicode) == -1)
   7058         return NULL;
   7059     /* Fast path: if it is an ASCII-only string, construct bytes object
   7060        directly. Else defer to above function to raise the exception. */
   7061     if (PyUnicode_IS_ASCII(unicode))
   7062         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
   7063                                          PyUnicode_GET_LENGTH(unicode));
   7064     return unicode_encode_ucs1(unicode, errors, 128);
   7065 }
   7066 
   7067 PyObject *
   7068 PyUnicode_AsASCIIString(PyObject *unicode)
   7069 {
   7070     return _PyUnicode_AsASCIIString(unicode, NULL);
   7071 }
   7072 
   7073 #ifdef MS_WINDOWS
   7074 
   7075 /* --- MBCS codecs for Windows -------------------------------------------- */
   7076 
   7077 #if SIZEOF_INT < SIZEOF_SIZE_T
   7078 #define NEED_RETRY
   7079 #endif
   7080 
   7081 #ifndef WC_ERR_INVALID_CHARS
   7082 #  define WC_ERR_INVALID_CHARS 0x0080
   7083 #endif
   7084 
   7085 static const char*
   7086 code_page_name(UINT code_page, PyObject **obj)
   7087 {
   7088     *obj = NULL;
   7089     if (code_page == CP_ACP)
   7090         return "mbcs";
   7091     if (code_page == CP_UTF7)
   7092         return "CP_UTF7";
   7093     if (code_page == CP_UTF8)
   7094         return "CP_UTF8";
   7095 
   7096     *obj = PyBytes_FromFormat("cp%u", code_page);
   7097     if (*obj == NULL)
   7098         return NULL;
   7099     return PyBytes_AS_STRING(*obj);
   7100 }
   7101 
   7102 static DWORD
   7103 decode_code_page_flags(UINT code_page)
   7104 {
   7105     if (code_page == CP_UTF7) {
   7106         /* The CP_UTF7 decoder only supports flags=0 */
   7107         return 0;
   7108     }
   7109     else
   7110         return MB_ERR_INVALID_CHARS;
   7111 }
   7112 
   7113 /*
   7114  * Decode a byte string from a Windows code page into unicode object in strict
   7115  * mode.
   7116  *
   7117  * Returns consumed size if succeed, returns -2 on decode error, or raise an
   7118  * OSError and returns -1 on other error.
   7119  */
   7120 static int
   7121 decode_code_page_strict(UINT code_page,
   7122                         PyObject **v,
   7123                         const char *in,
   7124                         int insize)
   7125 {
   7126     const DWORD flags = decode_code_page_flags(code_page);
   7127     wchar_t *out;
   7128     DWORD outsize;
   7129 
   7130     /* First get the size of the result */
   7131     assert(insize > 0);
   7132     outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
   7133     if (outsize <= 0)
   7134         goto error;
   7135 
   7136     if (*v == NULL) {
   7137         /* Create unicode object */
   7138         /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
   7139         *v = (PyObject*)_PyUnicode_New(outsize);
   7140         if (*v == NULL)
   7141             return -1;
   7142         out = PyUnicode_AS_UNICODE(*v);
   7143     }
   7144     else {
   7145         /* Extend unicode object */
   7146         Py_ssize_t n = PyUnicode_GET_SIZE(*v);
   7147         if (unicode_resize(v, n + outsize) < 0)
   7148             return -1;
   7149         out = PyUnicode_AS_UNICODE(*v) + n;
   7150     }
   7151 
   7152     /* Do the conversion */
   7153     outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
   7154     if (outsize <= 0)
   7155         goto error;
   7156     return insize;
   7157 
   7158 error:
   7159     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
   7160         return -2;
   7161     PyErr_SetFromWindowsErr(0);
   7162     return -1;
   7163 }
   7164 
   7165 /*
   7166  * Decode a byte string from a code page into unicode object with an error
   7167  * handler.
   7168  *
   7169  * Returns consumed size if succeed, or raise an OSError or
   7170  * UnicodeDecodeError exception and returns -1 on error.
   7171  */
   7172 static int
   7173 decode_code_page_errors(UINT code_page,
   7174                         PyObject **v,
   7175                         const char *in, const int size,
   7176                         const char *errors, int final)
   7177 {
   7178     const char *startin = in;
   7179     const char *endin = in + size;
   7180     const DWORD flags = decode_code_page_flags(code_page);
   7181     /* Ideally, we should get reason from FormatMessage. This is the Windows
   7182        2000 English version of the message. */
   7183     const char *reason = "No mapping for the Unicode character exists "
   7184                          "in the target code page.";
   7185     /* each step cannot decode more than 1 character, but a character can be
   7186        represented as a surrogate pair */
   7187     wchar_t buffer[2], *out;
   7188     int insize;
   7189     Py_ssize_t outsize;
   7190     PyObject *errorHandler = NULL;
   7191     PyObject *exc = NULL;
   7192     PyObject *encoding_obj = NULL;
   7193     const char *encoding;
   7194     DWORD err;
   7195     int ret = -1;
   7196 
   7197     assert(size > 0);
   7198 
   7199     encoding = code_page_name(code_page, &encoding_obj);
   7200     if (encoding == NULL)
   7201         return -1;
   7202 
   7203     if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
   7204         /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
   7205            UnicodeDecodeError. */
   7206         make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
   7207         if (exc != NULL) {
   7208             PyCodec_StrictErrors(exc);
   7209             Py_CLEAR(exc);
   7210         }
   7211         goto error;
   7212     }
   7213 
   7214     if (*v == NULL) {
   7215         /* Create unicode object */
   7216         if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
   7217             PyErr_NoMemory();
   7218             goto error;
   7219         }
   7220         /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
   7221         *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
   7222         if (*v == NULL)
   7223             goto error;
   7224         out = PyUnicode_AS_UNICODE(*v);
   7225     }
   7226     else {
   7227         /* Extend unicode object */
   7228         Py_ssize_t n = PyUnicode_GET_SIZE(*v);
   7229         if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
   7230             PyErr_NoMemory();
   7231             goto error;
   7232         }
   7233         if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
   7234             goto error;
   7235         out = PyUnicode_AS_UNICODE(*v) + n;
   7236     }
   7237 
   7238     /* Decode the byte string character per character */
   7239     while (in < endin)
   7240     {
   7241         /* Decode a character */
   7242         insize = 1;
   7243         do
   7244         {
   7245             outsize = MultiByteToWideChar(code_page, flags,
   7246                                           in, insize,
   7247                                           buffer, Py_ARRAY_LENGTH(buffer));
   7248             if (outsize > 0)
   7249                 break;
   7250             err = GetLastError();
   7251             if (err != ERROR_NO_UNICODE_TRANSLATION
   7252                 && err != ERROR_INSUFFICIENT_BUFFER)
   7253             {
   7254                 PyErr_SetFromWindowsErr(0);
   7255                 goto error;
   7256             }
   7257             insize++;
   7258         }
   7259         /* 4=maximum length of a UTF-8 sequence */
   7260         while (insize <= 4 && (in + insize) <= endin);
   7261 
   7262         if (outsize <= 0) {
   7263             Py_ssize_t startinpos, endinpos, outpos;
   7264 
   7265             /* last character in partial decode? */
   7266             if (in + insize >= endin && !final)
   7267                 break;
   7268 
   7269             startinpos = in - startin;
   7270             endinpos = startinpos + 1;
   7271             outpos = out - PyUnicode_AS_UNICODE(*v);
   7272             if (unicode_decode_call_errorhandler_wchar(
   7273                     errors, &errorHandler,
   7274                     encoding, reason,
   7275                     &startin, &endin, &startinpos, &endinpos, &exc, &in,
   7276                     v, &outpos))
   7277             {
   7278                 goto error;
   7279             }
   7280             out = PyUnicode_AS_UNICODE(*v) + outpos;
   7281         }
   7282         else {
   7283             in += insize;
   7284             memcpy(out, buffer, outsize * sizeof(wchar_t));
   7285             out += outsize;
   7286         }
   7287     }
   7288 
   7289     /* write a NUL character at the end */
   7290     *out = 0;
   7291 
   7292     /* Extend unicode object */
   7293     outsize = out - PyUnicode_AS_UNICODE(*v);
   7294     assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
   7295     if (unicode_resize(v, outsize) < 0)
   7296         goto error;
   7297     /* (in - startin) <= size and size is an int */
   7298     ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
   7299 
   7300 error:
   7301     Py_XDECREF(encoding_obj);
   7302     Py_XDECREF(errorHandler);
   7303     Py_XDECREF(exc);
   7304     return ret;
   7305 }
   7306 
   7307 static PyObject *
   7308 decode_code_page_stateful(int code_page,
   7309                           const char *s, Py_ssize_t size,
   7310                           const char *errors, Py_ssize_t *consumed)
   7311 {
   7312     PyObject *v = NULL;
   7313     int chunk_size, final, converted, done;
   7314 
   7315     if (code_page < 0) {
   7316         PyErr_SetString(PyExc_ValueError, "invalid code page number");
   7317         return NULL;
   7318     }
   7319     if (size < 0) {
   7320         PyErr_BadInternalCall();
   7321         return NULL;
   7322     }
   7323 
   7324     if (consumed)
   7325         *consumed = 0;
   7326 
   7327     do
   7328     {
   7329 #ifdef NEED_RETRY
   7330         if (size > INT_MAX) {
   7331             chunk_size = INT_MAX;
   7332             final = 0;
   7333             done = 0;
   7334         }
   7335         else
   7336 #endif
   7337         {
   7338             chunk_size = (int)size;
   7339             final = (consumed == NULL);
   7340             done = 1;
   7341         }
   7342 
   7343         if (chunk_size == 0 && done) {
   7344             if (v != NULL)
   7345                 break;
   7346             _Py_RETURN_UNICODE_EMPTY();
   7347         }
   7348 
   7349         converted = decode_code_page_strict(code_page, &v,
   7350                                             s, chunk_size);
   7351         if (converted == -2)
   7352             converted = decode_code_page_errors(code_page, &v,
   7353                                                 s, chunk_size,
   7354                                                 errors, final);
   7355         assert(converted != 0 || done);
   7356 
   7357         if (converted < 0) {
   7358             Py_XDECREF(v);
   7359             return NULL;
   7360         }
   7361 
   7362         if (consumed)
   7363             *consumed += converted;
   7364 
   7365         s += converted;
   7366         size -= converted;
   7367     } while (!done);
   7368 
   7369     return unicode_result(v);
   7370 }
   7371 
   7372 PyObject *
   7373 PyUnicode_DecodeCodePageStateful(int code_page,
   7374                                  const char *s,
   7375                                  Py_ssize_t size,
   7376                                  const char *errors,
   7377                                  Py_ssize_t *consumed)
   7378 {
   7379     return decode_code_page_stateful(code_page, s, size, errors, consumed);
   7380 }
   7381 
   7382 PyObject *
   7383 PyUnicode_DecodeMBCSStateful(const char *s,
   7384                              Py_ssize_t size,
   7385                              const char *errors,
   7386                              Py_ssize_t *consumed)
   7387 {
   7388     return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
   7389 }
   7390 
   7391 PyObject *
   7392 PyUnicode_DecodeMBCS(const char *s,
   7393                      Py_ssize_t size,
   7394                      const char *errors)
   7395 {
   7396     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
   7397 }
   7398 
   7399 static DWORD
   7400 encode_code_page_flags(UINT code_page, const char *errors)
   7401 {
   7402     if (code_page == CP_UTF8) {
   7403         return WC_ERR_INVALID_CHARS;
   7404     }
   7405     else if (code_page == CP_UTF7) {
   7406         /* CP_UTF7 only supports flags=0 */
   7407         return 0;
   7408     }
   7409     else {
   7410         if (errors != NULL && strcmp(errors, "replace") == 0)
   7411             return 0;
   7412         else
   7413             return WC_NO_BEST_FIT_CHARS;
   7414     }
   7415 }
   7416 
   7417 /*
   7418  * Encode a Unicode string to a Windows code page into a byte string in strict
   7419  * mode.
   7420  *
   7421  * Returns consumed characters if succeed, returns -2 on encode error, or raise
   7422  * an OSError and returns -1 on other error.
   7423  */
   7424 static int
   7425 encode_code_page_strict(UINT code_page, PyObject **outbytes,
   7426                         PyObject *unicode, Py_ssize_t offset, int len,
   7427                         const char* errors)
   7428 {
   7429     BOOL usedDefaultChar = FALSE;
   7430     BOOL *pusedDefaultChar = &usedDefaultChar;
   7431     int outsize;
   7432     wchar_t *p;
   7433     Py_ssize_t size;
   7434     const DWORD flags = encode_code_page_flags(code_page, NULL);
   7435     char *out;
   7436     /* Create a substring so that we can get the UTF-16 representation
   7437        of just the slice under consideration. */
   7438     PyObject *substring;
   7439 
   7440     assert(len > 0);
   7441 
   7442     if (code_page != CP_UTF8 && code_page != CP_UTF7)
   7443         pusedDefaultChar = &usedDefaultChar;
   7444     else
   7445         pusedDefaultChar = NULL;
   7446 
   7447     substring = PyUnicode_Substring(unicode, offset, offset+len);
   7448     if (substring == NULL)
   7449         return -1;
   7450     p = PyUnicode_AsUnicodeAndSize(substring, &size);
   7451     if (p == NULL) {
   7452         Py_DECREF(substring);
   7453         return -1;
   7454     }
   7455     assert(size <= INT_MAX);
   7456 
   7457     /* First get the size of the result */
   7458     outsize = WideCharToMultiByte(code_page, flags,
   7459                                   p, (int)size,
   7460                                   NULL, 0,
   7461                                   NULL, pusedDefaultChar);
   7462     if (outsize <= 0)
   7463         goto error;
   7464     /* If we used a default char, then we failed! */
   7465     if (pusedDefaultChar && *pusedDefaultChar) {
   7466         Py_DECREF(substring);
   7467         return -2;
   7468     }
   7469 
   7470     if (*outbytes == NULL) {
   7471         /* Create string object */
   7472         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
   7473         if (*outbytes == NULL) {
   7474             Py_DECREF(substring);
   7475             return -1;
   7476         }
   7477         out = PyBytes_AS_STRING(*outbytes);
   7478     }
   7479     else {
   7480         /* Extend string object */
   7481         const Py_ssize_t n = PyBytes_Size(*outbytes);
   7482         if (outsize > PY_SSIZE_T_MAX - n) {
   7483             PyErr_NoMemory();
   7484             Py_DECREF(substring);
   7485             return -1;
   7486         }
   7487         if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
   7488             Py_DECREF(substring);
   7489             return -1;
   7490         }
   7491         out = PyBytes_AS_STRING(*outbytes) + n;
   7492     }
   7493 
   7494     /* Do the conversion */
   7495     outsize = WideCharToMultiByte(code_page, flags,
   7496                                   p, (int)size,
   7497                                   out, outsize,
   7498                                   NULL, pusedDefaultChar);
   7499     Py_CLEAR(substring);
   7500     if (outsize <= 0)
   7501         goto error;
   7502     if (pusedDefaultChar && *pusedDefaultChar)
   7503         return -2;
   7504     return 0;
   7505 
   7506 error:
   7507     Py_XDECREF(substring);
   7508     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
   7509         return -2;
   7510     PyErr_SetFromWindowsErr(0);
   7511     return -1;
   7512 }
   7513 
   7514 /*
   7515  * Encode a Unicode string to a Windows code page into a byte string using an
   7516  * error handler.
   7517  *
   7518  * Returns consumed characters if succeed, or raise an OSError and returns
   7519  * -1 on other error.
   7520  */
   7521 static int
   7522 encode_code_page_errors(UINT code_page, PyObject **outbytes,
   7523                         PyObject *unicode, Py_ssize_t unicode_offset,
   7524                         Py_ssize_t insize, const char* errors)
   7525 {
   7526     const DWORD flags = encode_code_page_flags(code_page, errors);
   7527     Py_ssize_t pos = unicode_offset;
   7528     Py_ssize_t endin = unicode_offset + insize;
   7529     /* Ideally, we should get reason from FormatMessage. This is the Windows
   7530        2000 English version of the message. */
   7531     const char *reason = "invalid character";
   7532     /* 4=maximum length of a UTF-8 sequence */
   7533     char buffer[4];
   7534     BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
   7535     Py_ssize_t outsize;
   7536     char *out;
   7537     PyObject *errorHandler = NULL;
   7538     PyObject *exc = NULL;
   7539     PyObject *encoding_obj = NULL;
   7540     const char *encoding;
   7541     Py_ssize_t newpos, newoutsize;
   7542     PyObject *rep;
   7543     int ret = -1;
   7544 
   7545     assert(insize > 0);
   7546 
   7547     encoding = code_page_name(code_page, &encoding_obj);
   7548     if (encoding == NULL)
   7549         return -1;
   7550 
   7551     if (errors == NULL || strcmp(errors, "strict") == 0) {
   7552         /* The last error was ERROR_NO_UNICODE_TRANSLATION,
   7553            then we raise a UnicodeEncodeError. */
   7554         make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
   7555         if (exc != NULL) {
   7556             PyCodec_StrictErrors(exc);
   7557             Py_DECREF(exc);
   7558         }
   7559         Py_XDECREF(encoding_obj);
   7560         return -1;
   7561     }
   7562 
   7563     if (code_page != CP_UTF8 && code_page != CP_UTF7)
   7564         pusedDefaultChar = &usedDefaultChar;
   7565     else
   7566         pusedDefaultChar = NULL;
   7567 
   7568     if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
   7569         PyErr_NoMemory();
   7570         goto error;
   7571     }
   7572     outsize = insize * Py_ARRAY_LENGTH(buffer);
   7573 
   7574     if (*outbytes == NULL) {
   7575         /* Create string object */
   7576         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
   7577         if (*outbytes == NULL)
   7578             goto error;
   7579         out = PyBytes_AS_STRING(*outbytes);
   7580     }
   7581     else {
   7582         /* Extend string object */
   7583         Py_ssize_t n = PyBytes_Size(*outbytes);
   7584         if (n > PY_SSIZE_T_MAX - outsize) {
   7585             PyErr_NoMemory();
   7586             goto error;
   7587         }
   7588         if (_PyBytes_Resize(outbytes, n + outsize) < 0)
   7589             goto error;
   7590         out = PyBytes_AS_STRING(*outbytes) + n;
   7591     }
   7592 
   7593     /* Encode the string character per character */
   7594     while (pos < endin)
   7595     {
   7596         Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
   7597         wchar_t chars[2];
   7598         int charsize;
   7599         if (ch < 0x10000) {
   7600             chars[0] = (wchar_t)ch;
   7601             charsize = 1;
   7602         }
   7603         else {
   7604             chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
   7605             chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
   7606             charsize = 2;
   7607         }
   7608 
   7609         outsize = WideCharToMultiByte(code_page, flags,
   7610                                       chars, charsize,
   7611                                       buffer, Py_ARRAY_LENGTH(buffer),
   7612                                       NULL, pusedDefaultChar);
   7613         if (outsize > 0) {
   7614             if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
   7615             {
   7616                 pos++;
   7617                 memcpy(out, buffer, outsize);
   7618                 out += outsize;
   7619                 continue;
   7620             }
   7621         }
   7622         else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
   7623             PyErr_SetFromWindowsErr(0);
   7624             goto error;
   7625         }
   7626 
   7627         rep = unicode_encode_call_errorhandler(
   7628                   errors, &errorHandler, encoding, reason,
   7629                   unicode, &exc,
   7630                   pos, pos + 1, &newpos);
   7631         if (rep == NULL)
   7632             goto error;
   7633         pos = newpos;
   7634 
   7635         if (PyBytes_Check(rep)) {
   7636             outsize = PyBytes_GET_SIZE(rep);
   7637             if (outsize != 1) {
   7638                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
   7639                 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
   7640                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
   7641                     Py_DECREF(rep);
   7642                     goto error;
   7643                 }
   7644                 out = PyBytes_AS_STRING(*outbytes) + offset;
   7645             }
   7646             memcpy(out, PyBytes_AS_STRING(rep), outsize);
   7647             out += outsize;
   7648         }
   7649         else {
   7650             Py_ssize_t i;
   7651             enum PyUnicode_Kind kind;
   7652             void *data;
   7653 
   7654             if (PyUnicode_READY(rep) == -1) {
   7655                 Py_DECREF(rep);
   7656                 goto error;
   7657             }
   7658 
   7659             outsize = PyUnicode_GET_LENGTH(rep);
   7660             if (outsize != 1) {
   7661                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
   7662                 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
   7663                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
   7664                     Py_DECREF(rep);
   7665                     goto error;
   7666                 }
   7667                 out = PyBytes_AS_STRING(*outbytes) + offset;
   7668             }
   7669             kind = PyUnicode_KIND(rep);
   7670             data = PyUnicode_DATA(rep);
   7671             for (i=0; i < outsize; i++) {
   7672                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   7673                 if (ch > 127) {
   7674                     raise_encode_exception(&exc,
   7675                         encoding, unicode,
   7676                         pos, pos + 1,
   7677                         "unable to encode error handler result to ASCII");
   7678                     Py_DECREF(rep);
   7679                     goto error;
   7680                 }
   7681                 *out = (unsigned char)ch;
   7682                 out++;
   7683             }
   7684         }
   7685         Py_DECREF(rep);
   7686     }
   7687     /* write a NUL byte */
   7688     *out = 0;
   7689     outsize = out - PyBytes_AS_STRING(*outbytes);
   7690     assert(outsize <= PyBytes_GET_SIZE(*outbytes));
   7691     if (_PyBytes_Resize(outbytes, outsize) < 0)
   7692         goto error;
   7693     ret = 0;
   7694 
   7695 error:
   7696     Py_XDECREF(encoding_obj);
   7697     Py_XDECREF(errorHandler);
   7698     Py_XDECREF(exc);
   7699     return ret;
   7700 }
   7701 
   7702 static PyObject *
   7703 encode_code_page(int code_page,
   7704                  PyObject *unicode,
   7705                  const char *errors)
   7706 {
   7707     Py_ssize_t len;
   7708     PyObject *outbytes = NULL;
   7709     Py_ssize_t offset;
   7710     int chunk_len, ret, done;
   7711 
   7712     if (!PyUnicode_Check(unicode)) {
   7713         PyErr_BadArgument();
   7714         return NULL;
   7715     }
   7716 
   7717     if (PyUnicode_READY(unicode) == -1)
   7718         return NULL;
   7719     len = PyUnicode_GET_LENGTH(unicode);
   7720 
   7721     if (code_page < 0) {
   7722         PyErr_SetString(PyExc_ValueError, "invalid code page number");
   7723         return NULL;
   7724     }
   7725 
   7726     if (len == 0)
   7727         return PyBytes_FromStringAndSize(NULL, 0);
   7728 
   7729     offset = 0;
   7730     do
   7731     {
   7732 #ifdef NEED_RETRY
   7733         /* UTF-16 encoding may double the size, so use only INT_MAX/2
   7734            chunks. */
   7735         if (len > INT_MAX/2) {
   7736             chunk_len = INT_MAX/2;
   7737             done = 0;
   7738         }
   7739         else
   7740 #endif
   7741         {
   7742             chunk_len = (int)len;
   7743             done = 1;
   7744         }
   7745 
   7746         ret = encode_code_page_strict(code_page, &outbytes,
   7747                                       unicode, offset, chunk_len,
   7748                                       errors);
   7749         if (ret == -2)
   7750             ret = encode_code_page_errors(code_page, &outbytes,
   7751                                           unicode, offset,
   7752                                           chunk_len, errors);
   7753         if (ret < 0) {
   7754             Py_XDECREF(outbytes);
   7755             return NULL;
   7756         }
   7757 
   7758         offset += chunk_len;
   7759         len -= chunk_len;
   7760     } while (!done);
   7761 
   7762     return outbytes;
   7763 }
   7764 
   7765 PyObject *
   7766 PyUnicode_EncodeMBCS(const Py_UNICODE *p,
   7767                      Py_ssize_t size,
   7768                      const char *errors)
   7769 {
   7770     PyObject *unicode, *res;
   7771     unicode = PyUnicode_FromWideChar(p, size);
   7772     if (unicode == NULL)
   7773         return NULL;
   7774     res = encode_code_page(CP_ACP, unicode, errors);
   7775     Py_DECREF(unicode);
   7776     return res;
   7777 }
   7778 
   7779 PyObject *
   7780 PyUnicode_EncodeCodePage(int code_page,
   7781                          PyObject *unicode,
   7782                          const char *errors)
   7783 {
   7784     return encode_code_page(code_page, unicode, errors);
   7785 }
   7786 
   7787 PyObject *
   7788 PyUnicode_AsMBCSString(PyObject *unicode)
   7789 {
   7790     return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
   7791 }
   7792 
   7793 #undef NEED_RETRY
   7794 
   7795 #endif /* MS_WINDOWS */
   7796 
   7797 /* --- Character Mapping Codec -------------------------------------------- */
   7798 
/* Decode the `size` bytes at `s` into `writer`, using the str object
   `mapping` as a lookup table: input byte b is decoded to mapping[b].
   A byte with no entry (b >= len(mapping)) or whose entry is U+FFFE is
   undefined and is passed to the error handler named by `errors`.

   Returns 0 on success and -1 on error. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        /* A 1-byte table cannot contain U+FFFE, so every byte is mapped and
           the error handler is never needed on this path. */
        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* Ask the writer to accept characters up to 0xff, then
                   reload the cached fields, which the call may change. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight loops for a full 2-byte table.  They run until the
               input is exhausted (then `break` leaves the outer loop) or
               until a value needs the generic handling below, in which
               case they jump to Error with x already loaded. */
            enum PyUnicode_Kind outkind = writer->kind;
            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* Either undefined (0xFFFE) or too wide for the
                       current output buffer. */
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        /* Generic path: one byte per iteration, any table kind. */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
Error:
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            /* s is not advanced here: it was passed by address to the
               error handler call above. */
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
   7917 
   7918 static int
   7919 charmap_decode_mapping(const char *s,
   7920                        Py_ssize_t size,
   7921                        PyObject *mapping,
   7922                        const char *errors,
   7923                        _PyUnicodeWriter *writer)
   7924 {
   7925     const char *starts = s;
   7926     const char *e;
   7927     Py_ssize_t startinpos, endinpos;
   7928     PyObject *errorHandler = NULL, *exc = NULL;
   7929     unsigned char ch;
   7930     PyObject *key, *item = NULL;
   7931 
   7932     e = s + size;
   7933 
   7934     while (s < e) {
   7935         ch = *s;
   7936 
   7937         /* Get mapping (char ordinal -> integer, Unicode char or None) */
   7938         key = PyLong_FromLong((long)ch);
   7939         if (key == NULL)
   7940             goto onError;
   7941 
   7942         item = PyObject_GetItem(mapping, key);
   7943         Py_DECREF(key);
   7944         if (item == NULL) {
   7945             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
   7946                 /* No mapping found means: mapping is undefined. */
   7947                 PyErr_Clear();
   7948                 goto Undefined;
   7949             } else
   7950                 goto onError;
   7951         }
   7952 
   7953         /* Apply mapping */
   7954         if (item == Py_None)
   7955             goto Undefined;
   7956         if (PyLong_Check(item)) {
   7957             long value = PyLong_AS_LONG(item);
   7958             if (value == 0xFFFE)
   7959                 goto Undefined;
   7960             if (value < 0 || value > MAX_UNICODE) {
   7961                 PyErr_Format(PyExc_TypeError,
   7962                              "character mapping must be in range(0x%lx)",
   7963                              (unsigned long)MAX_UNICODE + 1);
   7964                 goto onError;
   7965             }
   7966 
   7967             if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
   7968                 goto onError;
   7969         }
   7970         else if (PyUnicode_Check(item)) {
   7971             if (PyUnicode_READY(item) == -1)
   7972                 goto onError;
   7973             if (PyUnicode_GET_LENGTH(item) == 1) {
   7974                 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
   7975                 if (value == 0xFFFE)
   7976                     goto Undefined;
   7977                 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
   7978                     goto onError;
   7979             }
   7980             else {
   7981                 writer->overallocate = 1;
   7982                 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
   7983                     goto onError;
   7984             }
   7985         }
   7986         else {
   7987             /* wrong return value */
   7988             PyErr_SetString(PyExc_TypeError,
   7989                             "character mapping must return integer, None or str");
   7990             goto onError;
   7991         }
   7992         Py_CLEAR(item);
   7993         ++s;
   7994         continue;
   7995 
   7996 Undefined:
   7997         /* undefined mapping */
   7998         Py_CLEAR(item);
   7999         startinpos = s-starts;
   8000         endinpos = startinpos+1;
   8001         if (unicode_decode_call_errorhandler_writer(
   8002                 errors, &errorHandler,
   8003                 "charmap", "character maps to <undefined>",
   8004                 &starts, &e, &startinpos, &endinpos, &exc, &s,
   8005                 writer)) {
   8006             goto onError;
   8007         }
   8008     }
   8009     Py_XDECREF(errorHandler);
   8010     Py_XDECREF(exc);
   8011     return 0;
   8012 
   8013 onError:
   8014     Py_XDECREF(item);
   8015     Py_XDECREF(errorHandler);
   8016     Py_XDECREF(exc);
   8017     return -1;
   8018 }
   8019 
   8020 PyObject *
   8021 PyUnicode_DecodeCharmap(const char *s,
   8022                         Py_ssize_t size,
   8023                         PyObject *mapping,
   8024                         const char *errors)
   8025 {
   8026     _PyUnicodeWriter writer;
   8027 
   8028     /* Default to Latin-1 */
   8029     if (mapping == NULL)
   8030         return PyUnicode_DecodeLatin1(s, size, errors);
   8031 
   8032     if (size == 0)
   8033         _Py_RETURN_UNICODE_EMPTY();
   8034     _PyUnicodeWriter_Init(&writer);
   8035     writer.min_length = size;
   8036     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
   8037         goto onError;
   8038 
   8039     if (PyUnicode_CheckExact(mapping)) {
   8040         if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
   8041             goto onError;
   8042     }
   8043     else {
   8044         if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
   8045             goto onError;
   8046     }
   8047     return _PyUnicodeWriter_Finish(&writer);
   8048 
   8049   onError:
   8050     _PyUnicodeWriter_Dealloc(&writer);
   8051     return NULL;
   8052 }
   8053 
   8054 /* Charmap encoding: the lookup table */
   8055 
   8056 struct encoding_map {
   8057     PyObject_HEAD
   8058     unsigned char level1[32];
   8059     int count2, count3;
   8060     unsigned char level23[1];
   8061 };
   8062 
   8063 static PyObject*
   8064 encoding_map_size(PyObject *obj, PyObject* args)
   8065 {
   8066     struct encoding_map *map = (struct encoding_map*)obj;
   8067     return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
   8068                            128*map->count3);
   8069 }
   8070 
   8071 static PyMethodDef encoding_map_methods[] = {
   8072     {"size", encoding_map_size, METH_NOARGS,
   8073      PyDoc_STR("Return the size (in bytes) of this object") },
   8074     { 0 }
   8075 };
   8076 
/* tp_dealloc slot of EncodingMapType: release the object's memory with
   PyObject_FREE. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
   8082 
/* Type object for struct encoding_map instances.  Only a deallocator and
   the method table (size()) are provided; every other slot is 0.
   tp_itemsize is 0 although instances carry a variable-size tail:
   PyUnicode_BuildEncodingMap() allocates them itself with
   PyObject_MALLOC. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
   8126 
   8127 PyObject*
   8128 PyUnicode_BuildEncodingMap(PyObject* string)
   8129 {
   8130     PyObject *result;
   8131     struct encoding_map *mresult;
   8132     int i;
   8133     int need_dict = 0;
   8134     unsigned char level1[32];
   8135     unsigned char level2[512];
   8136     unsigned char *mlevel1, *mlevel2, *mlevel3;
   8137     int count2 = 0, count3 = 0;
   8138     int kind;
   8139     void *data;
   8140     Py_ssize_t length;
   8141     Py_UCS4 ch;
   8142 
   8143     if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
   8144         PyErr_BadArgument();
   8145         return NULL;
   8146     }
   8147     kind = PyUnicode_KIND(string);
   8148     data = PyUnicode_DATA(string);
   8149     length = PyUnicode_GET_LENGTH(string);
   8150     length = Py_MIN(length, 256);
   8151     memset(level1, 0xFF, sizeof level1);
   8152     memset(level2, 0xFF, sizeof level2);
   8153 
   8154     /* If there isn't a one-to-one mapping of NULL to \0,
   8155        or if there are non-BMP characters, we need to use
   8156        a mapping dictionary. */
   8157     if (PyUnicode_READ(kind, data, 0) != 0)
   8158         need_dict = 1;
   8159     for (i = 1; i < length; i++) {
   8160         int l1, l2;
   8161         ch = PyUnicode_READ(kind, data, i);
   8162         if (ch == 0 || ch > 0xFFFF) {
   8163             need_dict = 1;
   8164             break;
   8165         }
   8166         if (ch == 0xFFFE)
   8167             /* unmapped character */
   8168             continue;
   8169         l1 = ch >> 11;
   8170         l2 = ch >> 7;
   8171         if (level1[l1] == 0xFF)
   8172             level1[l1] = count2++;
   8173         if (level2[l2] == 0xFF)
   8174             level2[l2] = count3++;
   8175     }
   8176 
   8177     if (count2 >= 0xFF || count3 >= 0xFF)
   8178         need_dict = 1;
   8179 
   8180     if (need_dict) {
   8181         PyObject *result = PyDict_New();
   8182         PyObject *key, *value;
   8183         if (!result)
   8184             return NULL;
   8185         for (i = 0; i < length; i++) {
   8186             key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
   8187             value = PyLong_FromLong(i);
   8188             if (!key || !value)
   8189                 goto failed1;
   8190             if (PyDict_SetItem(result, key, value) == -1)
   8191                 goto failed1;
   8192             Py_DECREF(key);
   8193             Py_DECREF(value);
   8194         }
   8195         return result;
   8196       failed1:
   8197         Py_XDECREF(key);
   8198         Py_XDECREF(value);
   8199         Py_DECREF(result);
   8200         return NULL;
   8201     }
   8202 
   8203     /* Create a three-level trie */
   8204     result = PyObject_MALLOC(sizeof(struct encoding_map) +
   8205                              16*count2 + 128*count3 - 1);
   8206     if (!result)
   8207         return PyErr_NoMemory();
   8208     PyObject_Init(result, &EncodingMapType);
   8209     mresult = (struct encoding_map*)result;
   8210     mresult->count2 = count2;
   8211     mresult->count3 = count3;
   8212     mlevel1 = mresult->level1;
   8213     mlevel2 = mresult->level23;
   8214     mlevel3 = mresult->level23 + 16*count2;
   8215     memcpy(mlevel1, level1, 32);
   8216     memset(mlevel2, 0xFF, 16*count2);
   8217     memset(mlevel3, 0, 128*count3);
   8218     count3 = 0;
   8219     for (i = 1; i < length; i++) {
   8220         int o1, o2, o3, i2, i3;
   8221         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   8222         if (ch == 0xFFFE)
   8223             /* unmapped character */
   8224             continue;
   8225         o1 = ch>>11;
   8226         o2 = (ch>>7) & 0xF;
   8227         i2 = 16*mlevel1[o1] + o2;
   8228         if (mlevel2[i2] == 0xFF)
   8229             mlevel2[i2] = count3++;
   8230         o3 = ch & 0x7F;
   8231         i3 = 128*mlevel2[i2] + o3;
   8232         mlevel3[i3] = i;
   8233     }
   8234     return result;
   8235 }
   8236 
   8237 static int
   8238 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
   8239 {
   8240     struct encoding_map *map = (struct encoding_map*)mapping;
   8241     int l1 = c>>11;
   8242     int l2 = (c>>7) & 0xF;
   8243     int l3 = c & 0x7F;
   8244     int i;
   8245 
   8246     if (c > 0xFFFF)
   8247         return -1;
   8248     if (c == 0)
   8249         return 0;
   8250     /* level 1*/
   8251     i = map->level1[l1];
   8252     if (i == 0xFF) {
   8253         return -1;
   8254     }
   8255     /* level 2*/
   8256     i = map->level23[16*i+l2];
   8257     if (i == 0xFF) {
   8258         return -1;
   8259     }
   8260     /* level 3 */
   8261     i = map->level23[16*map->count2 + 128*i + l3];
   8262     if (i == 0) {
   8263         return -1;
   8264     }
   8265     return i;
   8266 }
   8267 
   8268 /* Lookup the character ch in the mapping. If the character
   8269    can't be found, Py_None is returned (or NULL, if another
   8270    error occurred). */
   8271 static PyObject *
   8272 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
   8273 {
   8274     PyObject *w = PyLong_FromLong((long)c);
   8275     PyObject *x;
   8276 
   8277     if (w == NULL)
   8278         return NULL;
   8279     x = PyObject_GetItem(mapping, w);
   8280     Py_DECREF(w);
   8281     if (x == NULL) {
   8282         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
   8283             /* No mapping found means: mapping is undefined. */
   8284             PyErr_Clear();
   8285             Py_RETURN_NONE;
   8286         } else
   8287             return NULL;
   8288     }
   8289     else if (x == Py_None)
   8290         return x;
   8291     else if (PyLong_Check(x)) {
   8292         long value = PyLong_AS_LONG(x);
   8293         if (value < 0 || value > 255) {
   8294             PyErr_SetString(PyExc_TypeError,
   8295                             "character mapping must be in range(256)");
   8296             Py_DECREF(x);
   8297             return NULL;
   8298         }
   8299         return x;
   8300     }
   8301     else if (PyBytes_Check(x))
   8302         return x;
   8303     else {
   8304         /* wrong return value */
   8305         PyErr_Format(PyExc_TypeError,
   8306                      "character mapping must return integer, bytes or None, not %.400s",
   8307                      x->ob_type->tp_name);
   8308         Py_DECREF(x);
   8309         return NULL;
   8310     }
   8311 }
   8312 
   8313 static int
   8314 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
   8315 {
   8316     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
   8317     /* exponentially overallocate to minimize reallocations */
   8318     if (requiredsize < 2*outsize)
   8319         requiredsize = 2*outsize;
   8320     if (_PyBytes_Resize(outobj, requiredsize))
   8321         return -1;
   8322     return 0;
   8323 }
   8324 
   8325 typedef enum charmapencode_result {
   8326     enc_SUCCESS, enc_FAILED, enc_EXCEPTION
   8327 } charmapencode_result;
/* Look up the character, put the result in the output bytes object and
   adjust *outpos. Resize the output bytes object if not enough space is
   available. Return enc_SUCCESS if the character was written, enc_FAILED
   if the mapping was undefined (in which case nothing was written), or
   enc_EXCEPTION if the lookup or a reallocation raised an error. */
   8334 static charmapencode_result
   8335 charmapencode_output(Py_UCS4 c, PyObject *mapping,
   8336                      PyObject **outobj, Py_ssize_t *outpos)
   8337 {
   8338     PyObject *rep;
   8339     char *outstart;
   8340     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
   8341 
   8342     if (Py_TYPE(mapping) == &EncodingMapType) {
   8343         int res = encoding_map_lookup(c, mapping);
   8344         Py_ssize_t requiredsize = *outpos+1;
   8345         if (res == -1)
   8346             return enc_FAILED;
   8347         if (outsize<requiredsize)
   8348             if (charmapencode_resize(outobj, outpos, requiredsize))
   8349                 return enc_EXCEPTION;
   8350         outstart = PyBytes_AS_STRING(*outobj);
   8351         outstart[(*outpos)++] = (char)res;
   8352         return enc_SUCCESS;
   8353     }
   8354 
   8355     rep = charmapencode_lookup(c, mapping);
   8356     if (rep==NULL)
   8357         return enc_EXCEPTION;
   8358     else if (rep==Py_None) {
   8359         Py_DECREF(rep);
   8360         return enc_FAILED;
   8361     } else {
   8362         if (PyLong_Check(rep)) {
   8363             Py_ssize_t requiredsize = *outpos+1;
   8364             if (outsize<requiredsize)
   8365                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
   8366                     Py_DECREF(rep);
   8367                     return enc_EXCEPTION;
   8368                 }
   8369             outstart = PyBytes_AS_STRING(*outobj);
   8370             outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
   8371         }
   8372         else {
   8373             const char *repchars = PyBytes_AS_STRING(rep);
   8374             Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
   8375             Py_ssize_t requiredsize = *outpos+repsize;
   8376             if (outsize<requiredsize)
   8377                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
   8378                     Py_DECREF(rep);
   8379                     return enc_EXCEPTION;
   8380                 }
   8381             outstart = PyBytes_AS_STRING(*outobj);
   8382             memcpy(outstart + *outpos, repchars, repsize);
   8383             *outpos += repsize;
   8384         }
   8385     }
   8386     Py_DECREF(rep);
   8387     return enc_SUCCESS;
   8388 }
   8389 
   8390 /* handle an error in PyUnicode_EncodeCharmap
   8391    Return 0 on success, -1 on error */
   8392 static int
   8393 charmap_encoding_error(
   8394     PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
   8395     PyObject **exceptionObject,
   8396     _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
   8397     PyObject **res, Py_ssize_t *respos)
   8398 {
   8399     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
   8400     Py_ssize_t size, repsize;
   8401     Py_ssize_t newpos;
   8402     enum PyUnicode_Kind kind;
   8403     void *data;
   8404     Py_ssize_t index;
   8405     /* startpos for collecting unencodable chars */
   8406     Py_ssize_t collstartpos = *inpos;
   8407     Py_ssize_t collendpos = *inpos+1;
   8408     Py_ssize_t collpos;
   8409     const char *encoding = "charmap";
   8410     const char *reason = "character maps to <undefined>";
   8411     charmapencode_result x;
   8412     Py_UCS4 ch;
   8413     int val;
   8414 
   8415     if (PyUnicode_READY(unicode) == -1)
   8416         return -1;
   8417     size = PyUnicode_GET_LENGTH(unicode);
   8418     /* find all unencodable characters */
   8419     while (collendpos < size) {
   8420         PyObject *rep;
   8421         if (Py_TYPE(mapping) == &EncodingMapType) {
   8422             ch = PyUnicode_READ_CHAR(unicode, collendpos);
   8423             val = encoding_map_lookup(ch, mapping);
   8424             if (val != -1)
   8425                 break;
   8426             ++collendpos;
   8427             continue;
   8428         }
   8429 
   8430         ch = PyUnicode_READ_CHAR(unicode, collendpos);
   8431         rep = charmapencode_lookup(ch, mapping);
   8432         if (rep==NULL)
   8433             return -1;
   8434         else if (rep!=Py_None) {
   8435             Py_DECREF(rep);
   8436             break;
   8437         }
   8438         Py_DECREF(rep);
   8439         ++collendpos;
   8440     }
   8441     /* cache callback name lookup
   8442      * (if not done yet, i.e. it's the first error) */
   8443     if (*error_handler == _Py_ERROR_UNKNOWN)
   8444         *error_handler = get_error_handler(errors);
   8445 
   8446     switch (*error_handler) {
   8447     case _Py_ERROR_STRICT:
   8448         raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
   8449         return -1;
   8450 
   8451     case _Py_ERROR_REPLACE:
   8452         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
   8453             x = charmapencode_output('?', mapping, res, respos);
   8454             if (x==enc_EXCEPTION) {
   8455                 return -1;
   8456             }
   8457             else if (x==enc_FAILED) {
   8458                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
   8459                 return -1;
   8460             }
   8461         }
   8462         /* fall through */
   8463     case _Py_ERROR_IGNORE:
   8464         *inpos = collendpos;
   8465         break;
   8466 
   8467     case _Py_ERROR_XMLCHARREFREPLACE:
   8468         /* generate replacement (temporarily (mis)uses p) */
   8469         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
   8470             char buffer[2+29+1+1];
   8471             char *cp;
   8472             sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
   8473             for (cp = buffer; *cp; ++cp) {
   8474                 x = charmapencode_output(*cp, mapping, res, respos);
   8475                 if (x==enc_EXCEPTION)
   8476                     return -1;
   8477                 else if (x==enc_FAILED) {
   8478                     raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
   8479                     return -1;
   8480                 }
   8481             }
   8482         }
   8483         *inpos = collendpos;
   8484         break;
   8485 
   8486     default:
   8487         repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
   8488                                                       encoding, reason, unicode, exceptionObject,
   8489                                                       collstartpos, collendpos, &newpos);
   8490         if (repunicode == NULL)
   8491             return -1;
   8492         if (PyBytes_Check(repunicode)) {
   8493             /* Directly copy bytes result to output. */
   8494             Py_ssize_t outsize = PyBytes_Size(*res);
   8495             Py_ssize_t requiredsize;
   8496             repsize = PyBytes_Size(repunicode);
   8497             requiredsize = *respos + repsize;
   8498             if (requiredsize > outsize)
   8499                 /* Make room for all additional bytes. */
   8500                 if (charmapencode_resize(res, respos, requiredsize)) {
   8501                     Py_DECREF(repunicode);
   8502                     return -1;
   8503                 }
   8504             memcpy(PyBytes_AsString(*res) + *respos,
   8505                    PyBytes_AsString(repunicode),  repsize);
   8506             *respos += repsize;
   8507             *inpos = newpos;
   8508             Py_DECREF(repunicode);
   8509             break;
   8510         }
   8511         /* generate replacement  */
   8512         if (PyUnicode_READY(repunicode) == -1) {
   8513             Py_DECREF(repunicode);
   8514             return -1;
   8515         }
   8516         repsize = PyUnicode_GET_LENGTH(repunicode);
   8517         data = PyUnicode_DATA(repunicode);
   8518         kind = PyUnicode_KIND(repunicode);
   8519         for (index = 0; index < repsize; index++) {
   8520             Py_UCS4 repch = PyUnicode_READ(kind, data, index);
   8521             x = charmapencode_output(repch, mapping, res, respos);
   8522             if (x==enc_EXCEPTION) {
   8523                 Py_DECREF(repunicode);
   8524                 return -1;
   8525             }
   8526             else if (x==enc_FAILED) {
   8527                 Py_DECREF(repunicode);
   8528                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
   8529                 return -1;
   8530             }
   8531         }
   8532         *inpos = newpos;
   8533         Py_DECREF(repunicode);
   8534     }
   8535     return 0;
   8536 }
   8537 
   8538 PyObject *
   8539 _PyUnicode_EncodeCharmap(PyObject *unicode,
   8540                          PyObject *mapping,
   8541                          const char *errors)
   8542 {
   8543     /* output object */
   8544     PyObject *res = NULL;
   8545     /* current input position */
   8546     Py_ssize_t inpos = 0;
   8547     Py_ssize_t size;
   8548     /* current output position */
   8549     Py_ssize_t respos = 0;
   8550     PyObject *error_handler_obj = NULL;
   8551     PyObject *exc = NULL;
   8552     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
   8553     void *data;
   8554     int kind;
   8555 
   8556     if (PyUnicode_READY(unicode) == -1)
   8557         return NULL;
   8558     size = PyUnicode_GET_LENGTH(unicode);
   8559     data = PyUnicode_DATA(unicode);
   8560     kind = PyUnicode_KIND(unicode);
   8561 
   8562     /* Default to Latin-1 */
   8563     if (mapping == NULL)
   8564         return unicode_encode_ucs1(unicode, errors, 256);
   8565 
   8566     /* allocate enough for a simple encoding without
   8567        replacements, if we need more, we'll resize */
   8568     res = PyBytes_FromStringAndSize(NULL, size);
   8569     if (res == NULL)
   8570         goto onError;
   8571     if (size == 0)
   8572         return res;
   8573 
   8574     while (inpos<size) {
   8575         Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
   8576         /* try to encode it */
   8577         charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
   8578         if (x==enc_EXCEPTION) /* error */
   8579             goto onError;
   8580         if (x==enc_FAILED) { /* unencodable character */
   8581             if (charmap_encoding_error(unicode, &inpos, mapping,
   8582                                        &exc,
   8583                                        &error_handler, &error_handler_obj, errors,
   8584                                        &res, &respos)) {
   8585                 goto onError;
   8586             }
   8587         }
   8588         else
   8589             /* done with this character => adjust input position */
   8590             ++inpos;
   8591     }
   8592 
   8593     /* Resize if we allocated to much */
   8594     if (respos<PyBytes_GET_SIZE(res))
   8595         if (_PyBytes_Resize(&res, respos) < 0)
   8596             goto onError;
   8597 
   8598     Py_XDECREF(exc);
   8599     Py_XDECREF(error_handler_obj);
   8600     return res;
   8601 
   8602   onError:
   8603     Py_XDECREF(res);
   8604     Py_XDECREF(exc);
   8605     Py_XDECREF(error_handler_obj);
   8606     return NULL;
   8607 }
   8608 
   8609 /* Deprecated */
   8610 PyObject *
   8611 PyUnicode_EncodeCharmap(const Py_UNICODE *p,
   8612                         Py_ssize_t size,
   8613                         PyObject *mapping,
   8614                         const char *errors)
   8615 {
   8616     PyObject *result;
   8617     PyObject *unicode = PyUnicode_FromWideChar(p, size);
   8618     if (unicode == NULL)
   8619         return NULL;
   8620     result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
   8621     Py_DECREF(unicode);
   8622     return result;
   8623 }
   8624 
   8625 PyObject *
   8626 PyUnicode_AsCharmapString(PyObject *unicode,
   8627                           PyObject *mapping)
   8628 {
   8629     if (!PyUnicode_Check(unicode) || mapping == NULL) {
   8630         PyErr_BadArgument();
   8631         return NULL;
   8632     }
   8633     return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
   8634 }
   8635 
   8636 /* create or adjust a UnicodeTranslateError */
   8637 static void
   8638 make_translate_exception(PyObject **exceptionObject,
   8639                          PyObject *unicode,
   8640                          Py_ssize_t startpos, Py_ssize_t endpos,
   8641                          const char *reason)
   8642 {
   8643     if (*exceptionObject == NULL) {
   8644         *exceptionObject = _PyUnicodeTranslateError_Create(
   8645             unicode, startpos, endpos, reason);
   8646     }
   8647     else {
   8648         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
   8649             goto onError;
   8650         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
   8651             goto onError;
   8652         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
   8653             goto onError;
   8654         return;
   8655       onError:
   8656         Py_CLEAR(*exceptionObject);
   8657     }
   8658 }
   8659 
   8660 /* error handling callback helper:
   8661    build arguments, call the callback and check the arguments,
   8662    put the result into newpos and return the replacement string, which
   8663    has to be freed by the caller */
   8664 static PyObject *
   8665 unicode_translate_call_errorhandler(const char *errors,
   8666                                     PyObject **errorHandler,
   8667                                     const char *reason,
   8668                                     PyObject *unicode, PyObject **exceptionObject,
   8669                                     Py_ssize_t startpos, Py_ssize_t endpos,
   8670                                     Py_ssize_t *newpos)
   8671 {
   8672     static const char *argparse = "Un;translating error handler must return (str, int) tuple";
   8673 
   8674     Py_ssize_t i_newpos;
   8675     PyObject *restuple;
   8676     PyObject *resunicode;
   8677 
   8678     if (*errorHandler == NULL) {
   8679         *errorHandler = PyCodec_LookupError(errors);
   8680         if (*errorHandler == NULL)
   8681             return NULL;
   8682     }
   8683 
   8684     make_translate_exception(exceptionObject,
   8685                              unicode, startpos, endpos, reason);
   8686     if (*exceptionObject == NULL)
   8687         return NULL;
   8688 
   8689     restuple = PyObject_CallFunctionObjArgs(
   8690         *errorHandler, *exceptionObject, NULL);
   8691     if (restuple == NULL)
   8692         return NULL;
   8693     if (!PyTuple_Check(restuple)) {
   8694         PyErr_SetString(PyExc_TypeError, &argparse[3]);
   8695         Py_DECREF(restuple);
   8696         return NULL;
   8697     }
   8698     if (!PyArg_ParseTuple(restuple, argparse,
   8699                           &resunicode, &i_newpos)) {
   8700         Py_DECREF(restuple);
   8701         return NULL;
   8702     }
   8703     if (i_newpos<0)
   8704         *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
   8705     else
   8706         *newpos = i_newpos;
   8707     if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
   8708         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
   8709         Py_DECREF(restuple);
   8710         return NULL;
   8711     }
   8712     Py_INCREF(resunicode);
   8713     Py_DECREF(restuple);
   8714     return resunicode;
   8715 }
   8716 
   8717 /* Lookup the character ch in the mapping and put the result in result,
   8718    which must be decrefed by the caller.
   8719    Return 0 on success, -1 on error */
   8720 static int
   8721 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
   8722 {
   8723     PyObject *w = PyLong_FromLong((long)c);
   8724     PyObject *x;
   8725 
   8726     if (w == NULL)
   8727         return -1;
   8728     x = PyObject_GetItem(mapping, w);
   8729     Py_DECREF(w);
   8730     if (x == NULL) {
   8731         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
   8732             /* No mapping found means: use 1:1 mapping. */
   8733             PyErr_Clear();
   8734             *result = NULL;
   8735             return 0;
   8736         } else
   8737             return -1;
   8738     }
   8739     else if (x == Py_None) {
   8740         *result = x;
   8741         return 0;
   8742     }
   8743     else if (PyLong_Check(x)) {
   8744         long value = PyLong_AS_LONG(x);
   8745         if (value < 0 || value > MAX_UNICODE) {
   8746             PyErr_Format(PyExc_ValueError,
   8747                          "character mapping must be in range(0x%x)",
   8748                          MAX_UNICODE+1);
   8749             Py_DECREF(x);
   8750             return -1;
   8751         }
   8752         *result = x;
   8753         return 0;
   8754     }
   8755     else if (PyUnicode_Check(x)) {
   8756         *result = x;
   8757         return 0;
   8758     }
   8759     else {
   8760         /* wrong return value */
   8761         PyErr_SetString(PyExc_TypeError,
   8762                         "character mapping must return integer, None or str");
   8763         Py_DECREF(x);
   8764         return -1;
   8765     }
   8766 }
   8767 
   8768 /* lookup the character, write the result into the writer.
   8769    Return 1 if the result was written into the writer, return 0 if the mapping
   8770    was undefined, raise an exception return -1 on error. */
   8771 static int
   8772 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
   8773                         _PyUnicodeWriter *writer)
   8774 {
   8775     PyObject *item;
   8776 
   8777     if (charmaptranslate_lookup(ch, mapping, &item))
   8778         return -1;
   8779 
   8780     if (item == NULL) {
   8781         /* not found => default to 1:1 mapping */
   8782         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
   8783             return -1;
   8784         }
   8785         return 1;
   8786     }
   8787 
   8788     if (item == Py_None) {
   8789         Py_DECREF(item);
   8790         return 0;
   8791     }
   8792 
   8793     if (PyLong_Check(item)) {
   8794         long ch = (Py_UCS4)PyLong_AS_LONG(item);
   8795         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
   8796            used it */
   8797         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
   8798             Py_DECREF(item);
   8799             return -1;
   8800         }
   8801         Py_DECREF(item);
   8802         return 1;
   8803     }
   8804 
   8805     if (!PyUnicode_Check(item)) {
   8806         Py_DECREF(item);
   8807         return -1;
   8808     }
   8809 
   8810     if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
   8811         Py_DECREF(item);
   8812         return -1;
   8813     }
   8814 
   8815     Py_DECREF(item);
   8816     return 1;
   8817 }
   8818 
   8819 static int
   8820 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
   8821                               Py_UCS1 *translate)
   8822 {
   8823     PyObject *item = NULL;
   8824     int ret = 0;
   8825 
   8826     if (charmaptranslate_lookup(ch, mapping, &item)) {
   8827         return -1;
   8828     }
   8829 
   8830     if (item == Py_None) {
   8831         /* deletion */
   8832         translate[ch] = 0xfe;
   8833     }
   8834     else if (item == NULL) {
   8835         /* not found => default to 1:1 mapping */
   8836         translate[ch] = ch;
   8837         return 1;
   8838     }
   8839     else if (PyLong_Check(item)) {
   8840         long replace = PyLong_AS_LONG(item);
   8841         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
   8842            used it */
   8843         if (127 < replace) {
   8844             /* invalid character or character outside ASCII:
   8845                skip the fast translate */
   8846             goto exit;
   8847         }
   8848         translate[ch] = (Py_UCS1)replace;
   8849     }
   8850     else if (PyUnicode_Check(item)) {
   8851         Py_UCS4 replace;
   8852 
   8853         if (PyUnicode_READY(item) == -1) {
   8854             Py_DECREF(item);
   8855             return -1;
   8856         }
   8857         if (PyUnicode_GET_LENGTH(item) != 1)
   8858             goto exit;
   8859 
   8860         replace = PyUnicode_READ_CHAR(item, 0);
   8861         if (replace > 127)
   8862             goto exit;
   8863         translate[ch] = (Py_UCS1)replace;
   8864     }
   8865     else {
   8866         /* not None, NULL, long or unicode */
   8867         goto exit;
   8868     }
   8869     ret = 1;
   8870 
   8871   exit:
   8872     Py_DECREF(item);
   8873     return ret;
   8874 }
   8875 
   8876 /* Fast path for ascii => ascii translation. Return 1 if the whole string
   8877    was translated into writer, return 0 if the input string was partially
   8878    translated into writer, raise an exception and return -1 on error. */
   8879 static int
   8880 unicode_fast_translate(PyObject *input, PyObject *mapping,
   8881                        _PyUnicodeWriter *writer, int ignore,
   8882                        Py_ssize_t *input_pos)
   8883 {
   8884     Py_UCS1 ascii_table[128], ch, ch2;
   8885     Py_ssize_t len;
   8886     Py_UCS1 *in, *end, *out;
   8887     int res = 0;
   8888 
   8889     len = PyUnicode_GET_LENGTH(input);
   8890 
   8891     memset(ascii_table, 0xff, 128);
   8892 
   8893     in = PyUnicode_1BYTE_DATA(input);
   8894     end = in + len;
   8895 
   8896     assert(PyUnicode_IS_ASCII(writer->buffer));
   8897     assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
   8898     out = PyUnicode_1BYTE_DATA(writer->buffer);
   8899 
   8900     for (; in < end; in++) {
   8901         ch = *in;
   8902         ch2 = ascii_table[ch];
   8903         if (ch2 == 0xff) {
   8904             int translate = unicode_fast_translate_lookup(mapping, ch,
   8905                                                           ascii_table);
   8906             if (translate < 0)
   8907                 return -1;
   8908             if (translate == 0)
   8909                 goto exit;
   8910             ch2 = ascii_table[ch];
   8911         }
   8912         if (ch2 == 0xfe) {
   8913             if (ignore)
   8914                 continue;
   8915             goto exit;
   8916         }
   8917         assert(ch2 < 128);
   8918         *out = ch2;
   8919         out++;
   8920     }
   8921     res = 1;
   8922 
   8923 exit:
   8924     writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
   8925     *input_pos = in - PyUnicode_1BYTE_DATA(input);
   8926     return res;
   8927 }
   8928 
   8929 static PyObject *
   8930 _PyUnicode_TranslateCharmap(PyObject *input,
   8931                             PyObject *mapping,
   8932                             const char *errors)
   8933 {
   8934     /* input object */
   8935     char *data;
   8936     Py_ssize_t size, i;
   8937     int kind;
   8938     /* output buffer */
   8939     _PyUnicodeWriter writer;
   8940     /* error handler */
   8941     const char *reason = "character maps to <undefined>";
   8942     PyObject *errorHandler = NULL;
   8943     PyObject *exc = NULL;
   8944     int ignore;
   8945     int res;
   8946 
   8947     if (mapping == NULL) {
   8948         PyErr_BadArgument();
   8949         return NULL;
   8950     }
   8951 
   8952     if (PyUnicode_READY(input) == -1)
   8953         return NULL;
   8954     data = (char*)PyUnicode_DATA(input);
   8955     kind = PyUnicode_KIND(input);
   8956     size = PyUnicode_GET_LENGTH(input);
   8957 
   8958     if (size == 0)
   8959         return PyUnicode_FromObject(input);
   8960 
   8961     /* allocate enough for a simple 1:1 translation without
   8962        replacements, if we need more, we'll resize */
   8963     _PyUnicodeWriter_Init(&writer);
   8964     if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
   8965         goto onError;
   8966 
   8967     ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
   8968 
   8969     if (PyUnicode_READY(input) == -1)
   8970         return NULL;
   8971     if (PyUnicode_IS_ASCII(input)) {
   8972         res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
   8973         if (res < 0) {
   8974             _PyUnicodeWriter_Dealloc(&writer);
   8975             return NULL;
   8976         }
   8977         if (res == 1)
   8978             return _PyUnicodeWriter_Finish(&writer);
   8979     }
   8980     else {
   8981         i = 0;
   8982     }
   8983 
   8984     while (i<size) {
   8985         /* try to encode it */
   8986         int translate;
   8987         PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
   8988         Py_ssize_t newpos;
   8989         /* startpos for collecting untranslatable chars */
   8990         Py_ssize_t collstart;
   8991         Py_ssize_t collend;
   8992         Py_UCS4 ch;
   8993 
   8994         ch = PyUnicode_READ(kind, data, i);
   8995         translate = charmaptranslate_output(ch, mapping, &writer);
   8996         if (translate < 0)
   8997             goto onError;
   8998 
   8999         if (translate != 0) {
   9000             /* it worked => adjust input pointer */
   9001             ++i;
   9002             continue;
   9003         }
   9004 
   9005         /* untranslatable character */
   9006         collstart = i;
   9007         collend = i+1;
   9008 
   9009         /* find all untranslatable characters */
   9010         while (collend < size) {
   9011             PyObject *x;
   9012             ch = PyUnicode_READ(kind, data, collend);
   9013             if (charmaptranslate_lookup(ch, mapping, &x))
   9014                 goto onError;
   9015             Py_XDECREF(x);
   9016             if (x != Py_None)
   9017                 break;
   9018             ++collend;
   9019         }
   9020 
   9021         if (ignore) {
   9022             i = collend;
   9023         }
   9024         else {
   9025             repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
   9026                                                              reason, input, &exc,
   9027                                                              collstart, collend, &newpos);
   9028             if (repunicode == NULL)
   9029                 goto onError;
   9030             if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
   9031                 Py_DECREF(repunicode);
   9032                 goto onError;
   9033             }
   9034             Py_DECREF(repunicode);
   9035             i = newpos;
   9036         }
   9037     }
   9038     Py_XDECREF(exc);
   9039     Py_XDECREF(errorHandler);
   9040     return _PyUnicodeWriter_Finish(&writer);
   9041 
   9042   onError:
   9043     _PyUnicodeWriter_Dealloc(&writer);
   9044     Py_XDECREF(exc);
   9045     Py_XDECREF(errorHandler);
   9046     return NULL;
   9047 }
   9048 
   9049 /* Deprecated. Use PyUnicode_Translate instead. */
   9050 PyObject *
   9051 PyUnicode_TranslateCharmap(const Py_UNICODE *p,
   9052                            Py_ssize_t size,
   9053                            PyObject *mapping,
   9054                            const char *errors)
   9055 {
   9056     PyObject *result;
   9057     PyObject *unicode = PyUnicode_FromWideChar(p, size);
   9058     if (!unicode)
   9059         return NULL;
   9060     result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
   9061     Py_DECREF(unicode);
   9062     return result;
   9063 }
   9064 
   9065 PyObject *
   9066 PyUnicode_Translate(PyObject *str,
   9067                     PyObject *mapping,
   9068                     const char *errors)
   9069 {
   9070     if (ensure_unicode(str) < 0)
   9071         return NULL;
   9072     return _PyUnicode_TranslateCharmap(str, mapping, errors);
   9073 }
   9074 
   9075 PyObject *
   9076 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
   9077 {
   9078     if (!PyUnicode_Check(unicode)) {
   9079         PyErr_BadInternalCall();
   9080         return NULL;
   9081     }
   9082     if (PyUnicode_READY(unicode) == -1)
   9083         return NULL;
   9084     if (PyUnicode_IS_ASCII(unicode)) {
   9085         /* If the string is already ASCII, just return the same string */
   9086         Py_INCREF(unicode);
   9087         return unicode;
   9088     }
   9089 
   9090     Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
   9091     PyObject *result = PyUnicode_New(len, 127);
   9092     if (result == NULL) {
   9093         return NULL;
   9094     }
   9095 
   9096     Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
   9097     int kind = PyUnicode_KIND(unicode);
   9098     const void *data = PyUnicode_DATA(unicode);
   9099     Py_ssize_t i;
   9100     for (i = 0; i < len; ++i) {
   9101         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   9102         if (ch < 127) {
   9103             out[i] = ch;
   9104         }
   9105         else if (Py_UNICODE_ISSPACE(ch)) {
   9106             out[i] = ' ';
   9107         }
   9108         else {
   9109             int decimal = Py_UNICODE_TODECIMAL(ch);
   9110             if (decimal < 0) {
   9111                 out[i] = '?';
   9112                 out[i+1] = '\0';
   9113                 _PyUnicode_LENGTH(result) = i + 1;
   9114                 break;
   9115             }
   9116             out[i] = '0' + decimal;
   9117         }
   9118     }
   9119 
   9120     assert(_PyUnicode_CheckConsistency(result, 1));
   9121     return result;
   9122 }
   9123 
   9124 PyObject *
   9125 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
   9126                                   Py_ssize_t length)
   9127 {
   9128     PyObject *decimal;
   9129     Py_ssize_t i;
   9130     Py_UCS4 maxchar;
   9131     enum PyUnicode_Kind kind;
   9132     void *data;
   9133 
   9134     maxchar = 127;
   9135     for (i = 0; i < length; i++) {
   9136         Py_UCS4 ch = s[i];
   9137         if (ch > 127) {
   9138             int decimal = Py_UNICODE_TODECIMAL(ch);
   9139             if (decimal >= 0)
   9140                 ch = '0' + decimal;
   9141             maxchar = Py_MAX(maxchar, ch);
   9142         }
   9143     }
   9144 
   9145     /* Copy to a new string */
   9146     decimal = PyUnicode_New(length, maxchar);
   9147     if (decimal == NULL)
   9148         return decimal;
   9149     kind = PyUnicode_KIND(decimal);
   9150     data = PyUnicode_DATA(decimal);
   9151     /* Iterate over code points */
   9152     for (i = 0; i < length; i++) {
   9153         Py_UCS4 ch = s[i];
   9154         if (ch > 127) {
   9155             int decimal = Py_UNICODE_TODECIMAL(ch);
   9156             if (decimal >= 0)
   9157                 ch = '0' + decimal;
   9158         }
   9159         PyUnicode_WRITE(kind, data, i, ch);
   9160     }
   9161     return unicode_result(decimal);
   9162 }
   9163 /* --- Decimal Encoder ---------------------------------------------------- */
   9164 
   9165 int
   9166 PyUnicode_EncodeDecimal(Py_UNICODE *s,
   9167                         Py_ssize_t length,
   9168                         char *output,
   9169                         const char *errors)
   9170 {
   9171     PyObject *unicode;
   9172     Py_ssize_t i;
   9173     enum PyUnicode_Kind kind;
   9174     void *data;
   9175 
   9176     if (output == NULL) {
   9177         PyErr_BadArgument();
   9178         return -1;
   9179     }
   9180 
   9181     unicode = PyUnicode_FromWideChar(s, length);
   9182     if (unicode == NULL)
   9183         return -1;
   9184 
   9185     kind = PyUnicode_KIND(unicode);
   9186     data = PyUnicode_DATA(unicode);
   9187 
   9188     for (i=0; i < length; ) {
   9189         PyObject *exc;
   9190         Py_UCS4 ch;
   9191         int decimal;
   9192         Py_ssize_t startpos;
   9193 
   9194         ch = PyUnicode_READ(kind, data, i);
   9195 
   9196         if (Py_UNICODE_ISSPACE(ch)) {
   9197             *output++ = ' ';
   9198             i++;
   9199             continue;
   9200         }
   9201         decimal = Py_UNICODE_TODECIMAL(ch);
   9202         if (decimal >= 0) {
   9203             *output++ = '0' + decimal;
   9204             i++;
   9205             continue;
   9206         }
   9207         if (0 < ch && ch < 256) {
   9208             *output++ = (char)ch;
   9209             i++;
   9210             continue;
   9211         }
   9212 
   9213         startpos = i;
   9214         exc = NULL;
   9215         raise_encode_exception(&exc, "decimal", unicode,
   9216                                startpos, startpos+1,
   9217                                "invalid decimal Unicode string");
   9218         Py_XDECREF(exc);
   9219         Py_DECREF(unicode);
   9220         return -1;
   9221     }
   9222     /* 0-terminate the output string */
   9223     *output++ = '\0';
   9224     Py_DECREF(unicode);
   9225     return 0;
   9226 }
   9227 
   9228 /* --- Helpers ------------------------------------------------------------ */
   9229 
   9230 /* helper macro to fixup start/end slice values */
   9231 #define ADJUST_INDICES(start, end, len)         \
   9232     if (end > len)                              \
   9233         end = len;                              \
   9234     else if (end < 0) {                         \
   9235         end += len;                             \
   9236         if (end < 0)                            \
   9237             end = 0;                            \
   9238     }                                           \
   9239     if (start < 0) {                            \
   9240         start += len;                           \
   9241         if (start < 0)                          \
   9242             start = 0;                          \
   9243     }
   9244 
   9245 static Py_ssize_t
   9246 any_find_slice(PyObject* s1, PyObject* s2,
   9247                Py_ssize_t start,
   9248                Py_ssize_t end,
   9249                int direction)
   9250 {
   9251     int kind1, kind2;
   9252     void *buf1, *buf2;
   9253     Py_ssize_t len1, len2, result;
   9254 
   9255     kind1 = PyUnicode_KIND(s1);
   9256     kind2 = PyUnicode_KIND(s2);
   9257     if (kind1 < kind2)
   9258         return -1;
   9259 
   9260     len1 = PyUnicode_GET_LENGTH(s1);
   9261     len2 = PyUnicode_GET_LENGTH(s2);
   9262     ADJUST_INDICES(start, end, len1);
   9263     if (end - start < len2)
   9264         return -1;
   9265 
   9266     buf1 = PyUnicode_DATA(s1);
   9267     buf2 = PyUnicode_DATA(s2);
   9268     if (len2 == 1) {
   9269         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
   9270         result = findchar((const char *)buf1 + kind1*start,
   9271                           kind1, end - start, ch, direction);
   9272         if (result == -1)
   9273             return -1;
   9274         else
   9275             return start + result;
   9276     }
   9277 
   9278     if (kind2 != kind1) {
   9279         buf2 = _PyUnicode_AsKind(s2, kind1);
   9280         if (!buf2)
   9281             return -2;
   9282     }
   9283 
   9284     if (direction > 0) {
   9285         switch (kind1) {
   9286         case PyUnicode_1BYTE_KIND:
   9287             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
   9288                 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
   9289             else
   9290                 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
   9291             break;
   9292         case PyUnicode_2BYTE_KIND:
   9293             result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
   9294             break;
   9295         case PyUnicode_4BYTE_KIND:
   9296             result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
   9297             break;
   9298         default:
   9299             Py_UNREACHABLE();
   9300         }
   9301     }
   9302     else {
   9303         switch (kind1) {
   9304         case PyUnicode_1BYTE_KIND:
   9305             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
   9306                 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
   9307             else
   9308                 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
   9309             break;
   9310         case PyUnicode_2BYTE_KIND:
   9311             result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
   9312             break;
   9313         case PyUnicode_4BYTE_KIND:
   9314             result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
   9315             break;
   9316         default:
   9317             Py_UNREACHABLE();
   9318         }
   9319     }
   9320 
   9321     if (kind2 != kind1)
   9322         PyMem_Free(buf2);
   9323 
   9324     return result;
   9325 }
   9326 
   9327 /* _PyUnicode_InsertThousandsGrouping() helper functions */
   9328 #include "stringlib/localeutil.h"
   9329 
   9330 /**
   9331  * InsertThousandsGrouping:
   9332  * @writer: Unicode writer.
   9333  * @n_buffer: Number of characters in @buffer.
   9334  * @digits: Digits we're reading from. If count is non-NULL, this is unused.
   9335  * @d_pos: Start of digits string.
   9336  * @n_digits: The number of digits in the string, in which we want
   9337  *            to put the grouping chars.
   9338  * @min_width: The minimum width of the digits in the output string.
   9339  *             Output will be zero-padded on the left to fill.
   9340  * @grouping: see definition in localeconv().
   9341  * @thousands_sep: see definition in localeconv().
   9342  *
   9343  * There are 2 modes: counting and filling. If @writer is NULL,
   9344  *  we are in counting mode, else filling mode.
   9345  * If counting, the required buffer size is returned.
   9346  * If filling, we know the buffer will be large enough, so we don't
   9347  *  need to pass in the buffer size.
   9348  * Inserts thousand grouping characters (as defined by grouping and
   9349  *  thousands_sep) into @writer.
   9350  *
   9351  * Return value: -1 on error, number of characters otherwise.
   9352  **/
   9353 Py_ssize_t
   9354 _PyUnicode_InsertThousandsGrouping(
   9355     _PyUnicodeWriter *writer,
   9356     Py_ssize_t n_buffer,
   9357     PyObject *digits,
   9358     Py_ssize_t d_pos,
   9359     Py_ssize_t n_digits,
   9360     Py_ssize_t min_width,
   9361     const char *grouping,
   9362     PyObject *thousands_sep,
   9363     Py_UCS4 *maxchar)
   9364 {
   9365     min_width = Py_MAX(0, min_width);
   9366     if (writer) {
   9367         assert(digits != NULL);
   9368         assert(maxchar == NULL);
   9369     }
   9370     else {
   9371         assert(digits == NULL);
   9372         assert(maxchar != NULL);
   9373     }
   9374     assert(0 <= d_pos);
   9375     assert(0 <= n_digits);
   9376     assert(grouping != NULL);
   9377 
   9378     if (digits != NULL) {
   9379         if (PyUnicode_READY(digits) == -1) {
   9380             return -1;
   9381         }
   9382     }
   9383     if (PyUnicode_READY(thousands_sep) == -1) {
   9384         return -1;
   9385     }
   9386 
   9387     Py_ssize_t count = 0;
   9388     Py_ssize_t n_zeros;
   9389     int loop_broken = 0;
   9390     int use_separator = 0; /* First time through, don't append the
   9391                               separator. They only go between
   9392                               groups. */
   9393     Py_ssize_t buffer_pos;
   9394     Py_ssize_t digits_pos;
   9395     Py_ssize_t len;
   9396     Py_ssize_t n_chars;
   9397     Py_ssize_t remaining = n_digits; /* Number of chars remaining to
   9398                                         be looked at */
   9399     /* A generator that returns all of the grouping widths, until it
   9400        returns 0. */
   9401     GroupGenerator groupgen;
   9402     GroupGenerator_init(&groupgen, grouping);
   9403     const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
   9404 
   9405     /* if digits are not grouped, thousands separator
   9406        should be an empty string */
   9407     assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
   9408 
   9409     digits_pos = d_pos + n_digits;
   9410     if (writer) {
   9411         buffer_pos = writer->pos + n_buffer;
   9412         assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
   9413         assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
   9414     }
   9415     else {
   9416         buffer_pos = n_buffer;
   9417     }
   9418 
   9419     if (!writer) {
   9420         *maxchar = 127;
   9421     }
   9422 
   9423     while ((len = GroupGenerator_next(&groupgen)) > 0) {
   9424         len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
   9425         n_zeros = Py_MAX(0, len - remaining);
   9426         n_chars = Py_MAX(0, Py_MIN(remaining, len));
   9427 
   9428         /* Use n_zero zero's and n_chars chars */
   9429 
   9430         /* Count only, don't do anything. */
   9431         count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
   9432 
   9433         /* Copy into the writer. */
   9434         InsertThousandsGrouping_fill(writer, &buffer_pos,
   9435                                      digits, &digits_pos,
   9436                                      n_chars, n_zeros,
   9437                                      use_separator ? thousands_sep : NULL,
   9438                                      thousands_sep_len, maxchar);
   9439 
   9440         /* Use a separator next time. */
   9441         use_separator = 1;
   9442 
   9443         remaining -= n_chars;
   9444         min_width -= len;
   9445 
   9446         if (remaining <= 0 && min_width <= 0) {
   9447             loop_broken = 1;
   9448             break;
   9449         }
   9450         min_width -= thousands_sep_len;
   9451     }
   9452     if (!loop_broken) {
   9453         /* We left the loop without using a break statement. */
   9454 
   9455         len = Py_MAX(Py_MAX(remaining, min_width), 1);
   9456         n_zeros = Py_MAX(0, len - remaining);
   9457         n_chars = Py_MAX(0, Py_MIN(remaining, len));
   9458 
   9459         /* Use n_zero zero's and n_chars chars */
   9460         count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
   9461 
   9462         /* Copy into the writer. */
   9463         InsertThousandsGrouping_fill(writer, &buffer_pos,
   9464                                      digits, &digits_pos,
   9465                                      n_chars, n_zeros,
   9466                                      use_separator ? thousands_sep : NULL,
   9467                                      thousands_sep_len, maxchar);
   9468     }
   9469     return count;
   9470 }
   9471 
   9472 
   9473 Py_ssize_t
   9474 PyUnicode_Count(PyObject *str,
   9475                 PyObject *substr,
   9476                 Py_ssize_t start,
   9477                 Py_ssize_t end)
   9478 {
   9479     Py_ssize_t result;
   9480     int kind1, kind2;
   9481     void *buf1 = NULL, *buf2 = NULL;
   9482     Py_ssize_t len1, len2;
   9483 
   9484     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
   9485         return -1;
   9486 
   9487     kind1 = PyUnicode_KIND(str);
   9488     kind2 = PyUnicode_KIND(substr);
   9489     if (kind1 < kind2)
   9490         return 0;
   9491 
   9492     len1 = PyUnicode_GET_LENGTH(str);
   9493     len2 = PyUnicode_GET_LENGTH(substr);
   9494     ADJUST_INDICES(start, end, len1);
   9495     if (end - start < len2)
   9496         return 0;
   9497 
   9498     buf1 = PyUnicode_DATA(str);
   9499     buf2 = PyUnicode_DATA(substr);
   9500     if (kind2 != kind1) {
   9501         buf2 = _PyUnicode_AsKind(substr, kind1);
   9502         if (!buf2)
   9503             goto onError;
   9504     }
   9505 
   9506     switch (kind1) {
   9507     case PyUnicode_1BYTE_KIND:
   9508         if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
   9509             result = asciilib_count(
   9510                 ((Py_UCS1*)buf1) + start, end - start,
   9511                 buf2, len2, PY_SSIZE_T_MAX
   9512                 );
   9513         else
   9514             result = ucs1lib_count(
   9515                 ((Py_UCS1*)buf1) + start, end - start,
   9516                 buf2, len2, PY_SSIZE_T_MAX
   9517                 );
   9518         break;
   9519     case PyUnicode_2BYTE_KIND:
   9520         result = ucs2lib_count(
   9521             ((Py_UCS2*)buf1) + start, end - start,
   9522             buf2, len2, PY_SSIZE_T_MAX
   9523             );
   9524         break;
   9525     case PyUnicode_4BYTE_KIND:
   9526         result = ucs4lib_count(
   9527             ((Py_UCS4*)buf1) + start, end - start,
   9528             buf2, len2, PY_SSIZE_T_MAX
   9529             );
   9530         break;
   9531     default:
   9532         Py_UNREACHABLE();
   9533     }
   9534 
   9535     if (kind2 != kind1)
   9536         PyMem_Free(buf2);
   9537 
   9538     return result;
   9539   onError:
   9540     if (kind2 != kind1 && buf2)
   9541         PyMem_Free(buf2);
   9542     return -1;
   9543 }
   9544 
   9545 Py_ssize_t
   9546 PyUnicode_Find(PyObject *str,
   9547                PyObject *substr,
   9548                Py_ssize_t start,
   9549                Py_ssize_t end,
   9550                int direction)
   9551 {
   9552     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
   9553         return -2;
   9554 
   9555     return any_find_slice(str, substr, start, end, direction);
   9556 }
   9557 
   9558 Py_ssize_t
   9559 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
   9560                    Py_ssize_t start, Py_ssize_t end,
   9561                    int direction)
   9562 {
   9563     int kind;
   9564     Py_ssize_t len, result;
   9565     if (PyUnicode_READY(str) == -1)
   9566         return -2;
   9567     len = PyUnicode_GET_LENGTH(str);
   9568     ADJUST_INDICES(start, end, len);
   9569     if (end - start < 1)
   9570         return -1;
   9571     kind = PyUnicode_KIND(str);
   9572     result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
   9573                       kind, end-start, ch, direction);
   9574     if (result == -1)
   9575         return -1;
   9576     else
   9577         return start + result;
   9578 }
   9579 
   9580 static int
   9581 tailmatch(PyObject *self,
   9582           PyObject *substring,
   9583           Py_ssize_t start,
   9584           Py_ssize_t end,
   9585           int direction)
   9586 {
   9587     int kind_self;
   9588     int kind_sub;
   9589     void *data_self;
   9590     void *data_sub;
   9591     Py_ssize_t offset;
   9592     Py_ssize_t i;
   9593     Py_ssize_t end_sub;
   9594 
   9595     if (PyUnicode_READY(self) == -1 ||
   9596         PyUnicode_READY(substring) == -1)
   9597         return -1;
   9598 
   9599     ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
   9600     end -= PyUnicode_GET_LENGTH(substring);
   9601     if (end < start)
   9602         return 0;
   9603 
   9604     if (PyUnicode_GET_LENGTH(substring) == 0)
   9605         return 1;
   9606 
   9607     kind_self = PyUnicode_KIND(self);
   9608     data_self = PyUnicode_DATA(self);
   9609     kind_sub = PyUnicode_KIND(substring);
   9610     data_sub = PyUnicode_DATA(substring);
   9611     end_sub = PyUnicode_GET_LENGTH(substring) - 1;
   9612 
   9613     if (direction > 0)
   9614         offset = end;
   9615     else
   9616         offset = start;
   9617 
   9618     if (PyUnicode_READ(kind_self, data_self, offset) ==
   9619         PyUnicode_READ(kind_sub, data_sub, 0) &&
   9620         PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
   9621         PyUnicode_READ(kind_sub, data_sub, end_sub)) {
   9622         /* If both are of the same kind, memcmp is sufficient */
   9623         if (kind_self == kind_sub) {
   9624             return ! memcmp((char *)data_self +
   9625                                 (offset * PyUnicode_KIND(substring)),
   9626                             data_sub,
   9627                             PyUnicode_GET_LENGTH(substring) *
   9628                                 PyUnicode_KIND(substring));
   9629         }
   9630         /* otherwise we have to compare each character by first accessing it */
   9631         else {
   9632             /* We do not need to compare 0 and len(substring)-1 because
   9633                the if statement above ensured already that they are equal
   9634                when we end up here. */
   9635             for (i = 1; i < end_sub; ++i) {
   9636                 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
   9637                     PyUnicode_READ(kind_sub, data_sub, i))
   9638                     return 0;
   9639             }
   9640             return 1;
   9641         }
   9642     }
   9643 
   9644     return 0;
   9645 }
   9646 
   9647 Py_ssize_t
   9648 PyUnicode_Tailmatch(PyObject *str,
   9649                     PyObject *substr,
   9650                     Py_ssize_t start,
   9651                     Py_ssize_t end,
   9652                     int direction)
   9653 {
   9654     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
   9655         return -1;
   9656 
   9657     return tailmatch(str, substr, start, end, direction);
   9658 }
   9659 
   9660 static PyObject *
   9661 ascii_upper_or_lower(PyObject *self, int lower)
   9662 {
   9663     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
   9664     char *resdata, *data = PyUnicode_DATA(self);
   9665     PyObject *res;
   9666 
   9667     res = PyUnicode_New(len, 127);
   9668     if (res == NULL)
   9669         return NULL;
   9670     resdata = PyUnicode_DATA(res);
   9671     if (lower)
   9672         _Py_bytes_lower(resdata, data, len);
   9673     else
   9674         _Py_bytes_upper(resdata, data, len);
   9675     return res;
   9676 }
   9677 
   9678 static Py_UCS4
   9679 handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
   9680 {
   9681     Py_ssize_t j;
   9682     int final_sigma;
   9683     Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
   9684     /* U+03A3 is in the Final_Sigma context when, it is found like this:
   9685 
   9686      \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
   9687 
   9688     where ! is a negation and \p{xxx} is a character with property xxx.
   9689     */
   9690     for (j = i - 1; j >= 0; j--) {
   9691         c = PyUnicode_READ(kind, data, j);
   9692         if (!_PyUnicode_IsCaseIgnorable(c))
   9693             break;
   9694     }
   9695     final_sigma = j >= 0 && _PyUnicode_IsCased(c);
   9696     if (final_sigma) {
   9697         for (j = i + 1; j < length; j++) {
   9698             c = PyUnicode_READ(kind, data, j);
   9699             if (!_PyUnicode_IsCaseIgnorable(c))
   9700                 break;
   9701         }
   9702         final_sigma = j == length || !_PyUnicode_IsCased(c);
   9703     }
   9704     return (final_sigma) ? 0x3C2 : 0x3C3;
   9705 }
   9706 
   9707 static int
   9708 lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
   9709            Py_UCS4 c, Py_UCS4 *mapped)
   9710 {
   9711     /* Obscure special case. */
   9712     if (c == 0x3A3) {
   9713         mapped[0] = handle_capital_sigma(kind, data, length, i);
   9714         return 1;
   9715     }
   9716     return _PyUnicode_ToLowerFull(c, mapped);
   9717 }
   9718 
   9719 static Py_ssize_t
   9720 do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
   9721 {
   9722     Py_ssize_t i, k = 0;
   9723     int n_res, j;
   9724     Py_UCS4 c, mapped[3];
   9725 
   9726     c = PyUnicode_READ(kind, data, 0);
   9727     n_res = _PyUnicode_ToUpperFull(c, mapped);
   9728     for (j = 0; j < n_res; j++) {
   9729         *maxchar = Py_MAX(*maxchar, mapped[j]);
   9730         res[k++] = mapped[j];
   9731     }
   9732     for (i = 1; i < length; i++) {
   9733         c = PyUnicode_READ(kind, data, i);
   9734         n_res = lower_ucs4(kind, data, length, i, c, mapped);
   9735         for (j = 0; j < n_res; j++) {
   9736             *maxchar = Py_MAX(*maxchar, mapped[j]);
   9737             res[k++] = mapped[j];
   9738         }
   9739     }
   9740     return k;
   9741 }
   9742 
   9743 static Py_ssize_t
   9744 do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
   9745     Py_ssize_t i, k = 0;
   9746 
   9747     for (i = 0; i < length; i++) {
   9748         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
   9749         int n_res, j;
   9750         if (Py_UNICODE_ISUPPER(c)) {
   9751             n_res = lower_ucs4(kind, data, length, i, c, mapped);
   9752         }
   9753         else if (Py_UNICODE_ISLOWER(c)) {
   9754             n_res = _PyUnicode_ToUpperFull(c, mapped);
   9755         }
   9756         else {
   9757             n_res = 1;
   9758             mapped[0] = c;
   9759         }
   9760         for (j = 0; j < n_res; j++) {
   9761             *maxchar = Py_MAX(*maxchar, mapped[j]);
   9762             res[k++] = mapped[j];
   9763         }
   9764     }
   9765     return k;
   9766 }
   9767 
   9768 static Py_ssize_t
   9769 do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
   9770                   Py_UCS4 *maxchar, int lower)
   9771 {
   9772     Py_ssize_t i, k = 0;
   9773 
   9774     for (i = 0; i < length; i++) {
   9775         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
   9776         int n_res, j;
   9777         if (lower)
   9778             n_res = lower_ucs4(kind, data, length, i, c, mapped);
   9779         else
   9780             n_res = _PyUnicode_ToUpperFull(c, mapped);
   9781         for (j = 0; j < n_res; j++) {
   9782             *maxchar = Py_MAX(*maxchar, mapped[j]);
   9783             res[k++] = mapped[j];
   9784         }
   9785     }
   9786     return k;
   9787 }
   9788 
   9789 static Py_ssize_t
   9790 do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
   9791 {
   9792     return do_upper_or_lower(kind, data, length, res, maxchar, 0);
   9793 }
   9794 
   9795 static Py_ssize_t
   9796 do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
   9797 {
   9798     return do_upper_or_lower(kind, data, length, res, maxchar, 1);
   9799 }
   9800 
   9801 static Py_ssize_t
   9802 do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
   9803 {
   9804     Py_ssize_t i, k = 0;
   9805 
   9806     for (i = 0; i < length; i++) {
   9807         Py_UCS4 c = PyUnicode_READ(kind, data, i);
   9808         Py_UCS4 mapped[3];
   9809         int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
   9810         for (j = 0; j < n_res; j++) {
   9811             *maxchar = Py_MAX(*maxchar, mapped[j]);
   9812             res[k++] = mapped[j];
   9813         }
   9814     }
   9815     return k;
   9816 }
   9817 
   9818 static Py_ssize_t
   9819 do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
   9820 {
   9821     Py_ssize_t i, k = 0;
   9822     int previous_is_cased;
   9823 
   9824     previous_is_cased = 0;
   9825     for (i = 0; i < length; i++) {
   9826         const Py_UCS4 c = PyUnicode_READ(kind, data, i);
   9827         Py_UCS4 mapped[3];
   9828         int n_res, j;
   9829 
   9830         if (previous_is_cased)
   9831             n_res = lower_ucs4(kind, data, length, i, c, mapped);
   9832         else
   9833             n_res = _PyUnicode_ToTitleFull(c, mapped);
   9834 
   9835         for (j = 0; j < n_res; j++) {
   9836             *maxchar = Py_MAX(*maxchar, mapped[j]);
   9837             res[k++] = mapped[j];
   9838         }
   9839 
   9840         previous_is_cased = _PyUnicode_IsCased(c);
   9841     }
   9842     return k;
   9843 }
   9844 
   9845 static PyObject *
   9846 case_operation(PyObject *self,
   9847                Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
   9848 {
   9849     PyObject *res = NULL;
   9850     Py_ssize_t length, newlength = 0;
   9851     int kind, outkind;
   9852     void *data, *outdata;
   9853     Py_UCS4 maxchar = 0, *tmp, *tmpend;
   9854 
   9855     assert(PyUnicode_IS_READY(self));
   9856 
   9857     kind = PyUnicode_KIND(self);
   9858     data = PyUnicode_DATA(self);
   9859     length = PyUnicode_GET_LENGTH(self);
   9860     if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
   9861         PyErr_SetString(PyExc_OverflowError, "string is too long");
   9862         return NULL;
   9863     }
   9864     tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
   9865     if (tmp == NULL)
   9866         return PyErr_NoMemory();
   9867     newlength = perform(kind, data, length, tmp, &maxchar);
   9868     res = PyUnicode_New(newlength, maxchar);
   9869     if (res == NULL)
   9870         goto leave;
   9871     tmpend = tmp + newlength;
   9872     outdata = PyUnicode_DATA(res);
   9873     outkind = PyUnicode_KIND(res);
   9874     switch (outkind) {
   9875     case PyUnicode_1BYTE_KIND:
   9876         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
   9877         break;
   9878     case PyUnicode_2BYTE_KIND:
   9879         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
   9880         break;
   9881     case PyUnicode_4BYTE_KIND:
   9882         memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
   9883         break;
   9884     default:
   9885         Py_UNREACHABLE();
   9886     }
   9887   leave:
   9888     PyMem_FREE(tmp);
   9889     return res;
   9890 }
   9891 
   9892 PyObject *
   9893 PyUnicode_Join(PyObject *separator, PyObject *seq)
   9894 {
   9895     PyObject *res;
   9896     PyObject *fseq;
   9897     Py_ssize_t seqlen;
   9898     PyObject **items;
   9899 
   9900     fseq = PySequence_Fast(seq, "can only join an iterable");
   9901     if (fseq == NULL) {
   9902         return NULL;
   9903     }
   9904 
   9905     /* NOTE: the following code can't call back into Python code,
   9906      * so we are sure that fseq won't be mutated.
   9907      */
   9908 
   9909     items = PySequence_Fast_ITEMS(fseq);
   9910     seqlen = PySequence_Fast_GET_SIZE(fseq);
   9911     res = _PyUnicode_JoinArray(separator, items, seqlen);
   9912     Py_DECREF(fseq);
   9913     return res;
   9914 }
   9915 
   9916 PyObject *
   9917 _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
   9918 {
   9919     PyObject *res = NULL; /* the result */
   9920     PyObject *sep = NULL;
   9921     Py_ssize_t seplen;
   9922     PyObject *item;
   9923     Py_ssize_t sz, i, res_offset;
   9924     Py_UCS4 maxchar;
   9925     Py_UCS4 item_maxchar;
   9926     int use_memcpy;
   9927     unsigned char *res_data = NULL, *sep_data = NULL;
   9928     PyObject *last_obj;
   9929     unsigned int kind = 0;
   9930 
   9931     /* If empty sequence, return u"". */
   9932     if (seqlen == 0) {
   9933         _Py_RETURN_UNICODE_EMPTY();
   9934     }
   9935 
   9936     /* If singleton sequence with an exact Unicode, return that. */
   9937     last_obj = NULL;
   9938     if (seqlen == 1) {
   9939         if (PyUnicode_CheckExact(items[0])) {
   9940             res = items[0];
   9941             Py_INCREF(res);
   9942             return res;
   9943         }
   9944         seplen = 0;
   9945         maxchar = 0;
   9946     }
   9947     else {
   9948         /* Set up sep and seplen */
   9949         if (separator == NULL) {
   9950             /* fall back to a blank space separator */
   9951             sep = PyUnicode_FromOrdinal(' ');
   9952             if (!sep)
   9953                 goto onError;
   9954             seplen = 1;
   9955             maxchar = 32;
   9956         }
   9957         else {
   9958             if (!PyUnicode_Check(separator)) {
   9959                 PyErr_Format(PyExc_TypeError,
   9960                              "separator: expected str instance,"
   9961                              " %.80s found",
   9962                              Py_TYPE(separator)->tp_name);
   9963                 goto onError;
   9964             }
   9965             if (PyUnicode_READY(separator))
   9966                 goto onError;
   9967             sep = separator;
   9968             seplen = PyUnicode_GET_LENGTH(separator);
   9969             maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
   9970             /* inc refcount to keep this code path symmetric with the
   9971                above case of a blank separator */
   9972             Py_INCREF(sep);
   9973         }
   9974         last_obj = sep;
   9975     }
   9976 
   9977     /* There are at least two things to join, or else we have a subclass
   9978      * of str in the sequence.
   9979      * Do a pre-pass to figure out the total amount of space we'll
   9980      * need (sz), and see whether all argument are strings.
   9981      */
   9982     sz = 0;
   9983 #ifdef Py_DEBUG
   9984     use_memcpy = 0;
   9985 #else
   9986     use_memcpy = 1;
   9987 #endif
   9988     for (i = 0; i < seqlen; i++) {
   9989         size_t add_sz;
   9990         item = items[i];
   9991         if (!PyUnicode_Check(item)) {
   9992             PyErr_Format(PyExc_TypeError,
   9993                          "sequence item %zd: expected str instance,"
   9994                          " %.80s found",
   9995                          i, Py_TYPE(item)->tp_name);
   9996             goto onError;
   9997         }
   9998         if (PyUnicode_READY(item) == -1)
   9999             goto onError;
   10000         add_sz = PyUnicode_GET_LENGTH(item);
   10001         item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
   10002         maxchar = Py_MAX(maxchar, item_maxchar);
   10003         if (i != 0) {
   10004             add_sz += seplen;
   10005         }
   10006         if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
   10007             PyErr_SetString(PyExc_OverflowError,
   10008                             "join() result is too long for a Python string");
   10009             goto onError;
   10010         }
   10011         sz += add_sz;
   10012         if (use_memcpy && last_obj != NULL) {
   10013             if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
   10014                 use_memcpy = 0;
   10015         }
   10016         last_obj = item;
   10017     }
   10018 
   10019     res = PyUnicode_New(sz, maxchar);
   10020     if (res == NULL)
   10021         goto onError;
   10022 
   10023     /* Catenate everything. */
   10024 #ifdef Py_DEBUG
   10025     use_memcpy = 0;
   10026 #else
   10027     if (use_memcpy) {
   10028         res_data = PyUnicode_1BYTE_DATA(res);
   10029         kind = PyUnicode_KIND(res);
   10030         if (seplen != 0)
   10031             sep_data = PyUnicode_1BYTE_DATA(sep);
   10032     }
   10033 #endif
   10034     if (use_memcpy) {
   10035         for (i = 0; i < seqlen; ++i) {
   10036             Py_ssize_t itemlen;
   10037             item = items[i];
   10038 
   10039             /* Copy item, and maybe the separator. */
   10040             if (i && seplen != 0) {
   10041                 memcpy(res_data,
   10042                           sep_data,
   10043                           kind * seplen);
   10044                 res_data += kind * seplen;
   10045             }
   10046 
   10047             itemlen = PyUnicode_GET_LENGTH(item);
   10048             if (itemlen != 0) {
   10049                 memcpy(res_data,
   10050                           PyUnicode_DATA(item),
   10051                           kind * itemlen);
   10052                 res_data += kind * itemlen;
   10053             }
   10054         }
   10055         assert(res_data == PyUnicode_1BYTE_DATA(res)
   10056                            + kind * PyUnicode_GET_LENGTH(res));
   10057     }
   10058     else {
   10059         for (i = 0, res_offset = 0; i < seqlen; ++i) {
   10060             Py_ssize_t itemlen;
   10061             item = items[i];
   10062 
   10063             /* Copy item, and maybe the separator. */
   10064             if (i && seplen != 0) {
   10065                 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
   10066                 res_offset += seplen;
   10067             }
   10068 
   10069             itemlen = PyUnicode_GET_LENGTH(item);
   10070             if (itemlen != 0) {
   10071                 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
   10072                 res_offset += itemlen;
   10073             }
   10074         }
   10075         assert(res_offset == PyUnicode_GET_LENGTH(res));
   10076     }
   10077 
   10078     Py_XDECREF(sep);
   10079     assert(_PyUnicode_CheckConsistency(res, 1));
   10080     return res;
   10081 
   10082   onError:
   10083     Py_XDECREF(sep);
   10084     Py_XDECREF(res);
   10085     return NULL;
   10086 }
   10087 
   10088 void
   10089 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
   10090                     Py_UCS4 fill_char)
   10091 {
   10092     const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
   10093     void *data = PyUnicode_DATA(unicode);
   10094     assert(PyUnicode_IS_READY(unicode));
   10095     assert(unicode_modifiable(unicode));
   10096     assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
   10097     assert(start >= 0);
   10098     assert(start + length <= PyUnicode_GET_LENGTH(unicode));
   10099     FILL(kind, data, fill_char, start, length);
   10100 }
   10101 
   10102 Py_ssize_t
   10103 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
   10104                Py_UCS4 fill_char)
   10105 {
   10106     Py_ssize_t maxlen;
   10107 
   10108     if (!PyUnicode_Check(unicode)) {
   10109         PyErr_BadInternalCall();
   10110         return -1;
   10111     }
   10112     if (PyUnicode_READY(unicode) == -1)
   10113         return -1;
   10114     if (unicode_check_modifiable(unicode))
   10115         return -1;
   10116 
   10117     if (start < 0) {
   10118         PyErr_SetString(PyExc_IndexError, "string index out of range");
   10119         return -1;
   10120     }
   10121     if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
   10122         PyErr_SetString(PyExc_ValueError,
   10123                          "fill character is bigger than "
   10124                          "the string maximum character");
   10125         return -1;
   10126     }
   10127 
   10128     maxlen = PyUnicode_GET_LENGTH(unicode) - start;
   10129     length = Py_MIN(maxlen, length);
   10130     if (length <= 0)
   10131         return 0;
   10132 
   10133     _PyUnicode_FastFill(unicode, start, length, fill_char);
   10134     return length;
   10135 }
   10136 
   10137 static PyObject *
   10138 pad(PyObject *self,
   10139     Py_ssize_t left,
   10140     Py_ssize_t right,
   10141     Py_UCS4 fill)
   10142 {
   10143     PyObject *u;
   10144     Py_UCS4 maxchar;
   10145     int kind;
   10146     void *data;
   10147 
   10148     if (left < 0)
   10149         left = 0;
   10150     if (right < 0)
   10151         right = 0;
   10152 
   10153     if (left == 0 && right == 0)
   10154         return unicode_result_unchanged(self);
   10155 
   10156     if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
   10157         right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
   10158         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
   10159         return NULL;
   10160     }
   10161     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
   10162     maxchar = Py_MAX(maxchar, fill);
   10163     u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
   10164     if (!u)
   10165         return NULL;
   10166 
   10167     kind = PyUnicode_KIND(u);
   10168     data = PyUnicode_DATA(u);
   10169     if (left)
   10170         FILL(kind, data, fill, 0, left);
   10171     if (right)
   10172         FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
   10173     _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
   10174     assert(_PyUnicode_CheckConsistency(u, 1));
   10175     return u;
   10176 }
   10177 
   10178 PyObject *
   10179 PyUnicode_Splitlines(PyObject *string, int keepends)
   10180 {
   10181     PyObject *list;
   10182 
   10183     if (ensure_unicode(string) < 0)
   10184         return NULL;
   10185 
   10186     switch (PyUnicode_KIND(string)) {
   10187     case PyUnicode_1BYTE_KIND:
   10188         if (PyUnicode_IS_ASCII(string))
   10189             list = asciilib_splitlines(
   10190                 string, PyUnicode_1BYTE_DATA(string),
   10191                 PyUnicode_GET_LENGTH(string), keepends);
   10192         else
   10193             list = ucs1lib_splitlines(
   10194                 string, PyUnicode_1BYTE_DATA(string),
   10195                 PyUnicode_GET_LENGTH(string), keepends);
   10196         break;
   10197     case PyUnicode_2BYTE_KIND:
   10198         list = ucs2lib_splitlines(
   10199             string, PyUnicode_2BYTE_DATA(string),
   10200             PyUnicode_GET_LENGTH(string), keepends);
   10201         break;
   10202     case PyUnicode_4BYTE_KIND:
   10203         list = ucs4lib_splitlines(
   10204             string, PyUnicode_4BYTE_DATA(string),
   10205             PyUnicode_GET_LENGTH(string), keepends);
   10206         break;
   10207     default:
   10208         Py_UNREACHABLE();
   10209     }
   10210     return list;
   10211 }
   10212 
   10213 static PyObject *
   10214 split(PyObject *self,
   10215       PyObject *substring,
   10216       Py_ssize_t maxcount)
   10217 {
   10218     int kind1, kind2;
   10219     void *buf1, *buf2;
   10220     Py_ssize_t len1, len2;
   10221     PyObject* out;
   10222 
   10223     if (maxcount < 0)
   10224         maxcount = PY_SSIZE_T_MAX;
   10225 
   10226     if (PyUnicode_READY(self) == -1)
   10227         return NULL;
   10228 
   10229     if (substring == NULL)
   10230         switch (PyUnicode_KIND(self)) {
   10231         case PyUnicode_1BYTE_KIND:
   10232             if (PyUnicode_IS_ASCII(self))
   10233                 return asciilib_split_whitespace(
   10234                     self,  PyUnicode_1BYTE_DATA(self),
   10235                     PyUnicode_GET_LENGTH(self), maxcount
   10236                     );
   10237             else
   10238                 return ucs1lib_split_whitespace(
   10239                     self,  PyUnicode_1BYTE_DATA(self),
   10240                     PyUnicode_GET_LENGTH(self), maxcount
   10241                     );
   10242         case PyUnicode_2BYTE_KIND:
   10243             return ucs2lib_split_whitespace(
   10244                 self,  PyUnicode_2BYTE_DATA(self),
   10245                 PyUnicode_GET_LENGTH(self), maxcount
   10246                 );
   10247         case PyUnicode_4BYTE_KIND:
   10248             return ucs4lib_split_whitespace(
   10249                 self,  PyUnicode_4BYTE_DATA(self),
   10250                 PyUnicode_GET_LENGTH(self), maxcount
   10251                 );
   10252         default:
   10253             Py_UNREACHABLE();
   10254         }
   10255 
   10256     if (PyUnicode_READY(substring) == -1)
   10257         return NULL;
   10258 
   10259     kind1 = PyUnicode_KIND(self);
   10260     kind2 = PyUnicode_KIND(substring);
   10261     len1 = PyUnicode_GET_LENGTH(self);
   10262     len2 = PyUnicode_GET_LENGTH(substring);
   10263     if (kind1 < kind2 || len1 < len2) {
   10264         out = PyList_New(1);
   10265         if (out == NULL)
   10266             return NULL;
   10267         Py_INCREF(self);
   10268         PyList_SET_ITEM(out, 0, self);
   10269         return out;
   10270     }
   10271     buf1 = PyUnicode_DATA(self);
   10272     buf2 = PyUnicode_DATA(substring);
   10273     if (kind2 != kind1) {
   10274         buf2 = _PyUnicode_AsKind(substring, kind1);
   10275         if (!buf2)
   10276             return NULL;
   10277     }
   10278 
   10279     switch (kind1) {
   10280     case PyUnicode_1BYTE_KIND:
   10281         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
   10282             out = asciilib_split(
   10283                 self,  buf1, len1, buf2, len2, maxcount);
   10284         else
   10285             out = ucs1lib_split(
   10286                 self,  buf1, len1, buf2, len2, maxcount);
   10287         break;
   10288     case PyUnicode_2BYTE_KIND:
   10289         out = ucs2lib_split(
   10290             self,  buf1, len1, buf2, len2, maxcount);
   10291         break;
   10292     case PyUnicode_4BYTE_KIND:
   10293         out = ucs4lib_split(
   10294             self,  buf1, len1, buf2, len2, maxcount);
   10295         break;
   10296     default:
   10297         out = NULL;
   10298     }
   10299     if (kind2 != kind1)
   10300         PyMem_Free(buf2);
   10301     return out;
   10302 }
   10303 
   10304 static PyObject *
   10305 rsplit(PyObject *self,
   10306        PyObject *substring,
   10307        Py_ssize_t maxcount)
   10308 {
   10309     int kind1, kind2;
   10310     void *buf1, *buf2;
   10311     Py_ssize_t len1, len2;
   10312     PyObject* out;
   10313 
   10314     if (maxcount < 0)
   10315         maxcount = PY_SSIZE_T_MAX;
   10316 
   10317     if (PyUnicode_READY(self) == -1)
   10318         return NULL;
   10319 
   10320     if (substring == NULL)
   10321         switch (PyUnicode_KIND(self)) {
   10322         case PyUnicode_1BYTE_KIND:
   10323             if (PyUnicode_IS_ASCII(self))
   10324                 return asciilib_rsplit_whitespace(
   10325                     self,  PyUnicode_1BYTE_DATA(self),
   10326                     PyUnicode_GET_LENGTH(self), maxcount
   10327                     );
   10328             else
   10329                 return ucs1lib_rsplit_whitespace(
   10330                     self,  PyUnicode_1BYTE_DATA(self),
   10331                     PyUnicode_GET_LENGTH(self), maxcount
   10332                     );
   10333         case PyUnicode_2BYTE_KIND:
   10334             return ucs2lib_rsplit_whitespace(
   10335                 self,  PyUnicode_2BYTE_DATA(self),
   10336                 PyUnicode_GET_LENGTH(self), maxcount
   10337                 );
   10338         case PyUnicode_4BYTE_KIND:
   10339             return ucs4lib_rsplit_whitespace(
   10340                 self,  PyUnicode_4BYTE_DATA(self),
   10341                 PyUnicode_GET_LENGTH(self), maxcount
   10342                 );
   10343         default:
   10344             Py_UNREACHABLE();
   10345         }
   10346 
   10347     if (PyUnicode_READY(substring) == -1)
   10348         return NULL;
   10349 
   10350     kind1 = PyUnicode_KIND(self);
   10351     kind2 = PyUnicode_KIND(substring);
   10352     len1 = PyUnicode_GET_LENGTH(self);
   10353     len2 = PyUnicode_GET_LENGTH(substring);
   10354     if (kind1 < kind2 || len1 < len2) {
   10355         out = PyList_New(1);
   10356         if (out == NULL)
   10357             return NULL;
   10358         Py_INCREF(self);
   10359         PyList_SET_ITEM(out, 0, self);
   10360         return out;
   10361     }
   10362     buf1 = PyUnicode_DATA(self);
   10363     buf2 = PyUnicode_DATA(substring);
   10364     if (kind2 != kind1) {
   10365         buf2 = _PyUnicode_AsKind(substring, kind1);
   10366         if (!buf2)
   10367             return NULL;
   10368     }
   10369 
   10370     switch (kind1) {
   10371     case PyUnicode_1BYTE_KIND:
   10372         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
   10373             out = asciilib_rsplit(
   10374                 self,  buf1, len1, buf2, len2, maxcount);
   10375         else
   10376             out = ucs1lib_rsplit(
   10377                 self,  buf1, len1, buf2, len2, maxcount);
   10378         break;
   10379     case PyUnicode_2BYTE_KIND:
   10380         out = ucs2lib_rsplit(
   10381             self,  buf1, len1, buf2, len2, maxcount);
   10382         break;
   10383     case PyUnicode_4BYTE_KIND:
   10384         out = ucs4lib_rsplit(
   10385             self,  buf1, len1, buf2, len2, maxcount);
   10386         break;
   10387     default:
   10388         out = NULL;
   10389     }
   10390     if (kind2 != kind1)
   10391         PyMem_Free(buf2);
   10392     return out;
   10393 }
   10394 
   10395 static Py_ssize_t
   10396 anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
   10397             PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
   10398 {
   10399     switch (kind) {
   10400     case PyUnicode_1BYTE_KIND:
   10401         if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
   10402             return asciilib_find(buf1, len1, buf2, len2, offset);
   10403         else
   10404             return ucs1lib_find(buf1, len1, buf2, len2, offset);
   10405     case PyUnicode_2BYTE_KIND:
   10406         return ucs2lib_find(buf1, len1, buf2, len2, offset);
   10407     case PyUnicode_4BYTE_KIND:
   10408         return ucs4lib_find(buf1, len1, buf2, len2, offset);
   10409     }
   10410     Py_UNREACHABLE();
   10411 }
   10412 
   10413 static Py_ssize_t
   10414 anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
   10415              PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
   10416 {
   10417     switch (kind) {
   10418     case PyUnicode_1BYTE_KIND:
   10419         if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
   10420             return asciilib_count(sbuf, slen, buf1, len1, maxcount);
   10421         else
   10422             return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
   10423     case PyUnicode_2BYTE_KIND:
   10424         return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
   10425     case PyUnicode_4BYTE_KIND:
   10426         return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
   10427     }
   10428     Py_UNREACHABLE();
   10429 }
   10430 
   10431 static void
   10432 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
   10433                       Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
   10434 {
   10435     int kind = PyUnicode_KIND(u);
   10436     void *data = PyUnicode_DATA(u);
   10437     Py_ssize_t len = PyUnicode_GET_LENGTH(u);
   10438     if (kind == PyUnicode_1BYTE_KIND) {
   10439         ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
   10440                                       (Py_UCS1 *)data + len,
   10441                                       u1, u2, maxcount);
   10442     }
   10443     else if (kind == PyUnicode_2BYTE_KIND) {
   10444         ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
   10445                                       (Py_UCS2 *)data + len,
   10446                                       u1, u2, maxcount);
   10447     }
   10448     else {
   10449         assert(kind == PyUnicode_4BYTE_KIND);
   10450         ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
   10451                                       (Py_UCS4 *)data + len,
   10452                                       u1, u2, maxcount);
   10453     }
   10454 }
   10455 
   10456 static PyObject *
   10457 replace(PyObject *self, PyObject *str1,
   10458         PyObject *str2, Py_ssize_t maxcount)
   10459 {
   10460     PyObject *u;
   10461     char *sbuf = PyUnicode_DATA(self);
   10462     char *buf1 = PyUnicode_DATA(str1);
   10463     char *buf2 = PyUnicode_DATA(str2);
   10464     int srelease = 0, release1 = 0, release2 = 0;
   10465     int skind = PyUnicode_KIND(self);
   10466     int kind1 = PyUnicode_KIND(str1);
   10467     int kind2 = PyUnicode_KIND(str2);
   10468     Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
   10469     Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
   10470     Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
   10471     int mayshrink;
   10472     Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
   10473 
   10474     if (maxcount < 0)
   10475         maxcount = PY_SSIZE_T_MAX;
   10476     else if (maxcount == 0 || slen == 0)
   10477         goto nothing;
   10478 
   10479     if (str1 == str2)
   10480         goto nothing;
   10481 
   10482     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
   10483     maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
   10484     if (maxchar < maxchar_str1)
   10485         /* substring too wide to be present */
   10486         goto nothing;
   10487     maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
   10488     /* Replacing str1 with str2 may cause a maxchar reduction in the
   10489        result string. */
   10490     mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
   10491     maxchar = Py_MAX(maxchar, maxchar_str2);
   10492 
   10493     if (len1 == len2) {
   10494         /* same length */
   10495         if (len1 == 0)
   10496             goto nothing;
   10497         if (len1 == 1) {
   10498             /* replace characters */
   10499             Py_UCS4 u1, u2;
   10500             Py_ssize_t pos;
   10501 
   10502             u1 = PyUnicode_READ(kind1, buf1, 0);
   10503             pos = findchar(sbuf, skind, slen, u1, 1);
   10504             if (pos < 0)
   10505                 goto nothing;
   10506             u2 = PyUnicode_READ(kind2, buf2, 0);
   10507             u = PyUnicode_New(slen, maxchar);
   10508             if (!u)
   10509                 goto error;
   10510 
   10511             _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
   10512             replace_1char_inplace(u, pos, u1, u2, maxcount);
   10513         }
   10514         else {
   10515             int rkind = skind;
   10516             char *res;
   10517             Py_ssize_t i;
   10518 
   10519             if (kind1 < rkind) {
   10520                 /* widen substring */
   10521                 buf1 = _PyUnicode_AsKind(str1, rkind);
   10522                 if (!buf1) goto error;
   10523                 release1 = 1;
   10524             }
   10525             i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
   10526             if (i < 0)
   10527                 goto nothing;
   10528             if (rkind > kind2) {
   10529                 /* widen replacement */
   10530                 buf2 = _PyUnicode_AsKind(str2, rkind);
   10531                 if (!buf2) goto error;
   10532                 release2 = 1;
   10533             }
   10534             else if (rkind < kind2) {
   10535                 /* widen self and buf1 */
   10536                 rkind = kind2;
   10537                 if (release1) PyMem_Free(buf1);
   10538                 release1 = 0;
   10539                 sbuf = _PyUnicode_AsKind(self, rkind);
   10540                 if (!sbuf) goto error;
   10541                 srelease = 1;
   10542                 buf1 = _PyUnicode_AsKind(str1, rkind);
   10543                 if (!buf1) goto error;
   10544                 release1 = 1;
   10545             }
   10546             u = PyUnicode_New(slen, maxchar);
   10547             if (!u)
   10548                 goto error;
   10549             assert(PyUnicode_KIND(u) == rkind);
   10550             res = PyUnicode_DATA(u);
   10551 
   10552             memcpy(res, sbuf, rkind * slen);
   10553             /* change everything in-place, starting with this one */
   10554             memcpy(res + rkind * i,
   10555                    buf2,
   10556                    rkind * len2);
   10557             i += len1;
   10558 
   10559             while ( --maxcount > 0) {
   10560                 i = anylib_find(rkind, self,
   10561                                 sbuf+rkind*i, slen-i,
   10562                                 str1, buf1, len1, i);
   10563                 if (i == -1)
   10564                     break;
   10565                 memcpy(res + rkind * i,
   10566                        buf2,
   10567                        rkind * len2);
   10568                 i += len1;
   10569             }
   10570         }
   10571     }
   10572     else {
   10573         Py_ssize_t n, i, j, ires;
   10574         Py_ssize_t new_size;
   10575         int rkind = skind;
   10576         char *res;
   10577 
   10578         if (kind1 < rkind) {
   10579             /* widen substring */
   10580             buf1 = _PyUnicode_AsKind(str1, rkind);
   10581             if (!buf1) goto error;
   10582             release1 = 1;
   10583         }
   10584         n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
   10585         if (n == 0)
   10586             goto nothing;
   10587         if (kind2 < rkind) {
   10588             /* widen replacement */
   10589             buf2 = _PyUnicode_AsKind(str2, rkind);
   10590             if (!buf2) goto error;
   10591             release2 = 1;
   10592         }
   10593         else if (kind2 > rkind) {
   10594             /* widen self and buf1 */
   10595             rkind = kind2;
   10596             sbuf = _PyUnicode_AsKind(self, rkind);
   10597             if (!sbuf) goto error;
   10598             srelease = 1;
   10599             if (release1) PyMem_Free(buf1);
   10600             release1 = 0;
   10601             buf1 = _PyUnicode_AsKind(str1, rkind);
   10602             if (!buf1) goto error;
   10603             release1 = 1;
   10604         }
   10605         /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
   10606            PyUnicode_GET_LENGTH(str1))); */
   10607         if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
   10608                 PyErr_SetString(PyExc_OverflowError,
   10609                                 "replace string is too long");
   10610                 goto error;
   10611         }
   10612         new_size = slen + n * (len2 - len1);
   10613         if (new_size == 0) {
   10614             _Py_INCREF_UNICODE_EMPTY();
   10615             if (!unicode_empty)
   10616                 goto error;
   10617             u = unicode_empty;
   10618             goto done;
   10619         }
   10620         if (new_size > (PY_SSIZE_T_MAX / rkind)) {
   10621             PyErr_SetString(PyExc_OverflowError,
   10622                             "replace string is too long");
   10623             goto error;
   10624         }
   10625         u = PyUnicode_New(new_size, maxchar);
   10626         if (!u)
   10627             goto error;
   10628         assert(PyUnicode_KIND(u) == rkind);
   10629         res = PyUnicode_DATA(u);
   10630         ires = i = 0;
   10631         if (len1 > 0) {
   10632             while (n-- > 0) {
   10633                 /* look for next match */
   10634                 j = anylib_find(rkind, self,
   10635                                 sbuf + rkind * i, slen-i,
   10636                                 str1, buf1, len1, i);
   10637                 if (j == -1)
   10638                     break;
   10639                 else if (j > i) {
   10640                     /* copy unchanged part [i:j] */
   10641                     memcpy(res + rkind * ires,
   10642                            sbuf + rkind * i,
   10643                            rkind * (j-i));
   10644                     ires += j - i;
   10645                 }
   10646                 /* copy substitution string */
   10647                 if (len2 > 0) {
   10648                     memcpy(res + rkind * ires,
   10649                            buf2,
   10650                            rkind * len2);
   10651                     ires += len2;
   10652                 }
   10653                 i = j + len1;
   10654             }
   10655             if (i < slen)
   10656                 /* copy tail [i:] */
   10657                 memcpy(res + rkind * ires,
   10658                        sbuf + rkind * i,
   10659                        rkind * (slen-i));
   10660         }
   10661         else {
   10662             /* interleave */
   10663             while (n > 0) {
   10664                 memcpy(res + rkind * ires,
   10665                        buf2,
   10666                        rkind * len2);
   10667                 ires += len2;
   10668                 if (--n <= 0)
   10669                     break;
   10670                 memcpy(res + rkind * ires,
   10671                        sbuf + rkind * i,
   10672                        rkind);
   10673                 ires++;
   10674                 i++;
   10675             }
   10676             memcpy(res + rkind * ires,
   10677                    sbuf + rkind * i,
   10678                    rkind * (slen-i));
   10679         }
   10680     }
   10681 
   10682     if (mayshrink) {
   10683         unicode_adjust_maxchar(&u);
   10684         if (u == NULL)
   10685             goto error;
   10686     }
   10687 
   10688   done:
   10689     if (srelease)
   10690         PyMem_FREE(sbuf);
   10691     if (release1)
   10692         PyMem_FREE(buf1);
   10693     if (release2)
   10694         PyMem_FREE(buf2);
   10695     assert(_PyUnicode_CheckConsistency(u, 1));
   10696     return u;
   10697 
   10698   nothing:
   10699     /* nothing to replace; return original string (when possible) */
   10700     if (srelease)
   10701         PyMem_FREE(sbuf);
   10702     if (release1)
   10703         PyMem_FREE(buf1);
   10704     if (release2)
   10705         PyMem_FREE(buf2);
   10706     return unicode_result_unchanged(self);
   10707 
   10708   error:
   10709     if (srelease && sbuf)
   10710         PyMem_FREE(sbuf);
   10711     if (release1 && buf1)
   10712         PyMem_FREE(buf1);
   10713     if (release2 && buf2)
   10714         PyMem_FREE(buf2);
   10715     return NULL;
   10716 }
   10717 
   10718 /* --- Unicode Object Methods --------------------------------------------- */
   10719 
   10720 /*[clinic input]
   10721 str.title as unicode_title
   10722 
   10723 Return a version of the string where each word is titlecased.
   10724 
   10725 More specifically, words start with uppercased characters and all remaining
   10726 cased characters have lower case.
   10727 [clinic start generated code]*/
   10728 
   10729 static PyObject *
   10730 unicode_title_impl(PyObject *self)
   10731 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
   10732 {
   10733     if (PyUnicode_READY(self) == -1)
   10734         return NULL;
   10735     return case_operation(self, do_title);
   10736 }
   10737 
   10738 /*[clinic input]
   10739 str.capitalize as unicode_capitalize
   10740 
   10741 Return a capitalized version of the string.
   10742 
   10743 More specifically, make the first character have upper case and the rest lower
   10744 case.
   10745 [clinic start generated code]*/
   10746 
   10747 static PyObject *
   10748 unicode_capitalize_impl(PyObject *self)
   10749 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
   10750 {
   10751     if (PyUnicode_READY(self) == -1)
   10752         return NULL;
   10753     if (PyUnicode_GET_LENGTH(self) == 0)
   10754         return unicode_result_unchanged(self);
   10755     return case_operation(self, do_capitalize);
   10756 }
   10757 
   10758 /*[clinic input]
   10759 str.casefold as unicode_casefold
   10760 
   10761 Return a version of the string suitable for caseless comparisons.
   10762 [clinic start generated code]*/
   10763 
   10764 static PyObject *
   10765 unicode_casefold_impl(PyObject *self)
   10766 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
   10767 {
   10768     if (PyUnicode_READY(self) == -1)
   10769         return NULL;
   10770     if (PyUnicode_IS_ASCII(self))
   10771         return ascii_upper_or_lower(self, 1);
   10772     return case_operation(self, do_casefold);
   10773 }
   10774 
   10775 
   10776 /* Argument converter. Accepts a single Unicode character. */
   10777 
   10778 static int
   10779 convert_uc(PyObject *obj, void *addr)
   10780 {
   10781     Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
   10782 
   10783     if (!PyUnicode_Check(obj)) {
   10784         PyErr_Format(PyExc_TypeError,
   10785                      "The fill character must be a unicode character, "
   10786                      "not %.100s", Py_TYPE(obj)->tp_name);
   10787         return 0;
   10788     }
   10789     if (PyUnicode_READY(obj) < 0)
   10790         return 0;
   10791     if (PyUnicode_GET_LENGTH(obj) != 1) {
   10792         PyErr_SetString(PyExc_TypeError,
   10793                         "The fill character must be exactly one character long");
   10794         return 0;
   10795     }
   10796     *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
   10797     return 1;
   10798 }
   10799 
   10800 /*[clinic input]
   10801 str.center as unicode_center
   10802 
   10803     width: Py_ssize_t
   10804     fillchar: Py_UCS4 = ' '
   10805     /
   10806 
   10807 Return a centered string of length width.
   10808 
   10809 Padding is done using the specified fill character (default is a space).
   10810 [clinic start generated code]*/
   10811 
   10812 static PyObject *
   10813 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
   10814 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
   10815 {
   10816     Py_ssize_t marg, left;
   10817 
   10818     if (PyUnicode_READY(self) == -1)
   10819         return NULL;
   10820 
   10821     if (PyUnicode_GET_LENGTH(self) >= width)
   10822         return unicode_result_unchanged(self);
   10823 
   10824     marg = width - PyUnicode_GET_LENGTH(self);
   10825     left = marg / 2 + (marg & width & 1);
   10826 
   10827     return pad(self, left, marg - left, fillchar);
   10828 }
   10829 
   10830 /* This function assumes that str1 and str2 are readied by the caller. */
   10831 
   10832 static int
   10833 unicode_compare(PyObject *str1, PyObject *str2)
   10834 {
   10835 #define COMPARE(TYPE1, TYPE2) \
   10836     do { \
   10837         TYPE1* p1 = (TYPE1 *)data1; \
   10838         TYPE2* p2 = (TYPE2 *)data2; \
   10839         TYPE1* end = p1 + len; \
   10840         Py_UCS4 c1, c2; \
   10841         for (; p1 != end; p1++, p2++) { \
   10842             c1 = *p1; \
   10843             c2 = *p2; \
   10844             if (c1 != c2) \
   10845                 return (c1 < c2) ? -1 : 1; \
   10846         } \
   10847     } \
   10848     while (0)
   10849 
   10850     int kind1, kind2;
   10851     void *data1, *data2;
   10852     Py_ssize_t len1, len2, len;
   10853 
   10854     kind1 = PyUnicode_KIND(str1);
   10855     kind2 = PyUnicode_KIND(str2);
   10856     data1 = PyUnicode_DATA(str1);
   10857     data2 = PyUnicode_DATA(str2);
   10858     len1 = PyUnicode_GET_LENGTH(str1);
   10859     len2 = PyUnicode_GET_LENGTH(str2);
   10860     len = Py_MIN(len1, len2);
   10861 
   10862     switch(kind1) {
   10863     case PyUnicode_1BYTE_KIND:
   10864     {
   10865         switch(kind2) {
   10866         case PyUnicode_1BYTE_KIND:
   10867         {
   10868             int cmp = memcmp(data1, data2, len);
   10869             /* normalize result of memcmp() into the range [-1; 1] */
   10870             if (cmp < 0)
   10871                 return -1;
   10872             if (cmp > 0)
   10873                 return 1;
   10874             break;
   10875         }
   10876         case PyUnicode_2BYTE_KIND:
   10877             COMPARE(Py_UCS1, Py_UCS2);
   10878             break;
   10879         case PyUnicode_4BYTE_KIND:
   10880             COMPARE(Py_UCS1, Py_UCS4);
   10881             break;
   10882         default:
   10883             Py_UNREACHABLE();
   10884         }
   10885         break;
   10886     }
   10887     case PyUnicode_2BYTE_KIND:
   10888     {
   10889         switch(kind2) {
   10890         case PyUnicode_1BYTE_KIND:
   10891             COMPARE(Py_UCS2, Py_UCS1);
   10892             break;
   10893         case PyUnicode_2BYTE_KIND:
   10894         {
   10895             COMPARE(Py_UCS2, Py_UCS2);
   10896             break;
   10897         }
   10898         case PyUnicode_4BYTE_KIND:
   10899             COMPARE(Py_UCS2, Py_UCS4);
   10900             break;
   10901         default:
   10902             Py_UNREACHABLE();
   10903         }
   10904         break;
   10905     }
   10906     case PyUnicode_4BYTE_KIND:
   10907     {
   10908         switch(kind2) {
   10909         case PyUnicode_1BYTE_KIND:
   10910             COMPARE(Py_UCS4, Py_UCS1);
   10911             break;
   10912         case PyUnicode_2BYTE_KIND:
   10913             COMPARE(Py_UCS4, Py_UCS2);
   10914             break;
   10915         case PyUnicode_4BYTE_KIND:
   10916         {
   10917 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
   10918             int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
   10919             /* normalize result of wmemcmp() into the range [-1; 1] */
   10920             if (cmp < 0)
   10921                 return -1;
   10922             if (cmp > 0)
   10923                 return 1;
   10924 #else
   10925             COMPARE(Py_UCS4, Py_UCS4);
   10926 #endif
   10927             break;
   10928         }
   10929         default:
   10930             Py_UNREACHABLE();
   10931         }
   10932         break;
   10933     }
   10934     default:
   10935         Py_UNREACHABLE();
   10936     }
   10937 
   10938     if (len1 == len2)
   10939         return 0;
   10940     if (len1 < len2)
   10941         return -1;
   10942     else
   10943         return 1;
   10944 
   10945 #undef COMPARE
   10946 }
   10947 
   10948 static int
   10949 unicode_compare_eq(PyObject *str1, PyObject *str2)
   10950 {
   10951     int kind;
   10952     void *data1, *data2;
   10953     Py_ssize_t len;
   10954     int cmp;
   10955 
   10956     len = PyUnicode_GET_LENGTH(str1);
   10957     if (PyUnicode_GET_LENGTH(str2) != len)
   10958         return 0;
   10959     kind = PyUnicode_KIND(str1);
   10960     if (PyUnicode_KIND(str2) != kind)
   10961         return 0;
   10962     data1 = PyUnicode_DATA(str1);
   10963     data2 = PyUnicode_DATA(str2);
   10964 
   10965     cmp = memcmp(data1, data2, len * kind);
   10966     return (cmp == 0);
   10967 }
   10968 
   10969 
   10970 int
   10971 PyUnicode_Compare(PyObject *left, PyObject *right)
   10972 {
   10973     if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
   10974         if (PyUnicode_READY(left) == -1 ||
   10975             PyUnicode_READY(right) == -1)
   10976             return -1;
   10977 
   10978         /* a string is equal to itself */
   10979         if (left == right)
   10980             return 0;
   10981 
   10982         return unicode_compare(left, right);
   10983     }
   10984     PyErr_Format(PyExc_TypeError,
   10985                  "Can't compare %.100s and %.100s",
   10986                  left->ob_type->tp_name,
   10987                  right->ob_type->tp_name);
   10988     return -1;
   10989 }
   10990 
   10991 int
   10992 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
   10993 {
   10994     Py_ssize_t i;
   10995     int kind;
   10996     Py_UCS4 chr;
   10997     const unsigned char *ustr = (const unsigned char *)str;
   10998 
   10999     assert(_PyUnicode_CHECK(uni));
   11000     if (!PyUnicode_IS_READY(uni)) {
   11001         const wchar_t *ws = _PyUnicode_WSTR(uni);
   11002         /* Compare Unicode string and source character set string */
   11003         for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
   11004             if (chr != ustr[i])
   11005                 return (chr < ustr[i]) ? -1 : 1;
   11006         }
   11007         /* This check keeps Python strings that end in '\0' from comparing equal
   11008          to C strings identical up to that point. */
   11009         if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
   11010             return 1; /* uni is longer */
   11011         if (ustr[i])
   11012             return -1; /* str is longer */
   11013         return 0;
   11014     }
   11015     kind = PyUnicode_KIND(uni);
   11016     if (kind == PyUnicode_1BYTE_KIND) {
   11017         const void *data = PyUnicode_1BYTE_DATA(uni);
   11018         size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
   11019         size_t len, len2 = strlen(str);
   11020         int cmp;
   11021 
   11022         len = Py_MIN(len1, len2);
   11023         cmp = memcmp(data, str, len);
   11024         if (cmp != 0) {
   11025             if (cmp < 0)
   11026                 return -1;
   11027             else
   11028                 return 1;
   11029         }
   11030         if (len1 > len2)
   11031             return 1; /* uni is longer */
   11032         if (len1 < len2)
   11033             return -1; /* str is longer */
   11034         return 0;
   11035     }
   11036     else {
   11037         void *data = PyUnicode_DATA(uni);
   11038         /* Compare Unicode string and source character set string */
   11039         for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
   11040             if (chr != (unsigned char)str[i])
   11041                 return (chr < (unsigned char)(str[i])) ? -1 : 1;
   11042         /* This check keeps Python strings that end in '\0' from comparing equal
   11043          to C strings identical up to that point. */
   11044         if (PyUnicode_GET_LENGTH(uni) != i || chr)
   11045             return 1; /* uni is longer */
   11046         if (str[i])
   11047             return -1; /* str is longer */
   11048         return 0;
   11049     }
   11050 }
   11051 
   11052 static int
   11053 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
   11054 {
   11055     size_t i, len;
   11056     const wchar_t *p;
   11057     len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
   11058     if (strlen(str) != len)
   11059         return 0;
   11060     p = _PyUnicode_WSTR(unicode);
   11061     assert(p);
   11062     for (i = 0; i < len; i++) {
   11063         unsigned char c = (unsigned char)str[i];
   11064         if (c >= 128 || p[i] != (wchar_t)c)
   11065             return 0;
   11066     }
   11067     return 1;
   11068 }
   11069 
   11070 int
   11071 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
   11072 {
   11073     size_t len;
   11074     assert(_PyUnicode_CHECK(unicode));
   11075     assert(str);
   11076 #ifndef NDEBUG
   11077     for (const char *p = str; *p; p++) {
   11078         assert((unsigned char)*p < 128);
   11079     }
   11080 #endif
   11081     if (PyUnicode_READY(unicode) == -1) {
   11082         /* Memory error or bad data */
   11083         PyErr_Clear();
   11084         return non_ready_unicode_equal_to_ascii_string(unicode, str);
   11085     }
   11086     if (!PyUnicode_IS_ASCII(unicode))
   11087         return 0;
   11088     len = (size_t)PyUnicode_GET_LENGTH(unicode);
   11089     return strlen(str) == len &&
   11090            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
   11091 }
   11092 
   11093 int
   11094 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
   11095 {
   11096     PyObject *right_uni;
   11097     Py_hash_t hash;
   11098 
   11099     assert(_PyUnicode_CHECK(left));
   11100     assert(right->string);
   11101 #ifndef NDEBUG
   11102     for (const char *p = right->string; *p; p++) {
   11103         assert((unsigned char)*p < 128);
   11104     }
   11105 #endif
   11106 
   11107     if (PyUnicode_READY(left) == -1) {
   11108         /* memory error or bad data */
   11109         PyErr_Clear();
   11110         return non_ready_unicode_equal_to_ascii_string(left, right->string);
   11111     }
   11112 
   11113     if (!PyUnicode_IS_ASCII(left))
   11114         return 0;
   11115 
   11116     right_uni = _PyUnicode_FromId(right);       /* borrowed */
   11117     if (right_uni == NULL) {
   11118         /* memory error or bad data */
   11119         PyErr_Clear();
   11120         return _PyUnicode_EqualToASCIIString(left, right->string);
   11121     }
   11122 
   11123     if (left == right_uni)
   11124         return 1;
   11125 
   11126     if (PyUnicode_CHECK_INTERNED(left))
   11127         return 0;
   11128 
   11129     assert(_PyUnicode_HASH(right_uni) != -1);
   11130     hash = _PyUnicode_HASH(left);
   11131     if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
   11132         return 0;
   11133 
   11134     return unicode_compare_eq(left, right_uni);
   11135 }
   11136 
   11137 PyObject *
   11138 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
   11139 {
   11140     int result;
   11141 
   11142     if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
   11143         Py_RETURN_NOTIMPLEMENTED;
   11144 
   11145     if (PyUnicode_READY(left) == -1 ||
   11146         PyUnicode_READY(right) == -1)
   11147         return NULL;
   11148 
   11149     if (left == right) {
   11150         switch (op) {
   11151         case Py_EQ:
   11152         case Py_LE:
   11153         case Py_GE:
   11154             /* a string is equal to itself */
   11155             Py_RETURN_TRUE;
   11156         case Py_NE:
   11157         case Py_LT:
   11158         case Py_GT:
   11159             Py_RETURN_FALSE;
   11160         default:
   11161             PyErr_BadArgument();
   11162             return NULL;
   11163         }
   11164     }
   11165     else if (op == Py_EQ || op == Py_NE) {
   11166         result = unicode_compare_eq(left, right);
   11167         result ^= (op == Py_NE);
   11168         return PyBool_FromLong(result);
   11169     }
   11170     else {
   11171         result = unicode_compare(left, right);
   11172         Py_RETURN_RICHCOMPARE(result, 0, op);
   11173     }
   11174 }
   11175 
   11176 int
   11177 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
   11178 {
   11179     return unicode_eq(aa, bb);
   11180 }
   11181 
   11182 int
   11183 PyUnicode_Contains(PyObject *str, PyObject *substr)
   11184 {
   11185     int kind1, kind2;
   11186     void *buf1, *buf2;
   11187     Py_ssize_t len1, len2;
   11188     int result;
   11189 
   11190     if (!PyUnicode_Check(substr)) {
   11191         PyErr_Format(PyExc_TypeError,
   11192                      "'in <string>' requires string as left operand, not %.100s",
   11193                      Py_TYPE(substr)->tp_name);
   11194         return -1;
   11195     }
   11196     if (PyUnicode_READY(substr) == -1)
   11197         return -1;
   11198     if (ensure_unicode(str) < 0)
   11199         return -1;
   11200 
   11201     kind1 = PyUnicode_KIND(str);
   11202     kind2 = PyUnicode_KIND(substr);
   11203     if (kind1 < kind2)
   11204         return 0;
   11205     len1 = PyUnicode_GET_LENGTH(str);
   11206     len2 = PyUnicode_GET_LENGTH(substr);
   11207     if (len1 < len2)
   11208         return 0;
   11209     buf1 = PyUnicode_DATA(str);
   11210     buf2 = PyUnicode_DATA(substr);
   11211     if (len2 == 1) {
   11212         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
   11213         result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
   11214         return result;
   11215     }
   11216     if (kind2 != kind1) {
   11217         buf2 = _PyUnicode_AsKind(substr, kind1);
   11218         if (!buf2)
   11219             return -1;
   11220     }
   11221 
   11222     switch (kind1) {
   11223     case PyUnicode_1BYTE_KIND:
   11224         result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
   11225         break;
   11226     case PyUnicode_2BYTE_KIND:
   11227         result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
   11228         break;
   11229     case PyUnicode_4BYTE_KIND:
   11230         result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
   11231         break;
   11232     default:
   11233         Py_UNREACHABLE();
   11234     }
   11235 
   11236     if (kind2 != kind1)
   11237         PyMem_Free(buf2);
   11238 
   11239     return result;
   11240 }
   11241 
   11242 /* Concat to string or Unicode object giving a new Unicode object. */
   11243 
   11244 PyObject *
   11245 PyUnicode_Concat(PyObject *left, PyObject *right)
   11246 {
   11247     PyObject *result;
   11248     Py_UCS4 maxchar, maxchar2;
   11249     Py_ssize_t left_len, right_len, new_len;
   11250 
   11251     if (ensure_unicode(left) < 0)
   11252         return NULL;
   11253 
   11254     if (!PyUnicode_Check(right)) {
   11255         PyErr_Format(PyExc_TypeError,
   11256                      "can only concatenate str (not \"%.200s\") to str",
   11257                      right->ob_type->tp_name);
   11258         return NULL;
   11259     }
   11260     if (PyUnicode_READY(right) < 0)
   11261         return NULL;
   11262 
   11263     /* Shortcuts */
   11264     if (left == unicode_empty)
   11265         return PyUnicode_FromObject(right);
   11266     if (right == unicode_empty)
   11267         return PyUnicode_FromObject(left);
   11268 
   11269     left_len = PyUnicode_GET_LENGTH(left);
   11270     right_len = PyUnicode_GET_LENGTH(right);
   11271     if (left_len > PY_SSIZE_T_MAX - right_len) {
   11272         PyErr_SetString(PyExc_OverflowError,
   11273                         "strings are too large to concat");
   11274         return NULL;
   11275     }
   11276     new_len = left_len + right_len;
   11277 
   11278     maxchar = PyUnicode_MAX_CHAR_VALUE(left);
   11279     maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
   11280     maxchar = Py_MAX(maxchar, maxchar2);
   11281 
   11282     /* Concat the two Unicode strings */
   11283     result = PyUnicode_New(new_len, maxchar);
   11284     if (result == NULL)
   11285         return NULL;
   11286     _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
   11287     _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
   11288     assert(_PyUnicode_CheckConsistency(result, 1));
   11289     return result;
   11290 }
   11291 
   11292 void
   11293 PyUnicode_Append(PyObject **p_left, PyObject *right)
   11294 {
   11295     PyObject *left, *res;
   11296     Py_UCS4 maxchar, maxchar2;
   11297     Py_ssize_t left_len, right_len, new_len;
   11298 
   11299     if (p_left == NULL) {
   11300         if (!PyErr_Occurred())
   11301             PyErr_BadInternalCall();
   11302         return;
   11303     }
   11304     left = *p_left;
   11305     if (right == NULL || left == NULL
   11306         || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
   11307         if (!PyErr_Occurred())
   11308             PyErr_BadInternalCall();
   11309         goto error;
   11310     }
   11311 
   11312     if (PyUnicode_READY(left) == -1)
   11313         goto error;
   11314     if (PyUnicode_READY(right) == -1)
   11315         goto error;
   11316 
   11317     /* Shortcuts */
   11318     if (left == unicode_empty) {
   11319         Py_DECREF(left);
   11320         Py_INCREF(right);
   11321         *p_left = right;
   11322         return;
   11323     }
   11324     if (right == unicode_empty)
   11325         return;
   11326 
   11327     left_len = PyUnicode_GET_LENGTH(left);
   11328     right_len = PyUnicode_GET_LENGTH(right);
   11329     if (left_len > PY_SSIZE_T_MAX - right_len) {
   11330         PyErr_SetString(PyExc_OverflowError,
   11331                         "strings are too large to concat");
   11332         goto error;
   11333     }
   11334     new_len = left_len + right_len;
   11335 
   11336     if (unicode_modifiable(left)
   11337         && PyUnicode_CheckExact(right)
   11338         && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
   11339         /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
   11340            to change the structure size, but characters are stored just after
   11341            the structure, and so it requires to move all characters which is
   11342            not so different than duplicating the string. */
   11343         && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
   11344     {
   11345         /* append inplace */
   11346         if (unicode_resize(p_left, new_len) != 0)
   11347             goto error;
   11348 
   11349         /* copy 'right' into the newly allocated area of 'left' */
   11350         _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
   11351     }
   11352     else {
   11353         maxchar = PyUnicode_MAX_CHAR_VALUE(left);
   11354         maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
   11355         maxchar = Py_MAX(maxchar, maxchar2);
   11356 
   11357         /* Concat the two Unicode strings */
   11358         res = PyUnicode_New(new_len, maxchar);
   11359         if (res == NULL)
   11360             goto error;
   11361         _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
   11362         _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
   11363         Py_DECREF(left);
   11364         *p_left = res;
   11365     }
   11366     assert(_PyUnicode_CheckConsistency(*p_left, 1));
   11367     return;
   11368 
   11369 error:
   11370     Py_CLEAR(*p_left);
   11371 }
   11372 
   11373 void
   11374 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
   11375 {
   11376     PyUnicode_Append(pleft, right);
   11377     Py_XDECREF(right);
   11378 }
   11379 
   11380 /*
   11381 Wraps stringlib_parse_args_finds() and additionally ensures that the
   11382 first argument is a unicode object.
   11383 */
   11384 
   11385 static inline int
   11386 parse_args_finds_unicode(const char * function_name, PyObject *args,
   11387                          PyObject **substring,
   11388                          Py_ssize_t *start, Py_ssize_t *end)
   11389 {
   11390     if(stringlib_parse_args_finds(function_name, args, substring,
   11391                                   start, end)) {
   11392         if (ensure_unicode(*substring) < 0)
   11393             return 0;
   11394         return 1;
   11395     }
   11396     return 0;
   11397 }
   11398 
   11399 PyDoc_STRVAR(count__doc__,
   11400              "S.count(sub[, start[, end]]) -> int\n\
   11401 \n\
   11402 Return the number of non-overlapping occurrences of substring sub in\n\
   11403 string S[start:end].  Optional arguments start and end are\n\
   11404 interpreted as in slice notation.");
   11405 
   11406 static PyObject *
   11407 unicode_count(PyObject *self, PyObject *args)
   11408 {
   11409     PyObject *substring = NULL;   /* initialize to fix a compiler warning */
   11410     Py_ssize_t start = 0;
   11411     Py_ssize_t end = PY_SSIZE_T_MAX;
   11412     PyObject *result;
   11413     int kind1, kind2;
   11414     void *buf1, *buf2;
   11415     Py_ssize_t len1, len2, iresult;
   11416 
   11417     if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
   11418         return NULL;
   11419 
   11420     kind1 = PyUnicode_KIND(self);
   11421     kind2 = PyUnicode_KIND(substring);
   11422     if (kind1 < kind2)
   11423         return PyLong_FromLong(0);
   11424 
   11425     len1 = PyUnicode_GET_LENGTH(self);
   11426     len2 = PyUnicode_GET_LENGTH(substring);
   11427     ADJUST_INDICES(start, end, len1);
   11428     if (end - start < len2)
   11429         return PyLong_FromLong(0);
   11430 
   11431     buf1 = PyUnicode_DATA(self);
   11432     buf2 = PyUnicode_DATA(substring);
   11433     if (kind2 != kind1) {
   11434         buf2 = _PyUnicode_AsKind(substring, kind1);
   11435         if (!buf2)
   11436             return NULL;
   11437     }
   11438     switch (kind1) {
   11439     case PyUnicode_1BYTE_KIND:
   11440         iresult = ucs1lib_count(
   11441             ((Py_UCS1*)buf1) + start, end - start,
   11442             buf2, len2, PY_SSIZE_T_MAX
   11443             );
   11444         break;
   11445     case PyUnicode_2BYTE_KIND:
   11446         iresult = ucs2lib_count(
   11447             ((Py_UCS2*)buf1) + start, end - start,
   11448             buf2, len2, PY_SSIZE_T_MAX
   11449             );
   11450         break;
   11451     case PyUnicode_4BYTE_KIND:
   11452         iresult = ucs4lib_count(
   11453             ((Py_UCS4*)buf1) + start, end - start,
   11454             buf2, len2, PY_SSIZE_T_MAX
   11455             );
   11456         break;
   11457     default:
   11458         Py_UNREACHABLE();
   11459     }
   11460 
   11461     result = PyLong_FromSsize_t(iresult);
   11462 
   11463     if (kind2 != kind1)
   11464         PyMem_Free(buf2);
   11465 
   11466     return result;
   11467 }
   11468 
   11469 /*[clinic input]
   11470 str.encode as unicode_encode
   11471 
   11472     encoding: str(c_default="NULL") = 'utf-8'
   11473         The encoding in which to encode the string.
   11474     errors: str(c_default="NULL") = 'strict'
   11475         The error handling scheme to use for encoding errors.
   11476         The default is 'strict' meaning that encoding errors raise a
   11477         UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
   11478         'xmlcharrefreplace' as well as any other name registered with
   11479         codecs.register_error that can handle UnicodeEncodeErrors.
   11480 
   11481 Encode the string using the codec registered for encoding.
   11482 [clinic start generated code]*/
   11483 
   11484 static PyObject *
   11485 unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
   11486 /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
   11487 {
   11488     return PyUnicode_AsEncodedString(self, encoding, errors);
   11489 }
   11490 
   11491 /*[clinic input]
   11492 str.expandtabs as unicode_expandtabs
   11493 
   11494     tabsize: int = 8
   11495 
   11496 Return a copy where all tab characters are expanded using spaces.
   11497 
   11498 If tabsize is not given, a tab size of 8 characters is assumed.
   11499 [clinic start generated code]*/
   11500 
   11501 static PyObject *
   11502 unicode_expandtabs_impl(PyObject *self, int tabsize)
   11503 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
   11504 {
   11505     Py_ssize_t i, j, line_pos, src_len, incr;
   11506     Py_UCS4 ch;
   11507     PyObject *u;
   11508     void *src_data, *dest_data;
   11509     int kind;
   11510     int found;
   11511 
   11512     if (PyUnicode_READY(self) == -1)
   11513         return NULL;
   11514 
   11515     /* First pass: determine size of output string */
   11516     src_len = PyUnicode_GET_LENGTH(self);
   11517     i = j = line_pos = 0;
   11518     kind = PyUnicode_KIND(self);
   11519     src_data = PyUnicode_DATA(self);
   11520     found = 0;
   11521     for (; i < src_len; i++) {
   11522         ch = PyUnicode_READ(kind, src_data, i);
   11523         if (ch == '\t') {
   11524             found = 1;
   11525             if (tabsize > 0) {
   11526                 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
   11527                 if (j > PY_SSIZE_T_MAX - incr)
   11528                     goto overflow;
   11529                 line_pos += incr;
   11530                 j += incr;
   11531             }
   11532         }
   11533         else {
   11534             if (j > PY_SSIZE_T_MAX - 1)
   11535                 goto overflow;
   11536             line_pos++;
   11537             j++;
   11538             if (ch == '\n' || ch == '\r')
   11539                 line_pos = 0;
   11540         }
   11541     }
   11542     if (!found)
   11543         return unicode_result_unchanged(self);
   11544 
   11545     /* Second pass: create output string and fill it */
   11546     u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
   11547     if (!u)
   11548         return NULL;
   11549     dest_data = PyUnicode_DATA(u);
   11550 
   11551     i = j = line_pos = 0;
   11552 
   11553     for (; i < src_len; i++) {
   11554         ch = PyUnicode_READ(kind, src_data, i);
   11555         if (ch == '\t') {
   11556             if (tabsize > 0) {
   11557                 incr = tabsize - (line_pos % tabsize);
   11558                 line_pos += incr;
   11559                 FILL(kind, dest_data, ' ', j, incr);
   11560                 j += incr;
   11561             }
   11562         }
   11563         else {
   11564             line_pos++;
   11565             PyUnicode_WRITE(kind, dest_data, j, ch);
   11566             j++;
   11567             if (ch == '\n' || ch == '\r')
   11568                 line_pos = 0;
   11569         }
   11570     }
   11571     assert (j == PyUnicode_GET_LENGTH(u));
   11572     return unicode_result(u);
   11573 
   11574   overflow:
   11575     PyErr_SetString(PyExc_OverflowError, "new string is too long");
   11576     return NULL;
   11577 }
   11578 
   11579 PyDoc_STRVAR(find__doc__,
   11580              "S.find(sub[, start[, end]]) -> int\n\
   11581 \n\
   11582 Return the lowest index in S where substring sub is found,\n\
   11583 such that sub is contained within S[start:end].  Optional\n\
   11584 arguments start and end are interpreted as in slice notation.\n\
   11585 \n\
   11586 Return -1 on failure.");
   11587 
   11588 static PyObject *
   11589 unicode_find(PyObject *self, PyObject *args)
   11590 {
   11591     /* initialize variables to prevent gcc warning */
   11592     PyObject *substring = NULL;
   11593     Py_ssize_t start = 0;
   11594     Py_ssize_t end = 0;
   11595     Py_ssize_t result;
   11596 
   11597     if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
   11598         return NULL;
   11599 
   11600     if (PyUnicode_READY(self) == -1)
   11601         return NULL;
   11602 
   11603     result = any_find_slice(self, substring, start, end, 1);
   11604 
   11605     if (result == -2)
   11606         return NULL;
   11607 
   11608     return PyLong_FromSsize_t(result);
   11609 }
   11610 
   11611 static PyObject *
   11612 unicode_getitem(PyObject *self, Py_ssize_t index)
   11613 {
   11614     void *data;
   11615     enum PyUnicode_Kind kind;
   11616     Py_UCS4 ch;
   11617 
   11618     if (!PyUnicode_Check(self)) {
   11619         PyErr_BadArgument();
   11620         return NULL;
   11621     }
   11622     if (PyUnicode_READY(self) == -1) {
   11623         return NULL;
   11624     }
   11625     if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
   11626         PyErr_SetString(PyExc_IndexError, "string index out of range");
   11627         return NULL;
   11628     }
   11629     kind = PyUnicode_KIND(self);
   11630     data = PyUnicode_DATA(self);
   11631     ch = PyUnicode_READ(kind, data, index);
   11632     return unicode_char(ch);
   11633 }
   11634 
   11635 /* Believe it or not, this produces the same value for ASCII strings
   11636    as bytes_hash(). */
   11637 static Py_hash_t
   11638 unicode_hash(PyObject *self)
   11639 {
   11640     Py_ssize_t len;
   11641     Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
   11642 
   11643 #ifdef Py_DEBUG
   11644     assert(_Py_HashSecret_Initialized);
   11645 #endif
   11646     if (_PyUnicode_HASH(self) != -1)
   11647         return _PyUnicode_HASH(self);
   11648     if (PyUnicode_READY(self) == -1)
   11649         return -1;
   11650     len = PyUnicode_GET_LENGTH(self);
   11651     /*
   11652       We make the hash of the empty string be 0, rather than using
   11653       (prefix ^ suffix), since this slightly obfuscates the hash secret
   11654     */
   11655     if (len == 0) {
   11656         _PyUnicode_HASH(self) = 0;
   11657         return 0;
   11658     }
   11659     x = _Py_HashBytes(PyUnicode_DATA(self),
   11660                       PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
   11661     _PyUnicode_HASH(self) = x;
   11662     return x;
   11663 }
   11664 
   11665 PyDoc_STRVAR(index__doc__,
   11666              "S.index(sub[, start[, end]]) -> int\n\
   11667 \n\
   11668 Return the lowest index in S where substring sub is found, \n\
   11669 such that sub is contained within S[start:end].  Optional\n\
   11670 arguments start and end are interpreted as in slice notation.\n\
   11671 \n\
   11672 Raises ValueError when the substring is not found.");
   11673 
   11674 static PyObject *
   11675 unicode_index(PyObject *self, PyObject *args)
   11676 {
   11677     /* initialize variables to prevent gcc warning */
   11678     Py_ssize_t result;
   11679     PyObject *substring = NULL;
   11680     Py_ssize_t start = 0;
   11681     Py_ssize_t end = 0;
   11682 
   11683     if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
   11684         return NULL;
   11685 
   11686     if (PyUnicode_READY(self) == -1)
   11687         return NULL;
   11688 
   11689     result = any_find_slice(self, substring, start, end, 1);
   11690 
   11691     if (result == -2)
   11692         return NULL;
   11693 
   11694     if (result < 0) {
   11695         PyErr_SetString(PyExc_ValueError, "substring not found");
   11696         return NULL;
   11697     }
   11698 
   11699     return PyLong_FromSsize_t(result);
   11700 }
   11701 
   11702 /*[clinic input]
   11703 str.isascii as unicode_isascii
   11704 
   11705 Return True if all characters in the string are ASCII, False otherwise.
   11706 
   11707 ASCII characters have code points in the range U+0000-U+007F.
   11708 Empty string is ASCII too.
   11709 [clinic start generated code]*/
   11710 
   11711 static PyObject *
   11712 unicode_isascii_impl(PyObject *self)
   11713 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
   11714 {
   11715     if (PyUnicode_READY(self) == -1) {
   11716         return NULL;
   11717     }
   11718     return PyBool_FromLong(PyUnicode_IS_ASCII(self));
   11719 }
   11720 
   11721 /*[clinic input]
   11722 str.islower as unicode_islower
   11723 
   11724 Return True if the string is a lowercase string, False otherwise.
   11725 
   11726 A string is lowercase if all cased characters in the string are lowercase and
   11727 there is at least one cased character in the string.
   11728 [clinic start generated code]*/
   11729 
   11730 static PyObject *
   11731 unicode_islower_impl(PyObject *self)
   11732 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
   11733 {
   11734     Py_ssize_t i, length;
   11735     int kind;
   11736     void *data;
   11737     int cased;
   11738 
   11739     if (PyUnicode_READY(self) == -1)
   11740         return NULL;
   11741     length = PyUnicode_GET_LENGTH(self);
   11742     kind = PyUnicode_KIND(self);
   11743     data = PyUnicode_DATA(self);
   11744 
   11745     /* Shortcut for single character strings */
   11746     if (length == 1)
   11747         return PyBool_FromLong(
   11748             Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
   11749 
   11750     /* Special case for empty strings */
   11751     if (length == 0)
   11752         Py_RETURN_FALSE;
   11753 
   11754     cased = 0;
   11755     for (i = 0; i < length; i++) {
   11756         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11757 
   11758         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
   11759             Py_RETURN_FALSE;
   11760         else if (!cased && Py_UNICODE_ISLOWER(ch))
   11761             cased = 1;
   11762     }
   11763     return PyBool_FromLong(cased);
   11764 }
   11765 
   11766 /*[clinic input]
   11767 str.isupper as unicode_isupper
   11768 
   11769 Return True if the string is an uppercase string, False otherwise.
   11770 
   11771 A string is uppercase if all cased characters in the string are uppercase and
   11772 there is at least one cased character in the string.
   11773 [clinic start generated code]*/
   11774 
   11775 static PyObject *
   11776 unicode_isupper_impl(PyObject *self)
   11777 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
   11778 {
   11779     Py_ssize_t i, length;
   11780     int kind;
   11781     void *data;
   11782     int cased;
   11783 
   11784     if (PyUnicode_READY(self) == -1)
   11785         return NULL;
   11786     length = PyUnicode_GET_LENGTH(self);
   11787     kind = PyUnicode_KIND(self);
   11788     data = PyUnicode_DATA(self);
   11789 
   11790     /* Shortcut for single character strings */
   11791     if (length == 1)
   11792         return PyBool_FromLong(
   11793             Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
   11794 
   11795     /* Special case for empty strings */
   11796     if (length == 0)
   11797         Py_RETURN_FALSE;
   11798 
   11799     cased = 0;
   11800     for (i = 0; i < length; i++) {
   11801         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11802 
   11803         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
   11804             Py_RETURN_FALSE;
   11805         else if (!cased && Py_UNICODE_ISUPPER(ch))
   11806             cased = 1;
   11807     }
   11808     return PyBool_FromLong(cased);
   11809 }
   11810 
   11811 /*[clinic input]
   11812 str.istitle as unicode_istitle
   11813 
   11814 Return True if the string is a title-cased string, False otherwise.
   11815 
   11816 In a title-cased string, upper- and title-case characters may only
   11817 follow uncased characters and lowercase characters only cased ones.
   11818 [clinic start generated code]*/
   11819 
   11820 static PyObject *
   11821 unicode_istitle_impl(PyObject *self)
   11822 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
   11823 {
   11824     Py_ssize_t i, length;
   11825     int kind;
   11826     void *data;
   11827     int cased, previous_is_cased;
   11828 
   11829     if (PyUnicode_READY(self) == -1)
   11830         return NULL;
   11831     length = PyUnicode_GET_LENGTH(self);
   11832     kind = PyUnicode_KIND(self);
   11833     data = PyUnicode_DATA(self);
   11834 
   11835     /* Shortcut for single character strings */
   11836     if (length == 1) {
   11837         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
   11838         return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
   11839                                (Py_UNICODE_ISUPPER(ch) != 0));
   11840     }
   11841 
   11842     /* Special case for empty strings */
   11843     if (length == 0)
   11844         Py_RETURN_FALSE;
   11845 
   11846     cased = 0;
   11847     previous_is_cased = 0;
   11848     for (i = 0; i < length; i++) {
   11849         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11850 
   11851         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
   11852             if (previous_is_cased)
   11853                 Py_RETURN_FALSE;
   11854             previous_is_cased = 1;
   11855             cased = 1;
   11856         }
   11857         else if (Py_UNICODE_ISLOWER(ch)) {
   11858             if (!previous_is_cased)
   11859                 Py_RETURN_FALSE;
   11860             previous_is_cased = 1;
   11861             cased = 1;
   11862         }
   11863         else
   11864             previous_is_cased = 0;
   11865     }
   11866     return PyBool_FromLong(cased);
   11867 }
   11868 
   11869 /*[clinic input]
   11870 str.isspace as unicode_isspace
   11871 
   11872 Return True if the string is a whitespace string, False otherwise.
   11873 
   11874 A string is whitespace if all characters in the string are whitespace and there
   11875 is at least one character in the string.
   11876 [clinic start generated code]*/
   11877 
   11878 static PyObject *
   11879 unicode_isspace_impl(PyObject *self)
   11880 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
   11881 {
   11882     Py_ssize_t i, length;
   11883     int kind;
   11884     void *data;
   11885 
   11886     if (PyUnicode_READY(self) == -1)
   11887         return NULL;
   11888     length = PyUnicode_GET_LENGTH(self);
   11889     kind = PyUnicode_KIND(self);
   11890     data = PyUnicode_DATA(self);
   11891 
   11892     /* Shortcut for single character strings */
   11893     if (length == 1)
   11894         return PyBool_FromLong(
   11895             Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
   11896 
   11897     /* Special case for empty strings */
   11898     if (length == 0)
   11899         Py_RETURN_FALSE;
   11900 
   11901     for (i = 0; i < length; i++) {
   11902         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11903         if (!Py_UNICODE_ISSPACE(ch))
   11904             Py_RETURN_FALSE;
   11905     }
   11906     Py_RETURN_TRUE;
   11907 }
   11908 
   11909 /*[clinic input]
   11910 str.isalpha as unicode_isalpha
   11911 
   11912 Return True if the string is an alphabetic string, False otherwise.
   11913 
   11914 A string is alphabetic if all characters in the string are alphabetic and there
   11915 is at least one character in the string.
   11916 [clinic start generated code]*/
   11917 
   11918 static PyObject *
   11919 unicode_isalpha_impl(PyObject *self)
   11920 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
   11921 {
   11922     Py_ssize_t i, length;
   11923     int kind;
   11924     void *data;
   11925 
   11926     if (PyUnicode_READY(self) == -1)
   11927         return NULL;
   11928     length = PyUnicode_GET_LENGTH(self);
   11929     kind = PyUnicode_KIND(self);
   11930     data = PyUnicode_DATA(self);
   11931 
   11932     /* Shortcut for single character strings */
   11933     if (length == 1)
   11934         return PyBool_FromLong(
   11935             Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
   11936 
   11937     /* Special case for empty strings */
   11938     if (length == 0)
   11939         Py_RETURN_FALSE;
   11940 
   11941     for (i = 0; i < length; i++) {
   11942         if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
   11943             Py_RETURN_FALSE;
   11944     }
   11945     Py_RETURN_TRUE;
   11946 }
   11947 
   11948 /*[clinic input]
   11949 str.isalnum as unicode_isalnum
   11950 
   11951 Return True if the string is an alpha-numeric string, False otherwise.
   11952 
   11953 A string is alpha-numeric if all characters in the string are alpha-numeric and
   11954 there is at least one character in the string.
   11955 [clinic start generated code]*/
   11956 
   11957 static PyObject *
   11958 unicode_isalnum_impl(PyObject *self)
   11959 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
   11960 {
   11961     int kind;
   11962     void *data;
   11963     Py_ssize_t len, i;
   11964 
   11965     if (PyUnicode_READY(self) == -1)
   11966         return NULL;
   11967 
   11968     kind = PyUnicode_KIND(self);
   11969     data = PyUnicode_DATA(self);
   11970     len = PyUnicode_GET_LENGTH(self);
   11971 
   11972     /* Shortcut for single character strings */
   11973     if (len == 1) {
   11974         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
   11975         return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
   11976     }
   11977 
   11978     /* Special case for empty strings */
   11979     if (len == 0)
   11980         Py_RETURN_FALSE;
   11981 
   11982     for (i = 0; i < len; i++) {
   11983         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11984         if (!Py_UNICODE_ISALNUM(ch))
   11985             Py_RETURN_FALSE;
   11986     }
   11987     Py_RETURN_TRUE;
   11988 }
   11989 
   11990 /*[clinic input]
   11991 str.isdecimal as unicode_isdecimal
   11992 
   11993 Return True if the string is a decimal string, False otherwise.
   11994 
   11995 A string is a decimal string if all characters in the string are decimal and
   11996 there is at least one character in the string.
   11997 [clinic start generated code]*/
   11998 
   11999 static PyObject *
   12000 unicode_isdecimal_impl(PyObject *self)
   12001 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
   12002 {
   12003     Py_ssize_t i, length;
   12004     int kind;
   12005     void *data;
   12006 
   12007     if (PyUnicode_READY(self) == -1)
   12008         return NULL;
   12009     length = PyUnicode_GET_LENGTH(self);
   12010     kind = PyUnicode_KIND(self);
   12011     data = PyUnicode_DATA(self);
   12012 
   12013     /* Shortcut for single character strings */
   12014     if (length == 1)
   12015         return PyBool_FromLong(
   12016             Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
   12017 
   12018     /* Special case for empty strings */
   12019     if (length == 0)
   12020         Py_RETURN_FALSE;
   12021 
   12022     for (i = 0; i < length; i++) {
   12023         if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
   12024             Py_RETURN_FALSE;
   12025     }
   12026     Py_RETURN_TRUE;
   12027 }
   12028 
   12029 /*[clinic input]
   12030 str.isdigit as unicode_isdigit
   12031 
   12032 Return True if the string is a digit string, False otherwise.
   12033 
   12034 A string is a digit string if all characters in the string are digits and there
   12035 is at least one character in the string.
   12036 [clinic start generated code]*/
   12037 
   12038 static PyObject *
   12039 unicode_isdigit_impl(PyObject *self)
   12040 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
   12041 {
   12042     Py_ssize_t i, length;
   12043     int kind;
   12044     void *data;
   12045 
   12046     if (PyUnicode_READY(self) == -1)
   12047         return NULL;
   12048     length = PyUnicode_GET_LENGTH(self);
   12049     kind = PyUnicode_KIND(self);
   12050     data = PyUnicode_DATA(self);
   12051 
   12052     /* Shortcut for single character strings */
   12053     if (length == 1) {
   12054         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
   12055         return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
   12056     }
   12057 
   12058     /* Special case for empty strings */
   12059     if (length == 0)
   12060         Py_RETURN_FALSE;
   12061 
   12062     for (i = 0; i < length; i++) {
   12063         if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
   12064             Py_RETURN_FALSE;
   12065     }
   12066     Py_RETURN_TRUE;
   12067 }
   12068 
   12069 /*[clinic input]
   12070 str.isnumeric as unicode_isnumeric
   12071 
   12072 Return True if the string is a numeric string, False otherwise.
   12073 
   12074 A string is numeric if all characters in the string are numeric and there is at
   12075 least one character in the string.
   12076 [clinic start generated code]*/
   12077 
   12078 static PyObject *
   12079 unicode_isnumeric_impl(PyObject *self)
   12080 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
   12081 {
   12082     Py_ssize_t i, length;
   12083     int kind;
   12084     void *data;
   12085 
   12086     if (PyUnicode_READY(self) == -1)
   12087         return NULL;
   12088     length = PyUnicode_GET_LENGTH(self);
   12089     kind = PyUnicode_KIND(self);
   12090     data = PyUnicode_DATA(self);
   12091 
   12092     /* Shortcut for single character strings */
   12093     if (length == 1)
   12094         return PyBool_FromLong(
   12095             Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
   12096 
   12097     /* Special case for empty strings */
   12098     if (length == 0)
   12099         Py_RETURN_FALSE;
   12100 
   12101     for (i = 0; i < length; i++) {
   12102         if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
   12103             Py_RETURN_FALSE;
   12104     }
   12105     Py_RETURN_TRUE;
   12106 }
   12107 
   12108 int
   12109 PyUnicode_IsIdentifier(PyObject *self)
   12110 {
   12111     int kind;
   12112     void *data;
   12113     Py_ssize_t i;
   12114     Py_UCS4 first;
   12115 
   12116     if (PyUnicode_READY(self) == -1) {
   12117         Py_FatalError("identifier not ready");
   12118         return 0;
   12119     }
   12120 
   12121     /* Special case for empty strings */
   12122     if (PyUnicode_GET_LENGTH(self) == 0)
   12123         return 0;
   12124     kind = PyUnicode_KIND(self);
   12125     data = PyUnicode_DATA(self);
   12126 
   12127     /* PEP 3131 says that the first character must be in
   12128        XID_Start and subsequent characters in XID_Continue,
   12129        and for the ASCII range, the 2.x rules apply (i.e
   12130        start with letters and underscore, continue with
   12131        letters, digits, underscore). However, given the current
   12132        definition of XID_Start and XID_Continue, it is sufficient
   12133        to check just for these, except that _ must be allowed
   12134        as starting an identifier.  */
   12135     first = PyUnicode_READ(kind, data, 0);
   12136     if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
   12137         return 0;
   12138 
   12139     for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
   12140         if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
   12141             return 0;
   12142     return 1;
   12143 }
   12144 
   12145 /*[clinic input]
   12146 str.isidentifier as unicode_isidentifier
   12147 
   12148 Return True if the string is a valid Python identifier, False otherwise.
   12149 
   12150 Use keyword.iskeyword() to test for reserved identifiers such as "def" and
   12151 "class".
   12152 [clinic start generated code]*/
   12153 
   12154 static PyObject *
   12155 unicode_isidentifier_impl(PyObject *self)
   12156 /*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
   12157 {
   12158     return PyBool_FromLong(PyUnicode_IsIdentifier(self));
   12159 }
   12160 
   12161 /*[clinic input]
   12162 str.isprintable as unicode_isprintable
   12163 
   12164 Return True if the string is printable, False otherwise.
   12165 
   12166 A string is printable if all of its characters are considered printable in
   12167 repr() or if it is empty.
   12168 [clinic start generated code]*/
   12169 
   12170 static PyObject *
   12171 unicode_isprintable_impl(PyObject *self)
   12172 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
   12173 {
   12174     Py_ssize_t i, length;
   12175     int kind;
   12176     void *data;
   12177 
   12178     if (PyUnicode_READY(self) == -1)
   12179         return NULL;
   12180     length = PyUnicode_GET_LENGTH(self);
   12181     kind = PyUnicode_KIND(self);
   12182     data = PyUnicode_DATA(self);
   12183 
   12184     /* Shortcut for single character strings */
   12185     if (length == 1)
   12186         return PyBool_FromLong(
   12187             Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
   12188 
   12189     for (i = 0; i < length; i++) {
   12190         if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
   12191             Py_RETURN_FALSE;
   12192         }
   12193     }
   12194     Py_RETURN_TRUE;
   12195 }
   12196 
   12197 /*[clinic input]
   12198 str.join as unicode_join
   12199 
   12200     iterable: object
   12201     /
   12202 
   12203 Concatenate any number of strings.
   12204 
   12205 The string whose method is called is inserted in between each given string.
   12206 The result is returned as a new string.
   12207 
   12208 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
   12209 [clinic start generated code]*/
   12210 
   12211 static PyObject *
   12212 unicode_join(PyObject *self, PyObject *iterable)
   12213 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
   12214 {
   12215     return PyUnicode_Join(self, iterable);
   12216 }
   12217 
   12218 static Py_ssize_t
   12219 unicode_length(PyObject *self)
   12220 {
   12221     if (PyUnicode_READY(self) == -1)
   12222         return -1;
   12223     return PyUnicode_GET_LENGTH(self);
   12224 }
   12225 
   12226 /*[clinic input]
   12227 str.ljust as unicode_ljust
   12228 
   12229     width: Py_ssize_t
   12230     fillchar: Py_UCS4 = ' '
   12231     /
   12232 
   12233 Return a left-justified string of length width.
   12234 
   12235 Padding is done using the specified fill character (default is a space).
   12236 [clinic start generated code]*/
   12237 
   12238 static PyObject *
   12239 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
   12240 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
   12241 {
   12242     if (PyUnicode_READY(self) == -1)
   12243         return NULL;
   12244 
   12245     if (PyUnicode_GET_LENGTH(self) >= width)
   12246         return unicode_result_unchanged(self);
   12247 
   12248     return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
   12249 }
   12250 
   12251 /*[clinic input]
   12252 str.lower as unicode_lower
   12253 
   12254 Return a copy of the string converted to lowercase.
   12255 [clinic start generated code]*/
   12256 
   12257 static PyObject *
   12258 unicode_lower_impl(PyObject *self)
   12259 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
   12260 {
   12261     if (PyUnicode_READY(self) == -1)
   12262         return NULL;
   12263     if (PyUnicode_IS_ASCII(self))
   12264         return ascii_upper_or_lower(self, 1);
   12265     return case_operation(self, do_lower);
   12266 }
   12267 
   12268 #define LEFTSTRIP 0
   12269 #define RIGHTSTRIP 1
   12270 #define BOTHSTRIP 2
   12271 
   12272 /* Arrays indexed by above */
   12273 static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"};
   12274 
   12275 #define STRIPNAME(i) (stripfuncnames[i])
   12276 
   12277 /* externally visible for str.strip(unicode) */
   12278 PyObject *
   12279 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
   12280 {
   12281     void *data;
   12282     int kind;
   12283     Py_ssize_t i, j, len;
   12284     BLOOM_MASK sepmask;
   12285     Py_ssize_t seplen;
   12286 
   12287     if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
   12288         return NULL;
   12289 
   12290     kind = PyUnicode_KIND(self);
   12291     data = PyUnicode_DATA(self);
   12292     len = PyUnicode_GET_LENGTH(self);
   12293     seplen = PyUnicode_GET_LENGTH(sepobj);
   12294     sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
   12295                               PyUnicode_DATA(sepobj),
   12296                               seplen);
   12297 
   12298     i = 0;
   12299     if (striptype != RIGHTSTRIP) {
   12300         while (i < len) {
   12301             Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   12302             if (!BLOOM(sepmask, ch))
   12303                 break;
   12304             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
   12305                 break;
   12306             i++;
   12307         }
   12308     }
   12309 
   12310     j = len;
   12311     if (striptype != LEFTSTRIP) {
   12312         j--;
   12313         while (j >= i) {
   12314             Py_UCS4 ch = PyUnicode_READ(kind, data, j);
   12315             if (!BLOOM(sepmask, ch))
   12316                 break;
   12317             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
   12318                 break;
   12319             j--;
   12320         }
   12321 
   12322         j++;
   12323     }
   12324 
   12325     return PyUnicode_Substring(self, i, j);
   12326 }
   12327 
   12328 PyObject*
   12329 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
   12330 {
   12331     unsigned char *data;
   12332     int kind;
   12333     Py_ssize_t length;
   12334 
   12335     if (PyUnicode_READY(self) == -1)
   12336         return NULL;
   12337 
   12338     length = PyUnicode_GET_LENGTH(self);
   12339     end = Py_MIN(end, length);
   12340 
   12341     if (start == 0 && end == length)
   12342         return unicode_result_unchanged(self);
   12343 
   12344     if (start < 0 || end < 0) {
   12345         PyErr_SetString(PyExc_IndexError, "string index out of range");
   12346         return NULL;
   12347     }
   12348     if (start >= length || end < start)
   12349         _Py_RETURN_UNICODE_EMPTY();
   12350 
   12351     length = end - start;
   12352     if (PyUnicode_IS_ASCII(self)) {
   12353         data = PyUnicode_1BYTE_DATA(self);
   12354         return _PyUnicode_FromASCII((char*)(data + start), length);
   12355     }
   12356     else {
   12357         kind = PyUnicode_KIND(self);
   12358         data = PyUnicode_1BYTE_DATA(self);
   12359         return PyUnicode_FromKindAndData(kind,
   12360                                          data + kind * start,
   12361                                          length);
   12362     }
   12363 }
   12364 
   12365 static PyObject *
   12366 do_strip(PyObject *self, int striptype)
   12367 {
   12368     Py_ssize_t len, i, j;
   12369 
   12370     if (PyUnicode_READY(self) == -1)
   12371         return NULL;
   12372 
   12373     len = PyUnicode_GET_LENGTH(self);
   12374 
   12375     if (PyUnicode_IS_ASCII(self)) {
   12376         Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
   12377 
   12378         i = 0;
   12379         if (striptype != RIGHTSTRIP) {
   12380             while (i < len) {
   12381                 Py_UCS1 ch = data[i];
   12382                 if (!_Py_ascii_whitespace[ch])
   12383                     break;
   12384                 i++;
   12385             }
   12386         }
   12387 
   12388         j = len;
   12389         if (striptype != LEFTSTRIP) {
   12390             j--;
   12391             while (j >= i) {
   12392                 Py_UCS1 ch = data[j];
   12393                 if (!_Py_ascii_whitespace[ch])
   12394                     break;
   12395                 j--;
   12396             }
   12397             j++;
   12398         }
   12399     }
   12400     else {
   12401         int kind = PyUnicode_KIND(self);
   12402         void *data = PyUnicode_DATA(self);
   12403 
   12404         i = 0;
   12405         if (striptype != RIGHTSTRIP) {
   12406             while (i < len) {
   12407                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   12408                 if (!Py_UNICODE_ISSPACE(ch))
   12409                     break;
   12410                 i++;
   12411             }
   12412         }
   12413 
   12414         j = len;
   12415         if (striptype != LEFTSTRIP) {
   12416             j--;
   12417             while (j >= i) {
   12418                 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
   12419                 if (!Py_UNICODE_ISSPACE(ch))
   12420                     break;
   12421                 j--;
   12422             }
   12423             j++;
   12424         }
   12425     }
   12426 
   12427     return PyUnicode_Substring(self, i, j);
   12428 }
   12429 
   12430 
   12431 static PyObject *
   12432 do_argstrip(PyObject *self, int striptype, PyObject *sep)
   12433 {
   12434     if (sep != NULL && sep != Py_None) {
   12435         if (PyUnicode_Check(sep))
   12436             return _PyUnicode_XStrip(self, striptype, sep);
   12437         else {
   12438             PyErr_Format(PyExc_TypeError,
   12439                          "%s arg must be None or str",
   12440                          STRIPNAME(striptype));
   12441             return NULL;
   12442         }
   12443     }
   12444 
   12445     return do_strip(self, striptype);
   12446 }
   12447 
   12448 
   12449 /*[clinic input]
   12450 str.strip as unicode_strip
   12451 
   12452     chars: object = None
   12453     /
   12454 
   12455 Return a copy of the string with leading and trailing whitespace removed.
   12456 
   12457 If chars is given and not None, remove characters in chars instead.
   12458 [clinic start generated code]*/
   12459 
   12460 static PyObject *
   12461 unicode_strip_impl(PyObject *self, PyObject *chars)
   12462 /*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
   12463 {
   12464     return do_argstrip(self, BOTHSTRIP, chars);
   12465 }
   12466 
   12467 
   12468 /*[clinic input]
   12469 str.lstrip as unicode_lstrip
   12470 
   12471     chars: object = NULL
   12472     /
   12473 
   12474 Return a copy of the string with leading whitespace removed.
   12475 
   12476 If chars is given and not None, remove characters in chars instead.
   12477 [clinic start generated code]*/
   12478 
   12479 static PyObject *
   12480 unicode_lstrip_impl(PyObject *self, PyObject *chars)
   12481 /*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
   12482 {
   12483     return do_argstrip(self, LEFTSTRIP, chars);
   12484 }
   12485 
   12486 
   12487 /*[clinic input]
   12488 str.rstrip as unicode_rstrip
   12489 
   12490     chars: object = NULL
   12491     /
   12492 
   12493 Return a copy of the string with trailing whitespace removed.
   12494 
   12495 If chars is given and not None, remove characters in chars instead.
   12496 [clinic start generated code]*/
   12497 
   12498 static PyObject *
   12499 unicode_rstrip_impl(PyObject *self, PyObject *chars)
   12500 /*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
   12501 {
   12502     return do_argstrip(self, RIGHTSTRIP, chars);
   12503 }
   12504 
   12505 
   12506 static PyObject*
   12507 unicode_repeat(PyObject *str, Py_ssize_t len)
   12508 {
   12509     PyObject *u;
   12510     Py_ssize_t nchars, n;
   12511 
   12512     if (len < 1)
   12513         _Py_RETURN_UNICODE_EMPTY();
   12514 
   12515     /* no repeat, return original string */
   12516     if (len == 1)
   12517         return unicode_result_unchanged(str);
   12518 
   12519     if (PyUnicode_READY(str) == -1)
   12520         return NULL;
   12521 
   12522     if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
   12523         PyErr_SetString(PyExc_OverflowError,
   12524                         "repeated string is too long");
   12525         return NULL;
   12526     }
   12527     nchars = len * PyUnicode_GET_LENGTH(str);
   12528 
   12529     u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
   12530     if (!u)
   12531         return NULL;
   12532     assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
   12533 
   12534     if (PyUnicode_GET_LENGTH(str) == 1) {
   12535         const int kind = PyUnicode_KIND(str);
   12536         const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
   12537         if (kind == PyUnicode_1BYTE_KIND) {
   12538             void *to = PyUnicode_DATA(u);
   12539             memset(to, (unsigned char)fill_char, len);
   12540         }
   12541         else if (kind == PyUnicode_2BYTE_KIND) {
   12542             Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
   12543             for (n = 0; n < len; ++n)
   12544                 ucs2[n] = fill_char;
   12545         } else {
   12546             Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
   12547             assert(kind == PyUnicode_4BYTE_KIND);
   12548             for (n = 0; n < len; ++n)
   12549                 ucs4[n] = fill_char;
   12550         }
   12551     }
   12552     else {
   12553         /* number of characters copied this far */
   12554         Py_ssize_t done = PyUnicode_GET_LENGTH(str);
   12555         const Py_ssize_t char_size = PyUnicode_KIND(str);
   12556         char *to = (char *) PyUnicode_DATA(u);
   12557         memcpy(to, PyUnicode_DATA(str),
   12558                   PyUnicode_GET_LENGTH(str) * char_size);
   12559         while (done < nchars) {
   12560             n = (done <= nchars-done) ? done : nchars-done;
   12561             memcpy(to + (done * char_size), to, n * char_size);
   12562             done += n;
   12563         }
   12564     }
   12565 
   12566     assert(_PyUnicode_CheckConsistency(u, 1));
   12567     return u;
   12568 }
   12569 
   12570 PyObject *
   12571 PyUnicode_Replace(PyObject *str,
   12572                   PyObject *substr,
   12573                   PyObject *replstr,
   12574                   Py_ssize_t maxcount)
   12575 {
   12576     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
   12577             ensure_unicode(replstr) < 0)
   12578         return NULL;
   12579     return replace(str, substr, replstr, maxcount);
   12580 }
   12581 
   12582 /*[clinic input]
   12583 str.replace as unicode_replace
   12584 
   12585     old: unicode
   12586     new: unicode
   12587     count: Py_ssize_t = -1
   12588         Maximum number of occurrences to replace.
   12589         -1 (the default value) means replace all occurrences.
   12590     /
   12591 
   12592 Return a copy with all occurrences of substring old replaced by new.
   12593 
   12594 If the optional argument count is given, only the first count occurrences are
   12595 replaced.
   12596 [clinic start generated code]*/
   12597 
   12598 static PyObject *
   12599 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
   12600                      Py_ssize_t count)
   12601 /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
   12602 {
   12603     if (PyUnicode_READY(self) == -1)
   12604         return NULL;
   12605     return replace(self, old, new, count);
   12606 }
   12607 
   12608 static PyObject *
   12609 unicode_repr(PyObject *unicode)
   12610 {
   12611     PyObject *repr;
   12612     Py_ssize_t isize;
   12613     Py_ssize_t osize, squote, dquote, i, o;
   12614     Py_UCS4 max, quote;
   12615     int ikind, okind, unchanged;
   12616     void *idata, *odata;
   12617 
   12618     if (PyUnicode_READY(unicode) == -1)
   12619         return NULL;
   12620 
   12621     isize = PyUnicode_GET_LENGTH(unicode);
   12622     idata = PyUnicode_DATA(unicode);
   12623 
   12624     /* Compute length of output, quote characters, and
   12625        maximum character */
   12626     osize = 0;
   12627     max = 127;
   12628     squote = dquote = 0;
   12629     ikind = PyUnicode_KIND(unicode);
   12630     for (i = 0; i < isize; i++) {
   12631         Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
   12632         Py_ssize_t incr = 1;
   12633         switch (ch) {
   12634         case '\'': squote++; break;
   12635         case '"':  dquote++; break;
   12636         case '\\': case '\t': case '\r': case '\n':
   12637             incr = 2;
   12638             break;
   12639         default:
   12640             /* Fast-path ASCII */
   12641             if (ch < ' ' || ch == 0x7f)
   12642                 incr = 4; /* \xHH */
   12643             else if (ch < 0x7f)
   12644                 ;
   12645             else if (Py_UNICODE_ISPRINTABLE(ch))
   12646                 max = ch > max ? ch : max;
   12647             else if (ch < 0x100)
   12648                 incr = 4; /* \xHH */
   12649             else if (ch < 0x10000)
   12650                 incr = 6; /* \uHHHH */
   12651             else
   12652                 incr = 10; /* \uHHHHHHHH */
   12653         }
   12654         if (osize > PY_SSIZE_T_MAX - incr) {
   12655             PyErr_SetString(PyExc_OverflowError,
   12656                             "string is too long to generate repr");
   12657             return NULL;
   12658         }
   12659         osize += incr;
   12660     }
   12661 
   12662     quote = '\'';
   12663     unchanged = (osize == isize);
   12664     if (squote) {
   12665         unchanged = 0;
   12666         if (dquote)
   12667             /* Both squote and dquote present. Use squote,
   12668                and escape them */
   12669             osize += squote;
   12670         else
   12671             quote = '"';
   12672     }
   12673     osize += 2;   /* quotes */
   12674 
   12675     repr = PyUnicode_New(osize, max);
   12676     if (repr == NULL)
   12677         return NULL;
   12678     okind = PyUnicode_KIND(repr);
   12679     odata = PyUnicode_DATA(repr);
   12680 
   12681     PyUnicode_WRITE(okind, odata, 0, quote);
   12682     PyUnicode_WRITE(okind, odata, osize-1, quote);
   12683     if (unchanged) {
   12684         _PyUnicode_FastCopyCharacters(repr, 1,
   12685                                       unicode, 0,
   12686                                       isize);
   12687     }
   12688     else {
   12689         for (i = 0, o = 1; i < isize; i++) {
   12690             Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
   12691 
   12692             /* Escape quotes and backslashes */
   12693             if ((ch == quote) || (ch == '\\')) {
   12694                 PyUnicode_WRITE(okind, odata, o++, '\\');
   12695                 PyUnicode_WRITE(okind, odata, o++, ch);
   12696                 continue;
   12697             }
   12698 
   12699             /* Map special whitespace to '\t', \n', '\r' */
   12700             if (ch == '\t') {
   12701                 PyUnicode_WRITE(okind, odata, o++, '\\');
   12702                 PyUnicode_WRITE(okind, odata, o++, 't');
   12703             }
   12704             else if (ch == '\n') {
   12705                 PyUnicode_WRITE(okind, odata, o++, '\\');
   12706                 PyUnicode_WRITE(okind, odata, o++, 'n');
   12707             }
   12708             else if (ch == '\r') {
   12709                 PyUnicode_WRITE(okind, odata, o++, '\\');
   12710                 PyUnicode_WRITE(okind, odata, o++, 'r');
   12711             }
   12712 
   12713             /* Map non-printable US ASCII to '\xhh' */
   12714             else if (ch < ' ' || ch == 0x7F) {
   12715                 PyUnicode_WRITE(okind, odata, o++, '\\');
   12716                 PyUnicode_WRITE(okind, odata, o++, 'x');
   12717                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
   12718                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
   12719             }
   12720 
   12721             /* Copy ASCII characters as-is */
   12722             else if (ch < 0x7F) {
   12723                 PyUnicode_WRITE(okind, odata, o++, ch);
   12724             }
   12725 
   12726             /* Non-ASCII characters */
   12727             else {
   12728                 /* Map Unicode whitespace and control characters
   12729                    (categories Z* and C* except ASCII space)
   12730                 */
   12731                 if (!Py_UNICODE_ISPRINTABLE(ch)) {
   12732                     PyUnicode_WRITE(okind, odata, o++, '\\');
   12733                     /* Map 8-bit characters to '\xhh' */
   12734                     if (ch <= 0xff) {
   12735                         PyUnicode_WRITE(okind, odata, o++, 'x');
   12736                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
   12737                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
   12738                     }
   12739                     /* Map 16-bit characters to '\uxxxx' */
   12740                     else if (ch <= 0xffff) {
   12741                         PyUnicode_WRITE(okind, odata, o++, 'u');
   12742                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
   12743                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
   12744                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
   12745                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
   12746                     }
   12747                     /* Map 21-bit characters to '\U00xxxxxx' */
   12748                     else {
   12749                         PyUnicode_WRITE(okind, odata, o++, 'U');
   12750                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
   12751                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
   12752                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
   12753                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
   12754                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
   12755                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
   12756                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
   12757                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
   12758                     }
   12759                 }
   12760                 /* Copy characters as-is */
   12761                 else {
   12762                     PyUnicode_WRITE(okind, odata, o++, ch);
   12763                 }
   12764             }
   12765         }
   12766     }
   12767     /* Closing quote already added at the beginning */
   12768     assert(_PyUnicode_CheckConsistency(repr, 1));
   12769     return repr;
   12770 }
   12771 
   12772 PyDoc_STRVAR(rfind__doc__,
   12773              "S.rfind(sub[, start[, end]]) -> int\n\
   12774 \n\
   12775 Return the highest index in S where substring sub is found,\n\
   12776 such that sub is contained within S[start:end].  Optional\n\
   12777 arguments start and end are interpreted as in slice notation.\n\
   12778 \n\
   12779 Return -1 on failure.");
   12780 
   12781 static PyObject *
   12782 unicode_rfind(PyObject *self, PyObject *args)
   12783 {
   12784     /* initialize variables to prevent gcc warning */
   12785     PyObject *substring = NULL;
   12786     Py_ssize_t start = 0;
   12787     Py_ssize_t end = 0;
   12788     Py_ssize_t result;
   12789 
   12790     if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
   12791         return NULL;
   12792 
   12793     if (PyUnicode_READY(self) == -1)
   12794         return NULL;
   12795 
   12796     result = any_find_slice(self, substring, start, end, -1);
   12797 
   12798     if (result == -2)
   12799         return NULL;
   12800 
   12801     return PyLong_FromSsize_t(result);
   12802 }
   12803 
   12804 PyDoc_STRVAR(rindex__doc__,
   12805              "S.rindex(sub[, start[, end]]) -> int\n\
   12806 \n\
   12807 Return the highest index in S where substring sub is found,\n\
   12808 such that sub is contained within S[start:end].  Optional\n\
   12809 arguments start and end are interpreted as in slice notation.\n\
   12810 \n\
   12811 Raises ValueError when the substring is not found.");
   12812 
   12813 static PyObject *
   12814 unicode_rindex(PyObject *self, PyObject *args)
   12815 {
   12816     /* initialize variables to prevent gcc warning */
   12817     PyObject *substring = NULL;
   12818     Py_ssize_t start = 0;
   12819     Py_ssize_t end = 0;
   12820     Py_ssize_t result;
   12821 
   12822     if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
   12823         return NULL;
   12824 
   12825     if (PyUnicode_READY(self) == -1)
   12826         return NULL;
   12827 
   12828     result = any_find_slice(self, substring, start, end, -1);
   12829 
   12830     if (result == -2)
   12831         return NULL;
   12832 
   12833     if (result < 0) {
   12834         PyErr_SetString(PyExc_ValueError, "substring not found");
   12835         return NULL;
   12836     }
   12837 
   12838     return PyLong_FromSsize_t(result);
   12839 }
   12840 
   12841 /*[clinic input]
   12842 str.rjust as unicode_rjust
   12843 
   12844     width: Py_ssize_t
   12845     fillchar: Py_UCS4 = ' '
   12846     /
   12847 
   12848 Return a right-justified string of length width.
   12849 
   12850 Padding is done using the specified fill character (default is a space).
   12851 [clinic start generated code]*/
   12852 
   12853 static PyObject *
   12854 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
   12855 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
   12856 {
   12857     if (PyUnicode_READY(self) == -1)
   12858         return NULL;
   12859 
   12860     if (PyUnicode_GET_LENGTH(self) >= width)
   12861         return unicode_result_unchanged(self);
   12862 
   12863     return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
   12864 }
   12865 
   12866 PyObject *
   12867 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
   12868 {
   12869     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
   12870         return NULL;
   12871 
   12872     return split(s, sep, maxsplit);
   12873 }
   12874 
   12875 /*[clinic input]
   12876 str.split as unicode_split
   12877 
   12878     sep: object = None
   12879         The delimiter according which to split the string.
   12880         None (the default value) means split according to any whitespace,
   12881         and discard empty strings from the result.
   12882     maxsplit: Py_ssize_t = -1
   12883         Maximum number of splits to do.
   12884         -1 (the default value) means no limit.
   12885 
   12886 Return a list of the words in the string, using sep as the delimiter string.
   12887 [clinic start generated code]*/
   12888 
   12889 static PyObject *
   12890 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
   12891 /*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
   12892 {
   12893     if (sep == Py_None)
   12894         return split(self, NULL, maxsplit);
   12895     if (PyUnicode_Check(sep))
   12896         return split(self, sep, maxsplit);
   12897 
   12898     PyErr_Format(PyExc_TypeError,
   12899                  "must be str or None, not %.100s",
   12900                  Py_TYPE(sep)->tp_name);
   12901     return NULL;
   12902 }
   12903 
   12904 PyObject *
   12905 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
   12906 {
   12907     PyObject* out;
   12908     int kind1, kind2;
   12909     void *buf1, *buf2;
   12910     Py_ssize_t len1, len2;
   12911 
   12912     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
   12913         return NULL;
   12914 
   12915     kind1 = PyUnicode_KIND(str_obj);
   12916     kind2 = PyUnicode_KIND(sep_obj);
   12917     len1 = PyUnicode_GET_LENGTH(str_obj);
   12918     len2 = PyUnicode_GET_LENGTH(sep_obj);
   12919     if (kind1 < kind2 || len1 < len2) {
   12920         _Py_INCREF_UNICODE_EMPTY();
   12921         if (!unicode_empty)
   12922             out = NULL;
   12923         else {
   12924             out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
   12925             Py_DECREF(unicode_empty);
   12926         }
   12927         return out;
   12928     }
   12929     buf1 = PyUnicode_DATA(str_obj);
   12930     buf2 = PyUnicode_DATA(sep_obj);
   12931     if (kind2 != kind1) {
   12932         buf2 = _PyUnicode_AsKind(sep_obj, kind1);
   12933         if (!buf2)
   12934             return NULL;
   12935     }
   12936 
   12937     switch (kind1) {
   12938     case PyUnicode_1BYTE_KIND:
   12939         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
   12940             out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12941         else
   12942             out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12943         break;
   12944     case PyUnicode_2BYTE_KIND:
   12945         out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12946         break;
   12947     case PyUnicode_4BYTE_KIND:
   12948         out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12949         break;
   12950     default:
   12951         Py_UNREACHABLE();
   12952     }
   12953 
   12954     if (kind2 != kind1)
   12955         PyMem_Free(buf2);
   12956 
   12957     return out;
   12958 }
   12959 
   12960 
   12961 PyObject *
   12962 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
   12963 {
   12964     PyObject* out;
   12965     int kind1, kind2;
   12966     void *buf1, *buf2;
   12967     Py_ssize_t len1, len2;
   12968 
   12969     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
   12970         return NULL;
   12971 
   12972     kind1 = PyUnicode_KIND(str_obj);
   12973     kind2 = PyUnicode_KIND(sep_obj);
   12974     len1 = PyUnicode_GET_LENGTH(str_obj);
   12975     len2 = PyUnicode_GET_LENGTH(sep_obj);
   12976     if (kind1 < kind2 || len1 < len2) {
   12977         _Py_INCREF_UNICODE_EMPTY();
   12978         if (!unicode_empty)
   12979             out = NULL;
   12980         else {
   12981             out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
   12982             Py_DECREF(unicode_empty);
   12983         }
   12984         return out;
   12985     }
   12986     buf1 = PyUnicode_DATA(str_obj);
   12987     buf2 = PyUnicode_DATA(sep_obj);
   12988     if (kind2 != kind1) {
   12989         buf2 = _PyUnicode_AsKind(sep_obj, kind1);
   12990         if (!buf2)
   12991             return NULL;
   12992     }
   12993 
   12994     switch (kind1) {
   12995     case PyUnicode_1BYTE_KIND:
   12996         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
   12997             out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12998         else
   12999             out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
   13000         break;
   13001     case PyUnicode_2BYTE_KIND:
   13002         out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
   13003         break;
   13004     case PyUnicode_4BYTE_KIND:
   13005         out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
   13006         break;
   13007     default:
   13008         Py_UNREACHABLE();
   13009     }
   13010 
   13011     if (kind2 != kind1)
   13012         PyMem_Free(buf2);
   13013 
   13014     return out;
   13015 }
   13016 
   13017 /*[clinic input]
   13018 str.partition as unicode_partition
   13019 
   13020     sep: object
   13021     /
   13022 
   13023 Partition the string into three parts using the given separator.
   13024 
   13025 This will search for the separator in the string.  If the separator is found,
   13026 returns a 3-tuple containing the part before the separator, the separator
   13027 itself, and the part after it.
   13028 
   13029 If the separator is not found, returns a 3-tuple containing the original string
   13030 and two empty strings.
   13031 [clinic start generated code]*/
   13032 
   13033 static PyObject *
   13034 unicode_partition(PyObject *self, PyObject *sep)
   13035 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
   13036 {
   13037     return PyUnicode_Partition(self, sep);
   13038 }
   13039 
   13040 /*[clinic input]
   13041 str.rpartition as unicode_rpartition = str.partition
   13042 
   13043 Partition the string into three parts using the given separator.
   13044 
   13045 This will search for the separator in the string, starting at the end. If
   13046 the separator is found, returns a 3-tuple containing the part before the
   13047 separator, the separator itself, and the part after it.
   13048 
   13049 If the separator is not found, returns a 3-tuple containing two empty strings
   13050 and the original string.
   13051 [clinic start generated code]*/
   13052 
   13053 static PyObject *
   13054 unicode_rpartition(PyObject *self, PyObject *sep)
   13055 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
   13056 {
   13057     return PyUnicode_RPartition(self, sep);
   13058 }
   13059 
   13060 PyObject *
   13061 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
   13062 {
   13063     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
   13064         return NULL;
   13065 
   13066     return rsplit(s, sep, maxsplit);
   13067 }
   13068 
   13069 /*[clinic input]
   13070 str.rsplit as unicode_rsplit = str.split
   13071 
   13072 Return a list of the words in the string, using sep as the delimiter string.
   13073 
   13074 Splits are done starting at the end of the string and working to the front.
   13075 [clinic start generated code]*/
   13076 
   13077 static PyObject *
   13078 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
   13079 /*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
   13080 {
   13081     if (sep == Py_None)
   13082         return rsplit(self, NULL, maxsplit);
   13083     if (PyUnicode_Check(sep))
   13084         return rsplit(self, sep, maxsplit);
   13085 
   13086     PyErr_Format(PyExc_TypeError,
   13087                  "must be str or None, not %.100s",
   13088                  Py_TYPE(sep)->tp_name);
   13089     return NULL;
   13090 }
   13091 
   13092 /*[clinic input]
   13093 str.splitlines as unicode_splitlines
   13094 
   13095     keepends: bool(accept={int}) = False
   13096 
   13097 Return a list of the lines in the string, breaking at line boundaries.
   13098 
   13099 Line breaks are not included in the resulting list unless keepends is given and
   13100 true.
   13101 [clinic start generated code]*/
   13102 
   13103 static PyObject *
   13104 unicode_splitlines_impl(PyObject *self, int keepends)
   13105 /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
   13106 {
   13107     return PyUnicode_Splitlines(self, keepends);
   13108 }
   13109 
   13110 static
   13111 PyObject *unicode_str(PyObject *self)
   13112 {
   13113     return unicode_result_unchanged(self);
   13114 }
   13115 
   13116 /*[clinic input]
   13117 str.swapcase as unicode_swapcase
   13118 
   13119 Convert uppercase characters to lowercase and lowercase characters to uppercase.
   13120 [clinic start generated code]*/
   13121 
   13122 static PyObject *
   13123 unicode_swapcase_impl(PyObject *self)
   13124 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
   13125 {
   13126     if (PyUnicode_READY(self) == -1)
   13127         return NULL;
   13128     return case_operation(self, do_swapcase);
   13129 }
   13130 
   13131 /*[clinic input]
   13132 
   13133 @staticmethod
   13134 str.maketrans as unicode_maketrans
   13135 
   13136   x: object
   13137 
   13138   y: unicode=NULL
   13139 
   13140   z: unicode=NULL
   13141 
   13142   /
   13143 
   13144 Return a translation table usable for str.translate().
   13145 
   13146 If there is only one argument, it must be a dictionary mapping Unicode
   13147 ordinals (integers) or characters to Unicode ordinals, strings or None.
   13148 Character keys will be then converted to ordinals.
   13149 If there are two arguments, they must be strings of equal length, and
   13150 in the resulting dictionary, each character in x will be mapped to the
   13151 character at the same position in y. If there is a third argument, it
   13152 must be a string, whose characters will be mapped to None in the result.
   13153 [clinic start generated code]*/
   13154 
   13155 static PyObject *
   13156 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
   13157 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
   13158 {
   13159     PyObject *new = NULL, *key, *value;
   13160     Py_ssize_t i = 0;
   13161     int res;
   13162 
   13163     new = PyDict_New();
   13164     if (!new)
   13165         return NULL;
   13166     if (y != NULL) {
   13167         int x_kind, y_kind, z_kind;
   13168         void *x_data, *y_data, *z_data;
   13169 
   13170         /* x must be a string too, of equal length */
   13171         if (!PyUnicode_Check(x)) {
   13172             PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
   13173                             "be a string if there is a second argument");
   13174             goto err;
   13175         }
   13176         if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
   13177             PyErr_SetString(PyExc_ValueError, "the first two maketrans "
   13178                             "arguments must have equal length");
   13179             goto err;
   13180         }
   13181         /* create entries for translating chars in x to those in y */
   13182         x_kind = PyUnicode_KIND(x);
   13183         y_kind = PyUnicode_KIND(y);
   13184         x_data = PyUnicode_DATA(x);
   13185         y_data = PyUnicode_DATA(y);
   13186         for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
   13187             key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
   13188             if (!key)
   13189                 goto err;
   13190             value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
   13191             if (!value) {
   13192                 Py_DECREF(key);
   13193                 goto err;
   13194             }
   13195             res = PyDict_SetItem(new, key, value);
   13196             Py_DECREF(key);
   13197             Py_DECREF(value);
   13198             if (res < 0)
   13199                 goto err;
   13200         }
   13201         /* create entries for deleting chars in z */
   13202         if (z != NULL) {
   13203             z_kind = PyUnicode_KIND(z);
   13204             z_data = PyUnicode_DATA(z);
   13205             for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
   13206                 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
   13207                 if (!key)
   13208                     goto err;
   13209                 res = PyDict_SetItem(new, key, Py_None);
   13210                 Py_DECREF(key);
   13211                 if (res < 0)
   13212                     goto err;
   13213             }
   13214         }
   13215     } else {
   13216         int kind;
   13217         void *data;
   13218 
   13219         /* x must be a dict */
   13220         if (!PyDict_CheckExact(x)) {
   13221             PyErr_SetString(PyExc_TypeError, "if you give only one argument "
   13222                             "to maketrans it must be a dict");
   13223             goto err;
   13224         }
   13225         /* copy entries into the new dict, converting string keys to int keys */
   13226         while (PyDict_Next(x, &i, &key, &value)) {
   13227             if (PyUnicode_Check(key)) {
   13228                 /* convert string keys to integer keys */
   13229                 PyObject *newkey;
   13230                 if (PyUnicode_GET_LENGTH(key) != 1) {
   13231                     PyErr_SetString(PyExc_ValueError, "string keys in translate "
   13232                                     "table must be of length 1");
   13233                     goto err;
   13234                 }
   13235                 kind = PyUnicode_KIND(key);
   13236                 data = PyUnicode_DATA(key);
   13237                 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
   13238                 if (!newkey)
   13239                     goto err;
   13240                 res = PyDict_SetItem(new, newkey, value);
   13241                 Py_DECREF(newkey);
   13242                 if (res < 0)
   13243                     goto err;
   13244             } else if (PyLong_Check(key)) {
   13245                 /* just keep integer keys */
   13246                 if (PyDict_SetItem(new, key, value) < 0)
   13247                     goto err;
   13248             } else {
   13249                 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
   13250                                 "be strings or integers");
   13251                 goto err;
   13252             }
   13253         }
   13254     }
   13255     return new;
   13256   err:
   13257     Py_DECREF(new);
   13258     return NULL;
   13259 }
   13260 
   13261 /*[clinic input]
   13262 str.translate as unicode_translate
   13263 
   13264     table: object
   13265         Translation table, which must be a mapping of Unicode ordinals to
   13266         Unicode ordinals, strings, or None.
   13267     /
   13268 
   13269 Replace each character in the string using the given translation table.
   13270 
   13271 The table must implement lookup/indexing via __getitem__, for instance a
   13272 dictionary or list.  If this operation raises LookupError, the character is
   13273 left untouched.  Characters mapped to None are deleted.
   13274 [clinic start generated code]*/
   13275 
   13276 static PyObject *
   13277 unicode_translate(PyObject *self, PyObject *table)
   13278 /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
   13279 {
   13280     return _PyUnicode_TranslateCharmap(self, table, "ignore");
   13281 }
   13282 
   13283 /*[clinic input]
   13284 str.upper as unicode_upper
   13285 
   13286 Return a copy of the string converted to uppercase.
   13287 [clinic start generated code]*/
   13288 
   13289 static PyObject *
   13290 unicode_upper_impl(PyObject *self)
   13291 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
   13292 {
   13293     if (PyUnicode_READY(self) == -1)
   13294         return NULL;
   13295     if (PyUnicode_IS_ASCII(self))
   13296         return ascii_upper_or_lower(self, 0);
   13297     return case_operation(self, do_upper);
   13298 }
   13299 
   13300 /*[clinic input]
   13301 str.zfill as unicode_zfill
   13302 
   13303     width: Py_ssize_t
   13304     /
   13305 
   13306 Pad a numeric string with zeros on the left, to fill a field of the given width.
   13307 
   13308 The string is never truncated.
   13309 [clinic start generated code]*/
   13310 
   13311 static PyObject *
   13312 unicode_zfill_impl(PyObject *self, Py_ssize_t width)
   13313 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
   13314 {
   13315     Py_ssize_t fill;
   13316     PyObject *u;
   13317     int kind;
   13318     void *data;
   13319     Py_UCS4 chr;
   13320 
   13321     if (PyUnicode_READY(self) == -1)
   13322         return NULL;
   13323 
   13324     if (PyUnicode_GET_LENGTH(self) >= width)
   13325         return unicode_result_unchanged(self);
   13326 
   13327     fill = width - PyUnicode_GET_LENGTH(self);
   13328 
   13329     u = pad(self, fill, 0, '0');
   13330 
   13331     if (u == NULL)
   13332         return NULL;
   13333 
   13334     kind = PyUnicode_KIND(u);
   13335     data = PyUnicode_DATA(u);
   13336     chr = PyUnicode_READ(kind, data, fill);
   13337 
   13338     if (chr == '+' || chr == '-') {
   13339         /* move sign to beginning of string */
   13340         PyUnicode_WRITE(kind, data, 0, chr);
   13341         PyUnicode_WRITE(kind, data, fill, '0');
   13342     }
   13343 
   13344     assert(_PyUnicode_CheckConsistency(u, 1));
   13345     return u;
   13346 }
   13347 
   13348 #if 0
   13349 static PyObject *
   13350 unicode__decimal2ascii(PyObject *self)
   13351 {
   13352     return PyUnicode_TransformDecimalAndSpaceToASCII(self);
   13353 }
   13354 #endif
   13355 
   13356 PyDoc_STRVAR(startswith__doc__,
   13357              "S.startswith(prefix[, start[, end]]) -> bool\n\
   13358 \n\
   13359 Return True if S starts with the specified prefix, False otherwise.\n\
   13360 With optional start, test S beginning at that position.\n\
   13361 With optional end, stop comparing S at that position.\n\
   13362 prefix can also be a tuple of strings to try.");
   13363 
   13364 static PyObject *
   13365 unicode_startswith(PyObject *self,
   13366                    PyObject *args)
   13367 {
   13368     PyObject *subobj;
   13369     PyObject *substring;
   13370     Py_ssize_t start = 0;
   13371     Py_ssize_t end = PY_SSIZE_T_MAX;
   13372     int result;
   13373 
   13374     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
   13375         return NULL;
   13376     if (PyTuple_Check(subobj)) {
   13377         Py_ssize_t i;
   13378         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
   13379             substring = PyTuple_GET_ITEM(subobj, i);
   13380             if (!PyUnicode_Check(substring)) {
   13381                 PyErr_Format(PyExc_TypeError,
   13382                              "tuple for startswith must only contain str, "
   13383                              "not %.100s",
   13384                              Py_TYPE(substring)->tp_name);
   13385                 return NULL;
   13386             }
   13387             result = tailmatch(self, substring, start, end, -1);
   13388             if (result == -1)
   13389                 return NULL;
   13390             if (result) {
   13391                 Py_RETURN_TRUE;
   13392             }
   13393         }
   13394         /* nothing matched */
   13395         Py_RETURN_FALSE;
   13396     }
   13397     if (!PyUnicode_Check(subobj)) {
   13398         PyErr_Format(PyExc_TypeError,
   13399                      "startswith first arg must be str or "
   13400                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
   13401         return NULL;
   13402     }
   13403     result = tailmatch(self, subobj, start, end, -1);
   13404     if (result == -1)
   13405         return NULL;
   13406     return PyBool_FromLong(result);
   13407 }
   13408 
   13409 
   13410 PyDoc_STRVAR(endswith__doc__,
   13411              "S.endswith(suffix[, start[, end]]) -> bool\n\
   13412 \n\
   13413 Return True if S ends with the specified suffix, False otherwise.\n\
   13414 With optional start, test S beginning at that position.\n\
   13415 With optional end, stop comparing S at that position.\n\
   13416 suffix can also be a tuple of strings to try.");
   13417 
   13418 static PyObject *
   13419 unicode_endswith(PyObject *self,
   13420                  PyObject *args)
   13421 {
   13422     PyObject *subobj;
   13423     PyObject *substring;
   13424     Py_ssize_t start = 0;
   13425     Py_ssize_t end = PY_SSIZE_T_MAX;
   13426     int result;
   13427 
   13428     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
   13429         return NULL;
   13430     if (PyTuple_Check(subobj)) {
   13431         Py_ssize_t i;
   13432         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
   13433             substring = PyTuple_GET_ITEM(subobj, i);
   13434             if (!PyUnicode_Check(substring)) {
   13435                 PyErr_Format(PyExc_TypeError,
   13436                              "tuple for endswith must only contain str, "
   13437                              "not %.100s",
   13438                              Py_TYPE(substring)->tp_name);
   13439                 return NULL;
   13440             }
   13441             result = tailmatch(self, substring, start, end, +1);
   13442             if (result == -1)
   13443                 return NULL;
   13444             if (result) {
   13445                 Py_RETURN_TRUE;
   13446             }
   13447         }
   13448         Py_RETURN_FALSE;
   13449     }
   13450     if (!PyUnicode_Check(subobj)) {
   13451         PyErr_Format(PyExc_TypeError,
   13452                      "endswith first arg must be str or "
   13453                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
   13454         return NULL;
   13455     }
   13456     result = tailmatch(self, subobj, start, end, +1);
   13457     if (result == -1)
   13458         return NULL;
   13459     return PyBool_FromLong(result);
   13460 }
   13461 
   13462 static inline void
   13463 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
   13464 {
   13465     writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
   13466     writer->data = PyUnicode_DATA(writer->buffer);
   13467 
   13468     if (!writer->readonly) {
   13469         writer->kind = PyUnicode_KIND(writer->buffer);
   13470         writer->size = PyUnicode_GET_LENGTH(writer->buffer);
   13471     }
   13472     else {
   13473         /* use a value smaller than PyUnicode_1BYTE_KIND() so
   13474            _PyUnicodeWriter_PrepareKind() will copy the buffer. */
   13475         writer->kind = PyUnicode_WCHAR_KIND;
   13476         assert(writer->kind <= PyUnicode_1BYTE_KIND);
   13477 
   13478         /* Copy-on-write mode: set buffer size to 0 so
   13479          * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
   13480          * next write. */
   13481         writer->size = 0;
   13482     }
   13483 }
   13484 
   13485 void
   13486 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
   13487 {
   13488     memset(writer, 0, sizeof(*writer));
   13489 
   13490     /* ASCII is the bare minimum */
   13491     writer->min_char = 127;
   13492 
   13493     /* use a value smaller than PyUnicode_1BYTE_KIND() so
   13494        _PyUnicodeWriter_PrepareKind() will copy the buffer. */
   13495     writer->kind = PyUnicode_WCHAR_KIND;
   13496     assert(writer->kind <= PyUnicode_1BYTE_KIND);
   13497 }
   13498 
   13499 int
   13500 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
   13501                                  Py_ssize_t length, Py_UCS4 maxchar)
   13502 {
   13503     Py_ssize_t newlen;
   13504     PyObject *newbuffer;
   13505 
   13506     assert(maxchar <= MAX_UNICODE);
   13507 
   13508     /* ensure that the _PyUnicodeWriter_Prepare macro was used */
   13509     assert((maxchar > writer->maxchar && length >= 0)
   13510            || length > 0);
   13511 
   13512     if (length > PY_SSIZE_T_MAX - writer->pos) {
   13513         PyErr_NoMemory();
   13514         return -1;
   13515     }
   13516     newlen = writer->pos + length;
   13517 
   13518     maxchar = Py_MAX(maxchar, writer->min_char);
   13519 
   13520     if (writer->buffer == NULL) {
   13521         assert(!writer->readonly);
   13522         if (writer->overallocate
   13523             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
   13524             /* overallocate to limit the number of realloc() */
   13525             newlen += newlen / OVERALLOCATE_FACTOR;
   13526         }
   13527         if (newlen < writer->min_length)
   13528             newlen = writer->min_length;
   13529 
   13530         writer->buffer = PyUnicode_New(newlen, maxchar);
   13531         if (writer->buffer == NULL)
   13532             return -1;
   13533     }
   13534     else if (newlen > writer->size) {
   13535         if (writer->overallocate
   13536             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
   13537             /* overallocate to limit the number of realloc() */
   13538             newlen += newlen / OVERALLOCATE_FACTOR;
   13539         }
   13540         if (newlen < writer->min_length)
   13541             newlen = writer->min_length;
   13542 
   13543         if (maxchar > writer->maxchar || writer->readonly) {
   13544             /* resize + widen */
   13545             maxchar = Py_MAX(maxchar, writer->maxchar);
   13546             newbuffer = PyUnicode_New(newlen, maxchar);
   13547             if (newbuffer == NULL)
   13548                 return -1;
   13549             _PyUnicode_FastCopyCharacters(newbuffer, 0,
   13550                                           writer->buffer, 0, writer->pos);
   13551             Py_DECREF(writer->buffer);
   13552             writer->readonly = 0;
   13553         }
   13554         else {
   13555             newbuffer = resize_compact(writer->buffer, newlen);
   13556             if (newbuffer == NULL)
   13557                 return -1;
   13558         }
   13559         writer->buffer = newbuffer;
   13560     }
   13561     else if (maxchar > writer->maxchar) {
   13562         assert(!writer->readonly);
   13563         newbuffer = PyUnicode_New(writer->size, maxchar);
   13564         if (newbuffer == NULL)
   13565             return -1;
   13566         _PyUnicode_FastCopyCharacters(newbuffer, 0,
   13567                                       writer->buffer, 0, writer->pos);
   13568         Py_SETREF(writer->buffer, newbuffer);
   13569     }
   13570     _PyUnicodeWriter_Update(writer);
   13571     return 0;
   13572 
   13573 #undef OVERALLOCATE_FACTOR
   13574 }
   13575 
   13576 int
   13577 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
   13578                                      enum PyUnicode_Kind kind)
   13579 {
   13580     Py_UCS4 maxchar;
   13581 
   13582     /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
   13583     assert(writer->kind < kind);
   13584 
   13585     switch (kind)
   13586     {
   13587     case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
   13588     case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
   13589     case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
   13590     default:
   13591         Py_UNREACHABLE();
   13592     }
   13593 
   13594     return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
   13595 }
   13596 
   13597 static inline int
   13598 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
   13599 {
   13600     assert(ch <= MAX_UNICODE);
   13601     if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
   13602         return -1;
   13603     PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
   13604     writer->pos++;
   13605     return 0;
   13606 }
   13607 
   13608 int
   13609 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
   13610 {
   13611     return _PyUnicodeWriter_WriteCharInline(writer, ch);
   13612 }
   13613 
   13614 int
   13615 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
   13616 {
   13617     Py_UCS4 maxchar;
   13618     Py_ssize_t len;
   13619 
   13620     if (PyUnicode_READY(str) == -1)
   13621         return -1;
   13622     len = PyUnicode_GET_LENGTH(str);
   13623     if (len == 0)
   13624         return 0;
   13625     maxchar = PyUnicode_MAX_CHAR_VALUE(str);
   13626     if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
   13627         if (writer->buffer == NULL && !writer->overallocate) {
   13628             assert(_PyUnicode_CheckConsistency(str, 1));
   13629             writer->readonly = 1;
   13630             Py_INCREF(str);
   13631             writer->buffer = str;
   13632             _PyUnicodeWriter_Update(writer);
   13633             writer->pos += len;
   13634             return 0;
   13635         }
   13636         if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
   13637             return -1;
   13638     }
   13639     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
   13640                                   str, 0, len);
   13641     writer->pos += len;
   13642     return 0;
   13643 }
   13644 
   13645 int
   13646 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
   13647                                 Py_ssize_t start, Py_ssize_t end)
   13648 {
   13649     Py_UCS4 maxchar;
   13650     Py_ssize_t len;
   13651 
   13652     if (PyUnicode_READY(str) == -1)
   13653         return -1;
   13654 
   13655     assert(0 <= start);
   13656     assert(end <= PyUnicode_GET_LENGTH(str));
   13657     assert(start <= end);
   13658 
   13659     if (end == 0)
   13660         return 0;
   13661 
   13662     if (start == 0 && end == PyUnicode_GET_LENGTH(str))
   13663         return _PyUnicodeWriter_WriteStr(writer, str);
   13664 
   13665     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
   13666         maxchar = _PyUnicode_FindMaxChar(str, start, end);
   13667     else
   13668         maxchar = writer->maxchar;
   13669     len = end - start;
   13670 
   13671     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
   13672         return -1;
   13673 
   13674     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
   13675                                   str, start, len);
   13676     writer->pos += len;
   13677     return 0;
   13678 }
   13679 
   13680 int
   13681 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
   13682                                   const char *ascii, Py_ssize_t len)
   13683 {
   13684     if (len == -1)
   13685         len = strlen(ascii);
   13686 
   13687     assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
   13688 
   13689     if (writer->buffer == NULL && !writer->overallocate) {
   13690         PyObject *str;
   13691 
   13692         str = _PyUnicode_FromASCII(ascii, len);
   13693         if (str == NULL)
   13694             return -1;
   13695 
   13696         writer->readonly = 1;
   13697         writer->buffer = str;
   13698         _PyUnicodeWriter_Update(writer);
   13699         writer->pos += len;
   13700         return 0;
   13701     }
   13702 
   13703     if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
   13704         return -1;
   13705 
   13706     switch (writer->kind)
   13707     {
   13708     case PyUnicode_1BYTE_KIND:
   13709     {
   13710         const Py_UCS1 *str = (const Py_UCS1 *)ascii;
   13711         Py_UCS1 *data = writer->data;
   13712 
   13713         memcpy(data + writer->pos, str, len);
   13714         break;
   13715     }
   13716     case PyUnicode_2BYTE_KIND:
   13717     {
   13718         _PyUnicode_CONVERT_BYTES(
   13719             Py_UCS1, Py_UCS2,
   13720             ascii, ascii + len,
   13721             (Py_UCS2 *)writer->data + writer->pos);
   13722         break;
   13723     }
   13724     case PyUnicode_4BYTE_KIND:
   13725     {
   13726         _PyUnicode_CONVERT_BYTES(
   13727             Py_UCS1, Py_UCS4,
   13728             ascii, ascii + len,
   13729             (Py_UCS4 *)writer->data + writer->pos);
   13730         break;
   13731     }
   13732     default:
   13733         Py_UNREACHABLE();
   13734     }
   13735 
   13736     writer->pos += len;
   13737     return 0;
   13738 }
   13739 
   13740 int
   13741 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
   13742                                    const char *str, Py_ssize_t len)
   13743 {
   13744     Py_UCS4 maxchar;
   13745 
   13746     maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
   13747     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
   13748         return -1;
   13749     unicode_write_cstr(writer->buffer, writer->pos, str, len);
   13750     writer->pos += len;
   13751     return 0;
   13752 }
   13753 
   13754 PyObject *
   13755 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
   13756 {
   13757     PyObject *str;
   13758 
   13759     if (writer->pos == 0) {
   13760         Py_CLEAR(writer->buffer);
   13761         _Py_RETURN_UNICODE_EMPTY();
   13762     }
   13763 
   13764     str = writer->buffer;
   13765     writer->buffer = NULL;
   13766 
   13767     if (writer->readonly) {
   13768         assert(PyUnicode_GET_LENGTH(str) == writer->pos);
   13769         return str;
   13770     }
   13771 
   13772     if (PyUnicode_GET_LENGTH(str) != writer->pos) {
   13773         PyObject *str2;
   13774         str2 = resize_compact(str, writer->pos);
   13775         if (str2 == NULL) {
   13776             Py_DECREF(str);
   13777             return NULL;
   13778         }
   13779         str = str2;
   13780     }
   13781 
   13782     assert(_PyUnicode_CheckConsistency(str, 1));
   13783     return unicode_result_ready(str);
   13784 }
   13785 
   13786 void
   13787 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
   13788 {
   13789     Py_CLEAR(writer->buffer);
   13790 }
   13791 
   13792 #include "stringlib/unicode_format.h"
   13793 
   13794 PyDoc_STRVAR(format__doc__,
   13795              "S.format(*args, **kwargs) -> str\n\
   13796 \n\
   13797 Return a formatted version of S, using substitutions from args and kwargs.\n\
   13798 The substitutions are identified by braces ('{' and '}').");
   13799 
   13800 PyDoc_STRVAR(format_map__doc__,
   13801              "S.format_map(mapping) -> str\n\
   13802 \n\
   13803 Return a formatted version of S, using substitutions from mapping.\n\
   13804 The substitutions are identified by braces ('{' and '}').");
   13805 
   13806 /*[clinic input]
   13807 str.__format__ as unicode___format__
   13808 
   13809     format_spec: unicode
   13810     /
   13811 
   13812 Return a formatted version of the string as described by format_spec.
   13813 [clinic start generated code]*/
   13814 
   13815 static PyObject *
   13816 unicode___format___impl(PyObject *self, PyObject *format_spec)
   13817 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
   13818 {
   13819     _PyUnicodeWriter writer;
   13820     int ret;
   13821 
   13822     if (PyUnicode_READY(self) == -1)
   13823         return NULL;
   13824     _PyUnicodeWriter_Init(&writer);
   13825     ret = _PyUnicode_FormatAdvancedWriter(&writer,
   13826                                           self, format_spec, 0,
   13827                                           PyUnicode_GET_LENGTH(format_spec));
   13828     if (ret == -1) {
   13829         _PyUnicodeWriter_Dealloc(&writer);
   13830         return NULL;
   13831     }
   13832     return _PyUnicodeWriter_Finish(&writer);
   13833 }
   13834 
   13835 /*[clinic input]
   13836 str.__sizeof__ as unicode_sizeof
   13837 
   13838 Return the size of the string in memory, in bytes.
   13839 [clinic start generated code]*/
   13840 
   13841 static PyObject *
   13842 unicode_sizeof_impl(PyObject *self)
   13843 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
   13844 {
   13845     Py_ssize_t size;
   13846 
   13847     /* If it's a compact object, account for base structure +
   13848        character data. */
   13849     if (PyUnicode_IS_COMPACT_ASCII(self))
   13850         size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
   13851     else if (PyUnicode_IS_COMPACT(self))
   13852         size = sizeof(PyCompactUnicodeObject) +
   13853             (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
   13854     else {
   13855         /* If it is a two-block object, account for base object, and
   13856            for character block if present. */
   13857         size = sizeof(PyUnicodeObject);
   13858         if (_PyUnicode_DATA_ANY(self))
   13859             size += (PyUnicode_GET_LENGTH(self) + 1) *
   13860                 PyUnicode_KIND(self);
   13861     }
   13862     /* If the wstr pointer is present, account for it unless it is shared
   13863        with the data pointer. Check if the data is not shared. */
   13864     if (_PyUnicode_HAS_WSTR_MEMORY(self))
   13865         size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
   13866     if (_PyUnicode_HAS_UTF8_MEMORY(self))
   13867         size += PyUnicode_UTF8_LENGTH(self) + 1;
   13868 
   13869     return PyLong_FromSsize_t(size);
   13870 }
   13871 
   13872 static PyObject *
   13873 unicode_getnewargs(PyObject *v)
   13874 {
   13875     PyObject *copy = _PyUnicode_Copy(v);
   13876     if (!copy)
   13877         return NULL;
   13878     return Py_BuildValue("(N)", copy);
   13879 }
   13880 
   13881 static PyMethodDef unicode_methods[] = {
   13882     UNICODE_ENCODE_METHODDEF
   13883     UNICODE_REPLACE_METHODDEF
   13884     UNICODE_SPLIT_METHODDEF
   13885     UNICODE_RSPLIT_METHODDEF
   13886     UNICODE_JOIN_METHODDEF
   13887     UNICODE_CAPITALIZE_METHODDEF
   13888     UNICODE_CASEFOLD_METHODDEF
   13889     UNICODE_TITLE_METHODDEF
   13890     UNICODE_CENTER_METHODDEF
   13891     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
   13892     UNICODE_EXPANDTABS_METHODDEF
   13893     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
   13894     UNICODE_PARTITION_METHODDEF
   13895     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
   13896     UNICODE_LJUST_METHODDEF
   13897     UNICODE_LOWER_METHODDEF
   13898     UNICODE_LSTRIP_METHODDEF
   13899     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
   13900     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
   13901     UNICODE_RJUST_METHODDEF
   13902     UNICODE_RSTRIP_METHODDEF
   13903     UNICODE_RPARTITION_METHODDEF
   13904     UNICODE_SPLITLINES_METHODDEF
   13905     UNICODE_STRIP_METHODDEF
   13906     UNICODE_SWAPCASE_METHODDEF
   13907     UNICODE_TRANSLATE_METHODDEF
   13908     UNICODE_UPPER_METHODDEF
   13909     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
   13910     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
   13911     UNICODE_ISASCII_METHODDEF
   13912     UNICODE_ISLOWER_METHODDEF
   13913     UNICODE_ISUPPER_METHODDEF
   13914     UNICODE_ISTITLE_METHODDEF
   13915     UNICODE_ISSPACE_METHODDEF
   13916     UNICODE_ISDECIMAL_METHODDEF
   13917     UNICODE_ISDIGIT_METHODDEF
   13918     UNICODE_ISNUMERIC_METHODDEF
   13919     UNICODE_ISALPHA_METHODDEF
   13920     UNICODE_ISALNUM_METHODDEF
   13921     UNICODE_ISIDENTIFIER_METHODDEF
   13922     UNICODE_ISPRINTABLE_METHODDEF
   13923     UNICODE_ZFILL_METHODDEF
   13924     {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
   13925     {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
   13926     UNICODE___FORMAT___METHODDEF
   13927     UNICODE_MAKETRANS_METHODDEF
   13928     UNICODE_SIZEOF_METHODDEF
   13929 #if 0
   13930     /* These methods are just used for debugging the implementation. */
   13931     {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
   13932 #endif
   13933 
   13934     {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
   13935     {NULL, NULL}
   13936 };
   13937 
   13938 static PyObject *
   13939 unicode_mod(PyObject *v, PyObject *w)
   13940 {
   13941     if (!PyUnicode_Check(v))
   13942         Py_RETURN_NOTIMPLEMENTED;
   13943     return PyUnicode_Format(v, w);
   13944 }
   13945 
   13946 static PyNumberMethods unicode_as_number = {
   13947     0,              /*nb_add*/
   13948     0,              /*nb_subtract*/
   13949     0,              /*nb_multiply*/
   13950     unicode_mod,            /*nb_remainder*/
   13951 };
   13952 
   13953 static PySequenceMethods unicode_as_sequence = {
   13954     (lenfunc) unicode_length,       /* sq_length */
   13955     PyUnicode_Concat,           /* sq_concat */
   13956     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
   13957     (ssizeargfunc) unicode_getitem,     /* sq_item */
   13958     0,                  /* sq_slice */
   13959     0,                  /* sq_ass_item */
   13960     0,                  /* sq_ass_slice */
   13961     PyUnicode_Contains,         /* sq_contains */
   13962 };
   13963 
   13964 static PyObject*
   13965 unicode_subscript(PyObject* self, PyObject* item)
   13966 {
   13967     if (PyUnicode_READY(self) == -1)
   13968         return NULL;
   13969 
   13970     if (PyIndex_Check(item)) {
   13971         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
   13972         if (i == -1 && PyErr_Occurred())
   13973             return NULL;
   13974         if (i < 0)
   13975             i += PyUnicode_GET_LENGTH(self);
   13976         return unicode_getitem(self, i);
   13977     } else if (PySlice_Check(item)) {
   13978         Py_ssize_t start, stop, step, slicelength, cur, i;
   13979         PyObject *result;
   13980         void *src_data, *dest_data;
   13981         int src_kind, dest_kind;
   13982         Py_UCS4 ch, max_char, kind_limit;
   13983 
   13984         if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
   13985             return NULL;
   13986         }
   13987         slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
   13988                                             &start, &stop, step);
   13989 
   13990         if (slicelength <= 0) {
   13991             _Py_RETURN_UNICODE_EMPTY();
   13992         } else if (start == 0 && step == 1 &&
   13993                    slicelength == PyUnicode_GET_LENGTH(self)) {
   13994             return unicode_result_unchanged(self);
   13995         } else if (step == 1) {
   13996             return PyUnicode_Substring(self,
   13997                                        start, start + slicelength);
   13998         }
   13999         /* General case */
   14000         src_kind = PyUnicode_KIND(self);
   14001         src_data = PyUnicode_DATA(self);
   14002         if (!PyUnicode_IS_ASCII(self)) {
   14003             kind_limit = kind_maxchar_limit(src_kind);
   14004             max_char = 0;
   14005             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
   14006                 ch = PyUnicode_READ(src_kind, src_data, cur);
   14007                 if (ch > max_char) {
   14008                     max_char = ch;
   14009                     if (max_char >= kind_limit)
   14010                         break;
   14011                 }
   14012             }
   14013         }
   14014         else
   14015             max_char = 127;
   14016         result = PyUnicode_New(slicelength, max_char);
   14017         if (result == NULL)
   14018             return NULL;
   14019         dest_kind = PyUnicode_KIND(result);
   14020         dest_data = PyUnicode_DATA(result);
   14021 
   14022         for (cur = start, i = 0; i < slicelength; cur += step, i++) {
   14023             Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
   14024             PyUnicode_WRITE(dest_kind, dest_data, i, ch);
   14025         }
   14026         assert(_PyUnicode_CheckConsistency(result, 1));
   14027         return result;
   14028     } else {
   14029         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
   14030         return NULL;
   14031     }
   14032 }
   14033 
   14034 static PyMappingMethods unicode_as_mapping = {
   14035     (lenfunc)unicode_length,        /* mp_length */
   14036     (binaryfunc)unicode_subscript,  /* mp_subscript */
   14037     (objobjargproc)0,           /* mp_ass_subscript */
   14038 };
   14039 
   14040 
   14041 /* Helpers for PyUnicode_Format() */
   14042 
   14043 struct unicode_formatter_t {
   14044     PyObject *args;
   14045     int args_owned;
   14046     Py_ssize_t arglen, argidx;
   14047     PyObject *dict;
   14048 
   14049     enum PyUnicode_Kind fmtkind;
   14050     Py_ssize_t fmtcnt, fmtpos;
   14051     void *fmtdata;
   14052     PyObject *fmtstr;
   14053 
   14054     _PyUnicodeWriter writer;
   14055 };
   14056 
   14057 struct unicode_format_arg_t {
   14058     Py_UCS4 ch;
   14059     int flags;
   14060     Py_ssize_t width;
   14061     int prec;
   14062     int sign;
   14063 };
   14064 
   14065 static PyObject *
   14066 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
   14067 {
   14068     Py_ssize_t argidx = ctx->argidx;
   14069 
   14070     if (argidx < ctx->arglen) {
   14071         ctx->argidx++;
   14072         if (ctx->arglen < 0)
   14073             return ctx->args;
   14074         else
   14075             return PyTuple_GetItem(ctx->args, argidx);
   14076     }
   14077     PyErr_SetString(PyExc_TypeError,
   14078                     "not enough arguments for format string");
   14079     return NULL;
   14080 }
   14081 
   14082 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
   14083 
   14084 /* Format a float into the writer if the writer is not NULL, or into *p_output
   14085    otherwise.
   14086 
   14087    Return 0 on success, raise an exception and return -1 on error. */
   14088 static int
   14089 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
   14090             PyObject **p_output,
   14091             _PyUnicodeWriter *writer)
   14092 {
   14093     char *p;
   14094     double x;
   14095     Py_ssize_t len;
   14096     int prec;
   14097     int dtoa_flags;
   14098 
   14099     x = PyFloat_AsDouble(v);
   14100     if (x == -1.0 && PyErr_Occurred())
   14101         return -1;
   14102 
   14103     prec = arg->prec;
   14104     if (prec < 0)
   14105         prec = 6;
   14106 
   14107     if (arg->flags & F_ALT)
   14108         dtoa_flags = Py_DTSF_ALT;
   14109     else
   14110         dtoa_flags = 0;
   14111     p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
   14112     if (p == NULL)
   14113         return -1;
   14114     len = strlen(p);
   14115     if (writer) {
   14116         if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
   14117             PyMem_Free(p);
   14118             return -1;
   14119         }
   14120     }
   14121     else
   14122         *p_output = _PyUnicode_FromASCII(p, len);
   14123     PyMem_Free(p);
   14124     return 0;
   14125 }
   14126 
   14127 /* formatlong() emulates the format codes d, u, o, x and X, and
   14128  * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
   14129  * Python's regular ints.
   14130  * Return value:  a new PyUnicodeObject*, or NULL if error.
   14131  *     The output string is of the form
   14132  *         "-"? ("0x" | "0X")? digit+
   14133  *     "0x"/"0X" are present only for x and X conversions, with F_ALT
   14134  *         set in flags.  The case of hex digits will be correct,
   14135  *     There will be at least prec digits, zero-filled on the left if
   14136  *         necessary to get that many.
   14137  * val          object to be converted
   14138  * flags        bitmask of format flags; only F_ALT is looked at
   14139  * prec         minimum number of digits; 0-fill on left if needed
   14140  * type         a character in [duoxX]; u acts the same as d
   14141  *
   14142  * CAUTION:  o, x and X conversions on regular ints can never
   14143  * produce a '-' sign, but can for Python's unbounded ints.
   14144  */
   14145 PyObject *
   14146 _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
   14147 {
   14148     PyObject *result = NULL;
   14149     char *buf;
   14150     Py_ssize_t i;
   14151     int sign;           /* 1 if '-', else 0 */
   14152     int len;            /* number of characters */
   14153     Py_ssize_t llen;
   14154     int numdigits;      /* len == numnondigits + numdigits */
   14155     int numnondigits = 0;
   14156 
   14157     /* Avoid exceeding SSIZE_T_MAX */
   14158     if (prec > INT_MAX-3) {
   14159         PyErr_SetString(PyExc_OverflowError,
   14160                         "precision too large");
   14161         return NULL;
   14162     }
   14163 
   14164     assert(PyLong_Check(val));
   14165 
   14166     switch (type) {
   14167     default:
   14168         Py_UNREACHABLE();
   14169     case 'd':
   14170     case 'i':
   14171     case 'u':
   14172         /* int and int subclasses should print numerically when a numeric */
   14173         /* format code is used (see issue18780) */
   14174         result = PyNumber_ToBase(val, 10);
   14175         break;
   14176     case 'o':
   14177         numnondigits = 2;
   14178         result = PyNumber_ToBase(val, 8);
   14179         break;
   14180     case 'x':
   14181     case 'X':
   14182         numnondigits = 2;
   14183         result = PyNumber_ToBase(val, 16);
   14184         break;
   14185     }
   14186     if (!result)
   14187         return NULL;
   14188 
   14189     assert(unicode_modifiable(result));
   14190     assert(PyUnicode_IS_READY(result));
   14191     assert(PyUnicode_IS_ASCII(result));
   14192 
   14193     /* To modify the string in-place, there can only be one reference. */
   14194     if (Py_REFCNT(result) != 1) {
   14195         Py_DECREF(result);
   14196         PyErr_BadInternalCall();
   14197         return NULL;
   14198     }
   14199     buf = PyUnicode_DATA(result);
   14200     llen = PyUnicode_GET_LENGTH(result);
   14201     if (llen > INT_MAX) {
   14202         Py_DECREF(result);
   14203         PyErr_SetString(PyExc_ValueError,
   14204                         "string too large in _PyUnicode_FormatLong");
   14205         return NULL;
   14206     }
   14207     len = (int)llen;
   14208     sign = buf[0] == '-';
   14209     numnondigits += sign;
   14210     numdigits = len - numnondigits;
   14211     assert(numdigits > 0);
   14212 
   14213     /* Get rid of base marker unless F_ALT */
   14214     if (((alt) == 0 &&
   14215         (type == 'o' || type == 'x' || type == 'X'))) {
   14216         assert(buf[sign] == '0');
   14217         assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
   14218                buf[sign+1] == 'o');
   14219         numnondigits -= 2;
   14220         buf += 2;
   14221         len -= 2;
   14222         if (sign)
   14223             buf[0] = '-';
   14224         assert(len == numnondigits + numdigits);
   14225         assert(numdigits > 0);
   14226     }
   14227 
   14228     /* Fill with leading zeroes to meet minimum width. */
   14229     if (prec > numdigits) {
   14230         PyObject *r1 = PyBytes_FromStringAndSize(NULL,
   14231                                 numnondigits + prec);
   14232         char *b1;
   14233         if (!r1) {
   14234             Py_DECREF(result);
   14235             return NULL;
   14236         }
   14237         b1 = PyBytes_AS_STRING(r1);
   14238         for (i = 0; i < numnondigits; ++i)
   14239             *b1++ = *buf++;
   14240         for (i = 0; i < prec - numdigits; i++)
   14241             *b1++ = '0';
   14242         for (i = 0; i < numdigits; i++)
   14243             *b1++ = *buf++;
   14244         *b1 = '\0';
   14245         Py_DECREF(result);
   14246         result = r1;
   14247         buf = PyBytes_AS_STRING(result);
   14248         len = numnondigits + prec;
   14249     }
   14250 
   14251     /* Fix up case for hex conversions. */
   14252     if (type == 'X') {
   14253         /* Need to convert all lower case letters to upper case.
   14254            and need to convert 0x to 0X (and -0x to -0X). */
   14255         for (i = 0; i < len; i++)
   14256             if (buf[i] >= 'a' && buf[i] <= 'x')
   14257                 buf[i] -= 'a'-'A';
   14258     }
   14259     if (!PyUnicode_Check(result)
   14260         || buf != PyUnicode_DATA(result)) {
   14261         PyObject *unicode;
   14262         unicode = _PyUnicode_FromASCII(buf, len);
   14263         Py_DECREF(result);
   14264         result = unicode;
   14265     }
   14266     else if (len != PyUnicode_GET_LENGTH(result)) {
   14267         if (PyUnicode_Resize(&result, len) < 0)
   14268             Py_CLEAR(result);
   14269     }
   14270     return result;
   14271 }
   14272 
   14273 /* Format an integer or a float as an integer.
   14274  * Return 1 if the number has been formatted into the writer,
   14275  *        0 if the number has been formatted into *p_output
   14276  *       -1 and raise an exception on error */
   14277 static int
   14278 mainformatlong(PyObject *v,
   14279                struct unicode_format_arg_t *arg,
   14280                PyObject **p_output,
   14281                _PyUnicodeWriter *writer)
   14282 {
   14283     PyObject *iobj, *res;
   14284     char type = (char)arg->ch;
   14285 
   14286     if (!PyNumber_Check(v))
   14287         goto wrongtype;
   14288 
   14289     /* make sure number is a type of integer for o, x, and X */
   14290     if (!PyLong_Check(v)) {
   14291         if (type == 'o' || type == 'x' || type == 'X') {
   14292             iobj = PyNumber_Index(v);
   14293             if (iobj == NULL) {
   14294                 if (PyErr_ExceptionMatches(PyExc_TypeError))
   14295                     goto wrongtype;
   14296                 return -1;
   14297             }
   14298         }
   14299         else {
   14300             iobj = PyNumber_Long(v);
   14301             if (iobj == NULL ) {
   14302                 if (PyErr_ExceptionMatches(PyExc_TypeError))
   14303                     goto wrongtype;
   14304                 return -1;
   14305             }
   14306         }
   14307         assert(PyLong_Check(iobj));
   14308     }
   14309     else {
   14310         iobj = v;
   14311         Py_INCREF(iobj);
   14312     }
   14313 
   14314     if (PyLong_CheckExact(v)
   14315         && arg->width == -1 && arg->prec == -1
   14316         && !(arg->flags & (F_SIGN | F_BLANK))
   14317         && type != 'X')
   14318     {
   14319         /* Fast path */
   14320         int alternate = arg->flags & F_ALT;
   14321         int base;
   14322 
   14323         switch(type)
   14324         {
   14325             default:
   14326                 Py_UNREACHABLE();
   14327             case 'd':
   14328             case 'i':
   14329             case 'u':
   14330                 base = 10;
   14331                 break;
   14332             case 'o':
   14333                 base = 8;
   14334                 break;
   14335             case 'x':
   14336             case 'X':
   14337                 base = 16;
   14338                 break;
   14339         }
   14340 
   14341         if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
   14342             Py_DECREF(iobj);
   14343             return -1;
   14344         }
   14345         Py_DECREF(iobj);
   14346         return 1;
   14347     }
   14348 
   14349     res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
   14350     Py_DECREF(iobj);
   14351     if (res == NULL)
   14352         return -1;
   14353     *p_output = res;
   14354     return 0;
   14355 
   14356 wrongtype:
   14357     switch(type)
   14358     {
   14359         case 'o':
   14360         case 'x':
   14361         case 'X':
   14362             PyErr_Format(PyExc_TypeError,
   14363                     "%%%c format: an integer is required, "
   14364                     "not %.200s",
   14365                     type, Py_TYPE(v)->tp_name);
   14366             break;
   14367         default:
   14368             PyErr_Format(PyExc_TypeError,
   14369                     "%%%c format: a number is required, "
   14370                     "not %.200s",
   14371                     type, Py_TYPE(v)->tp_name);
   14372             break;
   14373     }
   14374     return -1;
   14375 }
   14376 
   14377 static Py_UCS4
   14378 formatchar(PyObject *v)
   14379 {
   14380     /* presume that the buffer is at least 3 characters long */
   14381     if (PyUnicode_Check(v)) {
   14382         if (PyUnicode_GET_LENGTH(v) == 1) {
   14383             return PyUnicode_READ_CHAR(v, 0);
   14384         }
   14385         goto onError;
   14386     }
   14387     else {
   14388         PyObject *iobj;
   14389         long x;
   14390         /* make sure number is a type of integer */
   14391         if (!PyLong_Check(v)) {
   14392             iobj = PyNumber_Index(v);
   14393             if (iobj == NULL) {
   14394                 goto onError;
   14395             }
   14396             x = PyLong_AsLong(iobj);
   14397             Py_DECREF(iobj);
   14398         }
   14399         else {
   14400             x = PyLong_AsLong(v);
   14401         }
   14402         if (x == -1 && PyErr_Occurred())
   14403             goto onError;
   14404 
   14405         if (x < 0 || x > MAX_UNICODE) {
   14406             PyErr_SetString(PyExc_OverflowError,
   14407                             "%c arg not in range(0x110000)");
   14408             return (Py_UCS4) -1;
   14409         }
   14410 
   14411         return (Py_UCS4) x;
   14412     }
   14413 
   14414   onError:
   14415     PyErr_SetString(PyExc_TypeError,
   14416                     "%c requires int or char");
   14417     return (Py_UCS4) -1;
   14418 }
   14419 
   14420 /* Parse options of an argument: flags, width, precision.
   14421    Handle also "%(name)" syntax.
   14422 
   14423    Return 0 if the argument has been formatted into arg->str.
   14424    Return 1 if the argument has been written into ctx->writer,
   14425    Raise an exception and return -1 on error. */
   14426 static int
   14427 unicode_format_arg_parse(struct unicode_formatter_t *ctx,
   14428                          struct unicode_format_arg_t *arg)
   14429 {
   14430 #define FORMAT_READ(ctx) \
   14431         PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
   14432 
   14433     PyObject *v;
   14434 
   14435     if (arg->ch == '(') {
   14436         /* Get argument value from a dictionary. Example: "%(name)s". */
   14437         Py_ssize_t keystart;
   14438         Py_ssize_t keylen;
   14439         PyObject *key;
   14440         int pcount = 1;
   14441 
   14442         if (ctx->dict == NULL) {
   14443             PyErr_SetString(PyExc_TypeError,
   14444                             "format requires a mapping");
   14445             return -1;
   14446         }
   14447         ++ctx->fmtpos;
   14448         --ctx->fmtcnt;
   14449         keystart = ctx->fmtpos;
   14450         /* Skip over balanced parentheses */
   14451         while (pcount > 0 && --ctx->fmtcnt >= 0) {
   14452             arg->ch = FORMAT_READ(ctx);
   14453             if (arg->ch == ')')
   14454                 --pcount;
   14455             else if (arg->ch == '(')
   14456                 ++pcount;
   14457             ctx->fmtpos++;
   14458         }
   14459         keylen = ctx->fmtpos - keystart - 1;
   14460         if (ctx->fmtcnt < 0 || pcount > 0) {
   14461             PyErr_SetString(PyExc_ValueError,
   14462                             "incomplete format key");
   14463             return -1;
   14464         }
   14465         key = PyUnicode_Substring(ctx->fmtstr,
   14466                                   keystart, keystart + keylen);
   14467         if (key == NULL)
   14468             return -1;
   14469         if (ctx->args_owned) {
   14470             ctx->args_owned = 0;
   14471             Py_DECREF(ctx->args);
   14472         }
   14473         ctx->args = PyObject_GetItem(ctx->dict, key);
   14474         Py_DECREF(key);
   14475         if (ctx->args == NULL)
   14476             return -1;
   14477         ctx->args_owned = 1;
   14478         ctx->arglen = -1;
   14479         ctx->argidx = -2;
   14480     }
   14481 
   14482     /* Parse flags. Example: "%+i" => flags=F_SIGN. */
   14483     while (--ctx->fmtcnt >= 0) {
   14484         arg->ch = FORMAT_READ(ctx);
   14485         ctx->fmtpos++;
   14486         switch (arg->ch) {
   14487         case '-': arg->flags |= F_LJUST; continue;
   14488         case '+': arg->flags |= F_SIGN; continue;
   14489         case ' ': arg->flags |= F_BLANK; continue;
   14490         case '#': arg->flags |= F_ALT; continue;
   14491         case '0': arg->flags |= F_ZERO; continue;
   14492         }
   14493         break;
   14494     }
   14495 
   14496     /* Parse width. Example: "%10s" => width=10 */
   14497     if (arg->ch == '*') {
   14498         v = unicode_format_getnextarg(ctx);
   14499         if (v == NULL)
   14500             return -1;
   14501         if (!PyLong_Check(v)) {
   14502             PyErr_SetString(PyExc_TypeError,
   14503                             "* wants int");
   14504             return -1;
   14505         }
   14506         arg->width = PyLong_AsSsize_t(v);
   14507         if (arg->width == -1 && PyErr_Occurred())
   14508             return -1;
   14509         if (arg->width < 0) {
   14510             arg->flags |= F_LJUST;
   14511             arg->width = -arg->width;
   14512         }
   14513         if (--ctx->fmtcnt >= 0) {
   14514             arg->ch = FORMAT_READ(ctx);
   14515             ctx->fmtpos++;
   14516         }
   14517     }
   14518     else if (arg->ch >= '0' && arg->ch <= '9') {
   14519         arg->width = arg->ch - '0';
   14520         while (--ctx->fmtcnt >= 0) {
   14521             arg->ch = FORMAT_READ(ctx);
   14522             ctx->fmtpos++;
   14523             if (arg->ch < '0' || arg->ch > '9')
   14524                 break;
   14525             /* Since arg->ch is unsigned, the RHS would end up as unsigned,
   14526                mixing signed and unsigned comparison. Since arg->ch is between
   14527                '0' and '9', casting to int is safe. */
   14528             if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
   14529                 PyErr_SetString(PyExc_ValueError,
   14530                                 "width too big");
   14531                 return -1;
   14532             }
   14533             arg->width = arg->width*10 + (arg->ch - '0');
   14534         }
   14535     }
   14536 
   14537     /* Parse precision. Example: "%.3f" => prec=3 */
   14538     if (arg->ch == '.') {
   14539         arg->prec = 0;
   14540         if (--ctx->fmtcnt >= 0) {
   14541             arg->ch = FORMAT_READ(ctx);
   14542             ctx->fmtpos++;
   14543         }
   14544         if (arg->ch == '*') {
   14545             v = unicode_format_getnextarg(ctx);
   14546             if (v == NULL)
   14547                 return -1;
   14548             if (!PyLong_Check(v)) {
   14549                 PyErr_SetString(PyExc_TypeError,
   14550                                 "* wants int");
   14551                 return -1;
   14552             }
   14553             arg->prec = _PyLong_AsInt(v);
   14554             if (arg->prec == -1 && PyErr_Occurred())
   14555                 return -1;
   14556             if (arg->prec < 0)
   14557                 arg->prec = 0;
   14558             if (--ctx->fmtcnt >= 0) {
   14559                 arg->ch = FORMAT_READ(ctx);
   14560                 ctx->fmtpos++;
   14561             }
   14562         }
   14563         else if (arg->ch >= '0' && arg->ch <= '9') {
   14564             arg->prec = arg->ch - '0';
   14565             while (--ctx->fmtcnt >= 0) {
   14566                 arg->ch = FORMAT_READ(ctx);
   14567                 ctx->fmtpos++;
   14568                 if (arg->ch < '0' || arg->ch > '9')
   14569                     break;
   14570                 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
   14571                     PyErr_SetString(PyExc_ValueError,
   14572                                     "precision too big");
   14573                     return -1;
   14574                 }
   14575                 arg->prec = arg->prec*10 + (arg->ch - '0');
   14576             }
   14577         }
   14578     }
   14579 
   14580     /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
   14581     if (ctx->fmtcnt >= 0) {
   14582         if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
   14583             if (--ctx->fmtcnt >= 0) {
   14584                 arg->ch = FORMAT_READ(ctx);
   14585                 ctx->fmtpos++;
   14586             }
   14587         }
   14588     }
   14589     if (ctx->fmtcnt < 0) {
   14590         PyErr_SetString(PyExc_ValueError,
   14591                         "incomplete format");
   14592         return -1;
   14593     }
   14594     return 0;
   14595 
   14596 #undef FORMAT_READ
   14597 }
   14598 
   14599 /* Format one argument. Supported conversion specifiers:
   14600 
   14601    - "s", "r", "a": any type
   14602    - "i", "d", "u": int or float
   14603    - "o", "x", "X": int
   14604    - "e", "E", "f", "F", "g", "G": float
   14605    - "c": int or str (1 character)
   14606 
   14607    When possible, the output is written directly into the Unicode writer
   14608    (ctx->writer). A string is created when padding is required.
   14609 
   14610    Return 0 if the argument has been formatted into *p_str,
   14611           1 if the argument has been written into ctx->writer,
   14612          -1 on error. */
   14613 static int
   14614 unicode_format_arg_format(struct unicode_formatter_t *ctx,
   14615                           struct unicode_format_arg_t *arg,
   14616                           PyObject **p_str)
   14617 {
   14618     PyObject *v;
   14619     _PyUnicodeWriter *writer = &ctx->writer;
   14620 
   14621     if (ctx->fmtcnt == 0)
   14622         ctx->writer.overallocate = 0;
   14623 
   14624     v = unicode_format_getnextarg(ctx);
   14625     if (v == NULL)
   14626         return -1;
   14627 
   14628 
   14629     switch (arg->ch) {
   14630     case 's':
   14631     case 'r':
   14632     case 'a':
   14633         if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
   14634             /* Fast path */
   14635             if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
   14636                 return -1;
   14637             return 1;
   14638         }
   14639 
   14640         if (PyUnicode_CheckExact(v) && arg->ch == 's') {
   14641             *p_str = v;
   14642             Py_INCREF(*p_str);
   14643         }
   14644         else {
   14645             if (arg->ch == 's')
   14646                 *p_str = PyObject_Str(v);
   14647             else if (arg->ch == 'r')
   14648                 *p_str = PyObject_Repr(v);
   14649             else
   14650                 *p_str = PyObject_ASCII(v);
   14651         }
   14652         break;
   14653 
   14654     case 'i':
   14655     case 'd':
   14656     case 'u':
   14657     case 'o':
   14658     case 'x':
   14659     case 'X':
   14660     {
   14661         int ret = mainformatlong(v, arg, p_str, writer);
   14662         if (ret != 0)
   14663             return ret;
   14664         arg->sign = 1;
   14665         break;
   14666     }
   14667 
   14668     case 'e':
   14669     case 'E':
   14670     case 'f':
   14671     case 'F':
   14672     case 'g':
   14673     case 'G':
   14674         if (arg->width == -1 && arg->prec == -1
   14675             && !(arg->flags & (F_SIGN | F_BLANK)))
   14676         {
   14677             /* Fast path */
   14678             if (formatfloat(v, arg, NULL, writer) == -1)
   14679                 return -1;
   14680             return 1;
   14681         }
   14682 
   14683         arg->sign = 1;
   14684         if (formatfloat(v, arg, p_str, NULL) == -1)
   14685             return -1;
   14686         break;
   14687 
   14688     case 'c':
   14689     {
   14690         Py_UCS4 ch = formatchar(v);
   14691         if (ch == (Py_UCS4) -1)
   14692             return -1;
   14693         if (arg->width == -1 && arg->prec == -1) {
   14694             /* Fast path */
   14695             if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
   14696                 return -1;
   14697             return 1;
   14698         }
   14699         *p_str = PyUnicode_FromOrdinal(ch);
   14700         break;
   14701     }
   14702 
   14703     default:
   14704         PyErr_Format(PyExc_ValueError,
   14705                      "unsupported format character '%c' (0x%x) "
   14706                      "at index %zd",
   14707                      (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
   14708                      (int)arg->ch,
   14709                      ctx->fmtpos - 1);
   14710         return -1;
   14711     }
   14712     if (*p_str == NULL)
   14713         return -1;
   14714     assert (PyUnicode_Check(*p_str));
   14715     return 0;
   14716 }
   14717 
   14718 static int
   14719 unicode_format_arg_output(struct unicode_formatter_t *ctx,
   14720                           struct unicode_format_arg_t *arg,
   14721                           PyObject *str)
   14722 {
   14723     Py_ssize_t len;
   14724     enum PyUnicode_Kind kind;
   14725     void *pbuf;
   14726     Py_ssize_t pindex;
   14727     Py_UCS4 signchar;
   14728     Py_ssize_t buflen;
   14729     Py_UCS4 maxchar;
   14730     Py_ssize_t sublen;
   14731     _PyUnicodeWriter *writer = &ctx->writer;
   14732     Py_UCS4 fill;
   14733 
   14734     fill = ' ';
   14735     if (arg->sign && arg->flags & F_ZERO)
   14736         fill = '0';
   14737 
   14738     if (PyUnicode_READY(str) == -1)
   14739         return -1;
   14740 
   14741     len = PyUnicode_GET_LENGTH(str);
   14742     if ((arg->width == -1 || arg->width <= len)
   14743         && (arg->prec == -1 || arg->prec >= len)
   14744         && !(arg->flags & (F_SIGN | F_BLANK)))
   14745     {
   14746         /* Fast path */
   14747         if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
   14748             return -1;
   14749         return 0;
   14750     }
   14751 
   14752     /* Truncate the string for "s", "r" and "a" formats
   14753        if the precision is set */
   14754     if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
   14755         if (arg->prec >= 0 && len > arg->prec)
   14756             len = arg->prec;
   14757     }
   14758 
   14759     /* Adjust sign and width */
   14760     kind = PyUnicode_KIND(str);
   14761     pbuf = PyUnicode_DATA(str);
   14762     pindex = 0;
   14763     signchar = '\0';
   14764     if (arg->sign) {
   14765         Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
   14766         if (ch == '-' || ch == '+') {
   14767             signchar = ch;
   14768             len--;
   14769             pindex++;
   14770         }
   14771         else if (arg->flags & F_SIGN)
   14772             signchar = '+';
   14773         else if (arg->flags & F_BLANK)
   14774             signchar = ' ';
   14775         else
   14776             arg->sign = 0;
   14777     }
   14778     if (arg->width < len)
   14779         arg->width = len;
   14780 
   14781     /* Prepare the writer */
   14782     maxchar = writer->maxchar;
   14783     if (!(arg->flags & F_LJUST)) {
   14784         if (arg->sign) {
   14785             if ((arg->width-1) > len)
   14786                 maxchar = Py_MAX(maxchar, fill);
   14787         }
   14788         else {
   14789             if (arg->width > len)
   14790                 maxchar = Py_MAX(maxchar, fill);
   14791         }
   14792     }
   14793     if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
   14794         Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
   14795         maxchar = Py_MAX(maxchar, strmaxchar);
   14796     }
   14797 
   14798     buflen = arg->width;
   14799     if (arg->sign && len == arg->width)
   14800         buflen++;
   14801     if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
   14802         return -1;
   14803 
   14804     /* Write the sign if needed */
   14805     if (arg->sign) {
   14806         if (fill != ' ') {
   14807             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
   14808             writer->pos += 1;
   14809         }
   14810         if (arg->width > len)
   14811             arg->width--;
   14812     }
   14813 
   14814     /* Write the numeric prefix for "x", "X" and "o" formats
   14815        if the alternate form is used.
   14816        For example, write "0x" for the "%#x" format. */
   14817     if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
   14818         assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
   14819         assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
   14820         if (fill != ' ') {
   14821             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
   14822             PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
   14823             writer->pos += 2;
   14824             pindex += 2;
   14825         }
   14826         arg->width -= 2;
   14827         if (arg->width < 0)
   14828             arg->width = 0;
   14829         len -= 2;
   14830     }
   14831 
   14832     /* Pad left with the fill character if needed */
   14833     if (arg->width > len && !(arg->flags & F_LJUST)) {
   14834         sublen = arg->width - len;
   14835         FILL(writer->kind, writer->data, fill, writer->pos, sublen);
   14836         writer->pos += sublen;
   14837         arg->width = len;
   14838     }
   14839 
   14840     /* If padding with spaces: write sign if needed and/or numeric prefix if
   14841        the alternate form is used */
   14842     if (fill == ' ') {
   14843         if (arg->sign) {
   14844             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
   14845             writer->pos += 1;
   14846         }
   14847         if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
   14848             assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
   14849             assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
   14850             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
   14851             PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
   14852             writer->pos += 2;
   14853             pindex += 2;
   14854         }
   14855     }
   14856 
   14857     /* Write characters */
   14858     if (len) {
   14859         _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
   14860                                       str, pindex, len);
   14861         writer->pos += len;
   14862     }
   14863 
   14864     /* Pad right with the fill character if needed */
   14865     if (arg->width > len) {
   14866         sublen = arg->width - len;
   14867         FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
   14868         writer->pos += sublen;
   14869     }
   14870     return 0;
   14871 }
   14872 
   14873 /* Helper of PyUnicode_Format(): format one arg.
   14874    Return 0 on success, raise an exception and return -1 on error. */
   14875 static int
   14876 unicode_format_arg(struct unicode_formatter_t *ctx)
   14877 {
   14878     struct unicode_format_arg_t arg;
   14879     PyObject *str;
   14880     int ret;
   14881 
   14882     arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
   14883     if (arg.ch == '%') {
   14884         ctx->fmtpos++;
   14885         ctx->fmtcnt--;
   14886         if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
   14887             return -1;
   14888         return 0;
   14889     }
   14890     arg.flags = 0;
   14891     arg.width = -1;
   14892     arg.prec = -1;
   14893     arg.sign = 0;
   14894     str = NULL;
   14895 
   14896     ret = unicode_format_arg_parse(ctx, &arg);
   14897     if (ret == -1)
   14898         return -1;
   14899 
   14900     ret = unicode_format_arg_format(ctx, &arg, &str);
   14901     if (ret == -1)
   14902         return -1;
   14903 
   14904     if (ret != 1) {
   14905         ret = unicode_format_arg_output(ctx, &arg, str);
   14906         Py_DECREF(str);
   14907         if (ret == -1)
   14908             return -1;
   14909     }
   14910 
   14911     if (ctx->dict && (ctx->argidx < ctx->arglen)) {
   14912         PyErr_SetString(PyExc_TypeError,
   14913                         "not all arguments converted during string formatting");
   14914         return -1;
   14915     }
   14916     return 0;
   14917 }
   14918 
   14919 PyObject *
   14920 PyUnicode_Format(PyObject *format, PyObject *args)
   14921 {
   14922     struct unicode_formatter_t ctx;
   14923 
   14924     if (format == NULL || args == NULL) {
   14925         PyErr_BadInternalCall();
   14926         return NULL;
   14927     }
   14928 
   14929     if (ensure_unicode(format) < 0)
   14930         return NULL;
   14931 
   14932     ctx.fmtstr = format;
   14933     ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
   14934     ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
   14935     ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
   14936     ctx.fmtpos = 0;
   14937 
   14938     _PyUnicodeWriter_Init(&ctx.writer);
   14939     ctx.writer.min_length = ctx.fmtcnt + 100;
   14940     ctx.writer.overallocate = 1;
   14941 
   14942     if (PyTuple_Check(args)) {
   14943         ctx.arglen = PyTuple_Size(args);
   14944         ctx.argidx = 0;
   14945     }
   14946     else {
   14947         ctx.arglen = -1;
   14948         ctx.argidx = -2;
   14949     }
   14950     ctx.args_owned = 0;
   14951     if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
   14952         ctx.dict = args;
   14953     else
   14954         ctx.dict = NULL;
   14955     ctx.args = args;
   14956 
   14957     while (--ctx.fmtcnt >= 0) {
   14958         if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
   14959             Py_ssize_t nonfmtpos;
   14960 
   14961             nonfmtpos = ctx.fmtpos++;
   14962             while (ctx.fmtcnt >= 0 &&
   14963                    PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
   14964                 ctx.fmtpos++;
   14965                 ctx.fmtcnt--;
   14966             }
   14967             if (ctx.fmtcnt < 0) {
   14968                 ctx.fmtpos--;
   14969                 ctx.writer.overallocate = 0;
   14970             }
   14971 
   14972             if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
   14973                                                 nonfmtpos, ctx.fmtpos) < 0)
   14974                 goto onError;
   14975         }
   14976         else {
   14977             ctx.fmtpos++;
   14978             if (unicode_format_arg(&ctx) == -1)
   14979                 goto onError;
   14980         }
   14981     }
   14982 
   14983     if (ctx.argidx < ctx.arglen && !ctx.dict) {
   14984         PyErr_SetString(PyExc_TypeError,
   14985                         "not all arguments converted during string formatting");
   14986         goto onError;
   14987     }
   14988 
   14989     if (ctx.args_owned) {
   14990         Py_DECREF(ctx.args);
   14991     }
   14992     return _PyUnicodeWriter_Finish(&ctx.writer);
   14993 
   14994   onError:
   14995     _PyUnicodeWriter_Dealloc(&ctx.writer);
   14996     if (ctx.args_owned) {
   14997         Py_DECREF(ctx.args);
   14998     }
   14999     return NULL;
   15000 }
   15001 
   15002 static PyObject *
   15003 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
   15004 
   15005 static PyObject *
   15006 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
   15007 {
   15008     PyObject *x = NULL;
   15009     static char *kwlist[] = {"object", "encoding", "errors", 0};
   15010     char *encoding = NULL;
   15011     char *errors = NULL;
   15012 
   15013     if (type != &PyUnicode_Type)
   15014         return unicode_subtype_new(type, args, kwds);
   15015     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
   15016                                      kwlist, &x, &encoding, &errors))
   15017         return NULL;
   15018     if (x == NULL)
   15019         _Py_RETURN_UNICODE_EMPTY();
   15020     if (encoding == NULL && errors == NULL)
   15021         return PyObject_Str(x);
   15022     else
   15023         return PyUnicode_FromEncodedObject(x, encoding, errors);
   15024 }
   15025 
   15026 static PyObject *
   15027 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
   15028 {
   15029     PyObject *unicode, *self;
   15030     Py_ssize_t length, char_size;
   15031     int share_wstr, share_utf8;
   15032     unsigned int kind;
   15033     void *data;
   15034 
   15035     assert(PyType_IsSubtype(type, &PyUnicode_Type));
   15036 
   15037     unicode = unicode_new(&PyUnicode_Type, args, kwds);
   15038     if (unicode == NULL)
   15039         return NULL;
   15040     assert(_PyUnicode_CHECK(unicode));
   15041     if (PyUnicode_READY(unicode) == -1) {
   15042         Py_DECREF(unicode);
   15043         return NULL;
   15044     }
   15045 
   15046     self = type->tp_alloc(type, 0);
   15047     if (self == NULL) {
   15048         Py_DECREF(unicode);
   15049         return NULL;
   15050     }
   15051     kind = PyUnicode_KIND(unicode);
   15052     length = PyUnicode_GET_LENGTH(unicode);
   15053 
   15054     _PyUnicode_LENGTH(self) = length;
   15055 #ifdef Py_DEBUG
   15056     _PyUnicode_HASH(self) = -1;
   15057 #else
   15058     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
   15059 #endif
   15060     _PyUnicode_STATE(self).interned = 0;
   15061     _PyUnicode_STATE(self).kind = kind;
   15062     _PyUnicode_STATE(self).compact = 0;
   15063     _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
   15064     _PyUnicode_STATE(self).ready = 1;
   15065     _PyUnicode_WSTR(self) = NULL;
   15066     _PyUnicode_UTF8_LENGTH(self) = 0;
   15067     _PyUnicode_UTF8(self) = NULL;
   15068     _PyUnicode_WSTR_LENGTH(self) = 0;
   15069     _PyUnicode_DATA_ANY(self) = NULL;
   15070 
   15071     share_utf8 = 0;
   15072     share_wstr = 0;
   15073     if (kind == PyUnicode_1BYTE_KIND) {
   15074         char_size = 1;
   15075         if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
   15076             share_utf8 = 1;
   15077     }
   15078     else if (kind == PyUnicode_2BYTE_KIND) {
   15079         char_size = 2;
   15080         if (sizeof(wchar_t) == 2)
   15081             share_wstr = 1;
   15082     }
   15083     else {
   15084         assert(kind == PyUnicode_4BYTE_KIND);
   15085         char_size = 4;
   15086         if (sizeof(wchar_t) == 4)
   15087             share_wstr = 1;
   15088     }
   15089 
   15090     /* Ensure we won't overflow the length. */
   15091     if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
   15092         PyErr_NoMemory();
   15093         goto onError;
   15094     }
   15095     data = PyObject_MALLOC((length + 1) * char_size);
   15096     if (data == NULL) {
   15097         PyErr_NoMemory();
   15098         goto onError;
   15099     }
   15100 
   15101     _PyUnicode_DATA_ANY(self) = data;
   15102     if (share_utf8) {
   15103         _PyUnicode_UTF8_LENGTH(self) = length;
   15104         _PyUnicode_UTF8(self) = data;
   15105     }
   15106     if (share_wstr) {
   15107         _PyUnicode_WSTR_LENGTH(self) = length;
   15108         _PyUnicode_WSTR(self) = (wchar_t *)data;
   15109     }
   15110 
   15111     memcpy(data, PyUnicode_DATA(unicode),
   15112               kind * (length + 1));
   15113     assert(_PyUnicode_CheckConsistency(self, 1));
   15114 #ifdef Py_DEBUG
   15115     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
   15116 #endif
   15117     Py_DECREF(unicode);
   15118     return self;
   15119 
   15120 onError:
   15121     Py_DECREF(unicode);
   15122     Py_DECREF(self);
   15123     return NULL;
   15124 }
   15125 
/* Docstring for the str type; installed below as the tp_doc slot of
   PyUnicode_Type.  The text is user-visible and must stay verbatim. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
   15137 
/* Forward declaration: unicode_iter() is defined after the iterator type
   further down, but the type object below needs it for tp_iter. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for str.  The initializer is positional, so every entry must
   stay in slot order; the trailing comment on each line names the slot. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                  /* tp_call */
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
   15183 
   15184 /* Initialize the Unicode implementation */
   15185 
   15186 int _PyUnicode_Init(void)
   15187 {
   15188     /* XXX - move this array to unicodectype.c ? */
   15189     Py_UCS2 linebreak[] = {
   15190         0x000A, /* LINE FEED */
   15191         0x000D, /* CARRIAGE RETURN */
   15192         0x001C, /* FILE SEPARATOR */
   15193         0x001D, /* GROUP SEPARATOR */
   15194         0x001E, /* RECORD SEPARATOR */
   15195         0x0085, /* NEXT LINE */
   15196         0x2028, /* LINE SEPARATOR */
   15197         0x2029, /* PARAGRAPH SEPARATOR */
   15198     };
   15199 
   15200     /* Init the implementation */
   15201     _Py_INCREF_UNICODE_EMPTY();
   15202     if (!unicode_empty)
   15203         Py_FatalError("Can't create empty string");
   15204     Py_DECREF(unicode_empty);
   15205 
   15206     if (PyType_Ready(&PyUnicode_Type) < 0)
   15207         Py_FatalError("Can't initialize 'unicode'");
   15208 
   15209     /* initialize the linebreak bloom filter */
   15210     bloom_linebreak = make_bloom_mask(
   15211         PyUnicode_2BYTE_KIND, linebreak,
   15212         Py_ARRAY_LENGTH(linebreak));
   15213 
   15214     if (PyType_Ready(&EncodingMapType) < 0)
   15215          Py_FatalError("Can't initialize encoding map type");
   15216 
   15217     if (PyType_Ready(&PyFieldNameIter_Type) < 0)
   15218         Py_FatalError("Can't initialize field name iterator type");
   15219 
   15220     if (PyType_Ready(&PyFormatterIter_Type) < 0)
   15221         Py_FatalError("Can't initialize formatter iter type");
   15222 
   15223     return 0;
   15224 }
   15225 
   15226 /* Finalize the Unicode implementation */
   15227 
   15228 int
   15229 PyUnicode_ClearFreeList(void)
   15230 {
   15231     return 0;
   15232 }
   15233 
   15234 void
   15235 _PyUnicode_Fini(void)
   15236 {
   15237     int i;
   15238 
   15239     Py_CLEAR(unicode_empty);
   15240 
   15241     for (i = 0; i < 256; i++)
   15242         Py_CLEAR(unicode_latin1[i]);
   15243     _PyUnicode_ClearStaticStrings();
   15244     (void)PyUnicode_ClearFreeList();
   15245 }
   15246 
/* Intern the string referenced by *p, in place.

   On return *p is either left untouched or replaced by the equal string
   that was already stored in the `interned` dict.  The function has no
   way to report failure: any exception raised along the way is cleared
   and *p is left as it was. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    PyObject *s = *p;
    PyObject *t;
    /* Debug builds treat a NULL or non-string argument as a programming
       error; release builds silently ignore it. */
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    /* Already interned: nothing to do. */
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* Create the interned dict lazily on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* Insert s as both key and value unless an equal key already exists;
       t is whatever value ends up stored for that key. */
    Py_ALLOW_RECURSION
    t = PyDict_SetDefault(interned, s, s);
    Py_END_ALLOW_RECURSION
    if (t == NULL) {
        PyErr_Clear();
        return;
    }
    if (t != s) {
        /* An equal string was interned earlier: hand that one back to the
           caller instead of s. */
        Py_INCREF(t);
        Py_SETREF(*p, t);
        return;
    }
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
   15289 
   15290 void
   15291 PyUnicode_InternImmortal(PyObject **p)
   15292 {
   15293     PyUnicode_InternInPlace(p);
   15294     if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
   15295         _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
   15296         Py_INCREF(*p);
   15297     }
   15298 }
   15299 
   15300 PyObject *
   15301 PyUnicode_InternFromString(const char *cp)
   15302 {
   15303     PyObject *s = PyUnicode_FromString(cp);
   15304     if (s == NULL)
   15305         return NULL;
   15306     PyUnicode_InternInPlace(&s);
   15307     return s;
   15308 }
   15309 
   15310 void
   15311 _Py_ReleaseInternedUnicodeStrings(void)
   15312 {
   15313     PyObject *keys;
   15314     PyObject *s;
   15315     Py_ssize_t i, n;
   15316     Py_ssize_t immortal_size = 0, mortal_size = 0;
   15317 
   15318     if (interned == NULL || !PyDict_Check(interned))
   15319         return;
   15320     keys = PyDict_Keys(interned);
   15321     if (keys == NULL || !PyList_Check(keys)) {
   15322         PyErr_Clear();
   15323         return;
   15324     }
   15325 
   15326     /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
   15327        detector, interned unicode strings are not forcibly deallocated;
   15328        rather, we give them their stolen references back, and then clear
   15329        and DECREF the interned dict. */
   15330 
   15331     n = PyList_GET_SIZE(keys);
   15332     fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
   15333             n);
   15334     for (i = 0; i < n; i++) {
   15335         s = PyList_GET_ITEM(keys, i);
   15336         if (PyUnicode_READY(s) == -1) {
   15337             Py_UNREACHABLE();
   15338         }
   15339         switch (PyUnicode_CHECK_INTERNED(s)) {
   15340         case SSTATE_NOT_INTERNED:
   15341             /* XXX Shouldn't happen */
   15342             break;
   15343         case SSTATE_INTERNED_IMMORTAL:
   15344             Py_REFCNT(s) += 1;
   15345             immortal_size += PyUnicode_GET_LENGTH(s);
   15346             break;
   15347         case SSTATE_INTERNED_MORTAL:
   15348             Py_REFCNT(s) += 2;
   15349             mortal_size += PyUnicode_GET_LENGTH(s);
   15350             break;
   15351         default:
   15352             Py_FatalError("Inconsistent interned string state.");
   15353         }
   15354         _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
   15355     }
   15356     fprintf(stderr, "total size of all interned strings: "
   15357             "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
   15358             "mortal/immortal\n", mortal_size, immortal_size);
   15359     Py_DECREF(keys);
   15360     PyDict_Clear(interned);
   15361     Py_CLEAR(interned);
   15362 }
   15363 
   15364 
   15365 /********************* Unicode Iterator **************************/
   15366 
/* State of a str iterator (type PyUnicodeIter_Type). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* Index of the next character to be returned */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
   15372 
   15373 static void
   15374 unicodeiter_dealloc(unicodeiterobject *it)
   15375 {
   15376     _PyObject_GC_UNTRACK(it);
   15377     Py_XDECREF(it->it_seq);
   15378     PyObject_GC_Del(it);
   15379 }
   15380 
/* tp_traverse for str iterators: the only object reference held is
   it_seq.  Py_VISIT uses the `visit` and `arg` parameters by name. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
   15387 
   15388 static PyObject *
   15389 unicodeiter_next(unicodeiterobject *it)
   15390 {
   15391     PyObject *seq, *item;
   15392 
   15393     assert(it != NULL);
   15394     seq = it->it_seq;
   15395     if (seq == NULL)
   15396         return NULL;
   15397     assert(_PyUnicode_CHECK(seq));
   15398 
   15399     if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
   15400         int kind = PyUnicode_KIND(seq);
   15401         void *data = PyUnicode_DATA(seq);
   15402         Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
   15403         item = PyUnicode_FromOrdinal(chr);
   15404         if (item != NULL)
   15405             ++it->it_index;
   15406         return item;
   15407     }
   15408 
   15409     it->it_seq = NULL;
   15410     Py_DECREF(seq);
   15411     return NULL;
   15412 }
   15413 
   15414 static PyObject *
   15415 unicodeiter_len(unicodeiterobject *it)
   15416 {
   15417     Py_ssize_t len = 0;
   15418     if (it->it_seq)
   15419         len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
   15420     return PyLong_FromSsize_t(len);
   15421 }
   15422 
   15423 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
   15424 
   15425 static PyObject *
   15426 unicodeiter_reduce(unicodeiterobject *it)
   15427 {
   15428     if (it->it_seq != NULL) {
   15429         return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
   15430                              it->it_seq, it->it_index);
   15431     } else {
   15432         PyObject *u = (PyObject *)_PyUnicode_New(0);
   15433         if (u == NULL)
   15434             return NULL;
   15435         return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
   15436     }
   15437 }
   15438 
   15439 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
   15440 
   15441 static PyObject *
   15442 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
   15443 {
   15444     Py_ssize_t index = PyLong_AsSsize_t(state);
   15445     if (index == -1 && PyErr_Occurred())
   15446         return NULL;
   15447     if (it->it_seq != NULL) {
   15448         if (index < 0)
   15449             index = 0;
   15450         else if (index > PyUnicode_GET_LENGTH(it->it_seq))
   15451             index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
   15452         it->it_index = index;
   15453     }
   15454     Py_RETURN_NONE;
   15455 }
   15456 
   15457 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
   15458 
/* Method table for str iterators (tp_methods of PyUnicodeIter_Type):
   length hint plus the pickling hooks. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
   15468 
/* Type object for the iterator returned by unicode_iter().  The
   initializer is positional; the trailing comment names each slot. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
   15501 
   15502 static PyObject *
   15503 unicode_iter(PyObject *seq)
   15504 {
   15505     unicodeiterobject *it;
   15506 
   15507     if (!PyUnicode_Check(seq)) {
   15508         PyErr_BadInternalCall();
   15509         return NULL;
   15510     }
   15511     if (PyUnicode_READY(seq) == -1)
   15512         return NULL;
   15513     it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
   15514     if (it == NULL)
   15515         return NULL;
   15516     it->it_index = 0;
   15517     Py_INCREF(seq);
   15518     it->it_seq = seq;
   15519     _PyObject_GC_TRACK(it);
   15520     return (PyObject *)it;
   15521 }
   15522 
   15523 
   15524 size_t
   15525 Py_UNICODE_strlen(const Py_UNICODE *u)
   15526 {
   15527     return wcslen(u);
   15528 }
   15529 
   15530 Py_UNICODE*
   15531 Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
   15532 {
   15533     Py_UNICODE *u = s1;
   15534     while ((*u++ = *s2++));
   15535     return s1;
   15536 }
   15537 
   15538 Py_UNICODE*
   15539 Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
   15540 {
   15541     Py_UNICODE *u = s1;
   15542     while ((*u++ = *s2++))
   15543         if (n-- == 0)
   15544             break;
   15545     return s1;
   15546 }
   15547 
   15548 Py_UNICODE*
   15549 Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
   15550 {
   15551     Py_UNICODE *u1 = s1;
   15552     u1 += wcslen(u1);
   15553     while ((*u1++ = *s2++));
   15554     return s1;
   15555 }
   15556 
   15557 int
   15558 Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
   15559 {
   15560     while (*s1 && *s2 && *s1 == *s2)
   15561         s1++, s2++;
   15562     if (*s1 && *s2)
   15563         return (*s1 < *s2) ? -1 : +1;
   15564     if (*s1)
   15565         return 1;
   15566     if (*s2)
   15567         return -1;
   15568     return 0;
   15569 }
   15570 
   15571 int
   15572 Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
   15573 {
   15574     Py_UNICODE u1, u2;
   15575     for (; n != 0; n--) {
   15576         u1 = *s1;
   15577         u2 = *s2;
   15578         if (u1 != u2)
   15579             return (u1 < u2) ? -1 : +1;
   15580         if (u1 == '\0')
   15581             return 0;
   15582         s1++;
   15583         s2++;
   15584     }
   15585     return 0;
   15586 }
   15587 
   15588 Py_UNICODE*
   15589 Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
   15590 {
   15591     const Py_UNICODE *p;
   15592     for (p = s; *p; p++)
   15593         if (*p == c)
   15594             return (Py_UNICODE*)p;
   15595     return NULL;
   15596 }
   15597 
   15598 Py_UNICODE*
   15599 Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
   15600 {
   15601     const Py_UNICODE *p;
   15602     p = s + wcslen(s);
   15603     while (p != s) {
   15604         p--;
   15605         if (*p == c)
   15606             return (Py_UNICODE*)p;
   15607     }
   15608     return NULL;
   15609 }
   15610 
   15611 Py_UNICODE*
   15612 PyUnicode_AsUnicodeCopy(PyObject *unicode)
   15613 {
   15614     Py_UNICODE *u, *copy;
   15615     Py_ssize_t len, size;
   15616 
   15617     if (!PyUnicode_Check(unicode)) {
   15618         PyErr_BadArgument();
   15619         return NULL;
   15620     }
   15621     u = PyUnicode_AsUnicodeAndSize(unicode, &len);
   15622     if (u == NULL)
   15623         return NULL;
   15624     /* Ensure we won't overflow the size. */
   15625     if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
   15626         PyErr_NoMemory();
   15627         return NULL;
   15628     }
   15629     size = len + 1; /* copy the null character */
   15630     size *= sizeof(Py_UNICODE);
   15631     copy = PyMem_Malloc(size);
   15632     if (copy == NULL) {
   15633         PyErr_NoMemory();
   15634         return NULL;
   15635     }
   15636     memcpy(copy, u, size);
   15637     return copy;
   15638 }
   15639 
   15640 /* A _string module, to export formatter_parser and formatter_field_name_split
   15641    to the string.Formatter class implemented in Python. */
   15642 
/* Functions exported by the _string module; each takes a single argument
   (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
   15650 
/* Module definition for _string, passed to PyModule_Create() in
   PyInit__string().  The initializer is positional. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,                                  /* per-module state size */
    _string_methods,                    /* method table */
    /* The remaining PyModuleDef members are left unset. */
    NULL,
    NULL,
    NULL,
    NULL
};
   15662 
   15663 PyMODINIT_FUNC
   15664 PyInit__string(void)
   15665 {
   15666     return PyModule_Create(&_string_module);
   15667 }
   15668 
   15669 
   15670 #ifdef __cplusplus
   15671 }
   15672 #endif
   15673