Home | History | Annotate | Download | only in Objects
      1 /*
      2 
      3 Unicode implementation based on original code by Fredrik Lundh,
      4 modified by Marc-Andre Lemburg <mal (at) lemburg.com> according to the
      5 Unicode Integration Proposal (see file Misc/unicode.txt).
      6 
      7 Major speed upgrades to the method implementations at the Reykjavik
      8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
      9 
     10 Copyright (c) Corporation for National Research Initiatives.
     11 
     12 --------------------------------------------------------------------
     13 The original string type implementation is:
     14 
     15   Copyright (c) 1999 by Secret Labs AB
     16   Copyright (c) 1999 by Fredrik Lundh
     17 
     18 By obtaining, using, and/or copying this software and/or its
     19 associated documentation, you agree that you have read, understood,
     20 and will comply with the following terms and conditions:
     21 
     22 Permission to use, copy, modify, and distribute this software and its
     23 associated documentation for any purpose and without fee is hereby
     24 granted, provided that the above copyright notice appears in all
     25 copies, and that both that copyright notice and this permission notice
     26 appear in supporting documentation, and that the name of Secret Labs
     27 AB or the author not be used in advertising or publicity pertaining to
     28 distribution of the software without specific, written prior
     29 permission.
     30 
     31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
     32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
     33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
     34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
     37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     38 --------------------------------------------------------------------
     39 
     40 */
     41 
     42 #define PY_SSIZE_T_CLEAN
     43 #include "Python.h"
     44 
     45 #include "unicodeobject.h"
     46 #include "ucnhash.h"
     47 
     48 #ifdef MS_WINDOWS
     49 #include <windows.h>
     50 #endif
     51 
/* Limit for the Unicode object free list.  unicode_dealloc() only parks
   an exact Unicode object on the free list while fewer than this many
   objects are already on it; otherwise the object is really freed. */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.
   The limit is compared against the object's length, i.e. it is
   counted in characters.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.  Exactly one of
   BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN gets defined,
   depending on whether WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
     82 
     83 /* --- Globals ------------------------------------------------------------
     84 
     85 NOTE: In the interpreter's initialization phase, some globals are currently
     86       initialized dynamically as needed. In the process Unicode objects may
     87       be created before the Unicode type is ready.
     88 
     89 */
     90 
     91 
     92 #ifdef __cplusplus
     93 extern "C" {
     94 #endif
     95 
     96 /* Free list for Unicode objects */
     97 static PyUnicodeObject *free_list = NULL;
     98 static int numfree = 0;
     99 
    100 /* The empty Unicode object is shared to improve performance. */
    101 static PyUnicodeObject *unicode_empty = NULL;
    102 
    103 #define _Py_RETURN_UNICODE_EMPTY()                      \
    104     do {                                                \
    105         if (unicode_empty != NULL)                      \
    106             Py_INCREF(unicode_empty);                   \
    107         else {                                          \
    108             unicode_empty = _PyUnicode_New(0);          \
    109             if (unicode_empty != NULL)                  \
    110                 Py_INCREF(unicode_empty);               \
    111         }                                               \
    112         return (PyObject *)unicode_empty;               \
    113     } while (0)
    114 
    115 /* Single character Unicode strings in the Latin-1 range are being
    116    shared as well. */
    117 static PyUnicodeObject *unicode_latin1[256] = {NULL};
    118 
    119 /* Default encoding to use and assume when NULL is passed as encoding
    120    parameter; it is initialized by _PyUnicode_Init().
    121 
    122    Always use the PyUnicode_SetDefaultEncoding() and
    123    PyUnicode_GetDefaultEncoding() APIs to access this global.
    124 
    125 */
    126 static char unicode_default_encoding[100 + 1] = "ascii";
    127 
    128 /* Fast detection of the most frequent whitespace characters */
    129 const unsigned char _Py_ascii_whitespace[] = {
    130     0, 0, 0, 0, 0, 0, 0, 0,
    131 /*     case 0x0009: * CHARACTER TABULATION */
    132 /*     case 0x000A: * LINE FEED */
    133 /*     case 0x000B: * LINE TABULATION */
    134 /*     case 0x000C: * FORM FEED */
    135 /*     case 0x000D: * CARRIAGE RETURN */
    136     0, 1, 1, 1, 1, 1, 0, 0,
    137     0, 0, 0, 0, 0, 0, 0, 0,
    138 /*     case 0x001C: * FILE SEPARATOR */
    139 /*     case 0x001D: * GROUP SEPARATOR */
    140 /*     case 0x001E: * RECORD SEPARATOR */
    141 /*     case 0x001F: * UNIT SEPARATOR */
    142     0, 0, 0, 0, 1, 1, 1, 1,
    143 /*     case 0x0020: * SPACE */
    144     1, 0, 0, 0, 0, 0, 0, 0,
    145     0, 0, 0, 0, 0, 0, 0, 0,
    146     0, 0, 0, 0, 0, 0, 0, 0,
    147     0, 0, 0, 0, 0, 0, 0, 0,
    148 
    149     0, 0, 0, 0, 0, 0, 0, 0,
    150     0, 0, 0, 0, 0, 0, 0, 0,
    151     0, 0, 0, 0, 0, 0, 0, 0,
    152     0, 0, 0, 0, 0, 0, 0, 0,
    153     0, 0, 0, 0, 0, 0, 0, 0,
    154     0, 0, 0, 0, 0, 0, 0, 0,
    155     0, 0, 0, 0, 0, 0, 0, 0,
    156     0, 0, 0, 0, 0, 0, 0, 0
    157 };
    158 
    159 /* Same for linebreaks */
    160 static unsigned char ascii_linebreak[] = {
    161     0, 0, 0, 0, 0, 0, 0, 0,
    162 /*         0x000A, * LINE FEED */
    163 /*         0x000B, * LINE TABULATION */
    164 /*         0x000C, * FORM FEED */
    165 /*         0x000D, * CARRIAGE RETURN */
    166     0, 0, 1, 1, 1, 1, 0, 0,
    167     0, 0, 0, 0, 0, 0, 0, 0,
    168 /*         0x001C, * FILE SEPARATOR */
    169 /*         0x001D, * GROUP SEPARATOR */
    170 /*         0x001E, * RECORD SEPARATOR */
    171     0, 0, 0, 0, 1, 1, 1, 0,
    172     0, 0, 0, 0, 0, 0, 0, 0,
    173     0, 0, 0, 0, 0, 0, 0, 0,
    174     0, 0, 0, 0, 0, 0, 0, 0,
    175     0, 0, 0, 0, 0, 0, 0, 0,
    176 
    177     0, 0, 0, 0, 0, 0, 0, 0,
    178     0, 0, 0, 0, 0, 0, 0, 0,
    179     0, 0, 0, 0, 0, 0, 0, 0,
    180     0, 0, 0, 0, 0, 0, 0, 0,
    181     0, 0, 0, 0, 0, 0, 0, 0,
    182     0, 0, 0, 0, 0, 0, 0, 0,
    183     0, 0, 0, 0, 0, 0, 0, 0,
    184     0, 0, 0, 0, 0, 0, 0, 0
    185 };
    186 
    187 
    188 Py_UNICODE
    189 PyUnicode_GetMax(void)
    190 {
    191 #ifdef Py_UNICODE_WIDE
    192     return 0x10FFFF;
    193 #else
    194     /* This is actually an illegal character, so it should
    195        not be passed to unichr. */
    196     return 0xFFFF;
    197 #endif
    198 }
    199 
    200 /* --- Bloom Filters ----------------------------------------------------- */
    201 
    202 /* stuff to implement simple "bloom filters" for Unicode characters.
    203    to keep things simple, we use a single bitmask, using the least 5
    204    bits from each unicode characters as the bit index. */
    205 
    206 /* the linebreak mask is set up by Unicode_Init below */
    207 
    208 #if LONG_BIT >= 128
    209 #define BLOOM_WIDTH 128
    210 #elif LONG_BIT >= 64
    211 #define BLOOM_WIDTH 64
    212 #elif LONG_BIT >= 32
    213 #define BLOOM_WIDTH 32
    214 #else
    215 #error "LONG_BIT is smaller than 32"
    216 #endif
    217 
    218 #define BLOOM_MASK unsigned long
    219 
    220 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
    221 
    222 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
    223 #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
    224 
    225 #define BLOOM_LINEBREAK(ch)                                             \
    226     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
    227      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
    228 
    229 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
    230 {
    231     /* calculate simple bloom-style bitmask for a given unicode string */
    232 
    233     BLOOM_MASK mask;
    234     Py_ssize_t i;
    235 
    236     mask = 0;
    237     for (i = 0; i < len; i++)
    238         BLOOM_ADD(mask, ptr[i]);
    239 
    240     return mask;
    241 }
    242 
    243 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
    244 {
    245     Py_ssize_t i;
    246 
    247     for (i = 0; i < setlen; i++)
    248         if (set[i] == chr)
    249             return 1;
    250 
    251     return 0;
    252 }
    253 
    254 #define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    255     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
    256 
    257 /* --- Unicode Object ----------------------------------------------------- */
    258 
    259 static
    260 int unicode_resize(register PyUnicodeObject *unicode,
    261                    Py_ssize_t length)
    262 {
    263     void *oldstr;
    264 
    265     /* Shortcut if there's nothing much to do. */
    266     if (unicode->length == length)
    267         goto reset;
    268 
    269     /* Resizing shared object (unicode_empty or single character
    270        objects) in-place is not allowed. Use PyUnicode_Resize()
    271        instead ! */
    272 
    273     if (unicode == unicode_empty ||
    274         (unicode->length == 1 &&
    275          unicode->str[0] < 256U &&
    276          unicode_latin1[unicode->str[0]] == unicode)) {
    277         PyErr_SetString(PyExc_SystemError,
    278                         "can't resize shared unicode objects");
    279         return -1;
    280     }
    281 
    282     /* We allocate one more byte to make sure the string is Ux0000 terminated.
    283        The overallocation is also used by fastsearch, which assumes that it's
    284        safe to look at str[length] (without making any assumptions about what
    285        it contains). */
    286 
    287     oldstr = unicode->str;
    288     unicode->str = PyObject_REALLOC(unicode->str,
    289                                     sizeof(Py_UNICODE) * (length + 1));
    290     if (!unicode->str) {
    291         unicode->str = (Py_UNICODE *)oldstr;
    292         PyErr_NoMemory();
    293         return -1;
    294     }
    295     unicode->str[length] = 0;
    296     unicode->length = length;
    297 
    298   reset:
    299     /* Reset the object caches */
    300     if (unicode->defenc) {
    301         Py_CLEAR(unicode->defenc);
    302     }
    303     unicode->hash = -1;
    304 
    305     return 0;
    306 }
    307 
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is U+0000 terminated; some code relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
    315 
    316 static
    317 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
    318 {
    319     register PyUnicodeObject *unicode;
    320 
    321     /* Optimization for empty strings */
    322     if (length == 0 && unicode_empty != NULL) {
    323         Py_INCREF(unicode_empty);
    324         return unicode_empty;
    325     }
    326 
    327     /* Ensure we won't overflow the size. */
    328     if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
    329         return (PyUnicodeObject *)PyErr_NoMemory();
    330     }
    331 
    332     /* Unicode freelist & memory allocation */
    333     if (free_list) {
    334         unicode = free_list;
    335         free_list = *(PyUnicodeObject **)unicode;
    336         numfree--;
    337         if (unicode->str) {
    338             /* Keep-Alive optimization: we only upsize the buffer,
    339                never downsize it. */
    340             if ((unicode->length < length) &&
    341                 unicode_resize(unicode, length) < 0) {
    342                 PyObject_DEL(unicode->str);
    343                 unicode->str = NULL;
    344             }
    345         }
    346         else {
    347             size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
    348             unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
    349         }
    350         (void)PyObject_INIT(unicode, &PyUnicode_Type);
    351     }
    352     else {
    353         size_t new_size;
    354         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
    355         if (unicode == NULL)
    356             return NULL;
    357         new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
    358         unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
    359     }
    360 
    361     if (!unicode->str) {
    362         PyErr_NoMemory();
    363         goto onError;
    364     }
    365     /* Initialize the first element to guard against cases where
    366      * the caller fails before initializing str -- unicode_resize()
    367      * reads str[0], and the Keep-Alive optimization can keep memory
    368      * allocated for str alive across a call to unicode_dealloc(unicode).
    369      * We don't want unicode_resize to read uninitialized memory in
    370      * that case.
    371      */
    372     unicode->str[0] = 0;
    373     unicode->str[length] = 0;
    374     unicode->length = length;
    375     unicode->hash = -1;
    376     unicode->defenc = NULL;
    377     return unicode;
    378 
    379   onError:
    380     /* XXX UNREF/NEWREF interface should be more symmetrical */
    381     _Py_DEC_REFTOTAL;
    382     _Py_ForgetReference((PyObject *)unicode);
    383     PyObject_Del(unicode);
    384     return NULL;
    385 }
    386 
    387 static
    388 void unicode_dealloc(register PyUnicodeObject *unicode)
    389 {
    390     if (PyUnicode_CheckExact(unicode) &&
    391         numfree < PyUnicode_MAXFREELIST) {
    392         /* Keep-Alive optimization */
    393         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
    394             PyObject_DEL(unicode->str);
    395             unicode->str = NULL;
    396             unicode->length = 0;
    397         }
    398         if (unicode->defenc) {
    399             Py_CLEAR(unicode->defenc);
    400         }
    401         /* Add to free list */
    402         *(PyUnicodeObject **)unicode = free_list;
    403         free_list = unicode;
    404         numfree++;
    405     }
    406     else {
    407         PyObject_DEL(unicode->str);
    408         Py_XDECREF(unicode->defenc);
    409         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
    410     }
    411 }
    412 
    413 static
    414 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
    415 {
    416     register PyUnicodeObject *v;
    417 
    418     /* Argument checks */
    419     if (unicode == NULL) {
    420         PyErr_BadInternalCall();
    421         return -1;
    422     }
    423     v = *unicode;
    424     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
    425         PyErr_BadInternalCall();
    426         return -1;
    427     }
    428 
    429     /* Resizing unicode_empty and single character objects is not
    430        possible since these are being shared. We simply return a fresh
    431        copy with the same Unicode content. */
    432     if (v->length != length &&
    433         (v == unicode_empty || v->length == 1)) {
    434         PyUnicodeObject *w = _PyUnicode_New(length);
    435         if (w == NULL)
    436             return -1;
    437         Py_UNICODE_COPY(w->str, v->str,
    438                         length < v->length ? length : v->length);
    439         Py_SETREF(*unicode, w);
    440         return 0;
    441     }
    442 
    443     /* Note that we don't have to modify *unicode for unshared Unicode
    444        objects, since we can modify them in-place. */
    445     return unicode_resize(v, length);
    446 }
    447 
    448 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
    449 {
    450     return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
    451 }
    452 
    453 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
    454                                 Py_ssize_t size)
    455 {
    456     PyUnicodeObject *unicode;
    457 
    458     /* If the Unicode data is known at construction time, we can apply
    459        some optimizations which share commonly used objects. */
    460     if (u != NULL) {
    461 
    462         /* Optimization for empty strings */
    463         if (size == 0)
    464             _Py_RETURN_UNICODE_EMPTY();
    465 
    466         /* Single character Unicode objects in the Latin-1 range are
    467            shared when using this constructor */
    468         if (size == 1 && *u < 256) {
    469             unicode = unicode_latin1[*u];
    470             if (!unicode) {
    471                 unicode = _PyUnicode_New(1);
    472                 if (!unicode)
    473                     return NULL;
    474                 unicode->str[0] = *u;
    475                 unicode_latin1[*u] = unicode;
    476             }
    477             Py_INCREF(unicode);
    478             return (PyObject *)unicode;
    479         }
    480     }
    481 
    482     unicode = _PyUnicode_New(size);
    483     if (!unicode)
    484         return NULL;
    485 
    486     /* Copy the Unicode data into the new object */
    487     if (u != NULL)
    488         Py_UNICODE_COPY(unicode->str, u, size);
    489 
    490     return (PyObject *)unicode;
    491 }
    492 
    493 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
    494 {
    495     PyUnicodeObject *unicode;
    496 
    497     if (size < 0) {
    498         PyErr_SetString(PyExc_SystemError,
    499                         "Negative size passed to PyUnicode_FromStringAndSize");
    500         return NULL;
    501     }
    502 
    503     /* If the Unicode data is known at construction time, we can apply
    504        some optimizations which share commonly used objects.
    505        Also, this means the input must be UTF-8, so fall back to the
    506        UTF-8 decoder at the end. */
    507     if (u != NULL) {
    508 
    509         /* Optimization for empty strings */
    510         if (size == 0)
    511             _Py_RETURN_UNICODE_EMPTY();
    512 
    513         /* Single characters are shared when using this constructor.
    514            Restrict to ASCII, since the input must be UTF-8. */
    515         if (size == 1 && Py_CHARMASK(*u) < 128) {
    516             unicode = unicode_latin1[Py_CHARMASK(*u)];
    517             if (!unicode) {
    518                 unicode = _PyUnicode_New(1);
    519                 if (!unicode)
    520                     return NULL;
    521                 unicode->str[0] = Py_CHARMASK(*u);
    522                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
    523             }
    524             Py_INCREF(unicode);
    525             return (PyObject *)unicode;
    526         }
    527 
    528         return PyUnicode_DecodeUTF8(u, size, NULL);
    529     }
    530 
    531     unicode = _PyUnicode_New(size);
    532     if (!unicode)
    533         return NULL;
    534 
    535     return (PyObject *)unicode;
    536 }
    537 
    538 PyObject *PyUnicode_FromString(const char *u)
    539 {
    540     size_t size = strlen(u);
    541     if (size > PY_SSIZE_T_MAX) {
    542         PyErr_SetString(PyExc_OverflowError, "input too long");
    543         return NULL;
    544     }
    545 
    546     return PyUnicode_FromStringAndSize(u, size);
    547 }
    548 
    549 /* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
    550  * by 'ptr', possibly combining surrogate pairs on narrow builds.
    551  * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
    552  * that should be returned and 'end' pointing to the end of the buffer.
    553  * ('end' is used on narrow builds to detect a lone surrogate at the
    554  * end of the buffer that should be returned unchanged.)
    555  * The ptr and end arguments should be side-effect free and ptr must an lvalue.
    556  * The type of the returned char is always Py_UCS4.
    557  *
    558  * Note: the macro advances ptr to next char, so it might have side-effects
    559  *       (especially if used with other macros).
    560  */
    561 
    562 /* helper macros used by _Py_UNICODE_NEXT */
    563 #define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF)
    564 #define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF)
    565 /* Join two surrogate characters and return a single Py_UCS4 value. */
    566 #define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    567     (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
    568       ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
    569 
    570 #ifdef Py_UNICODE_WIDE
    571 #define _Py_UNICODE_NEXT(ptr, end) *(ptr)++
    572 #else
    573 #define _Py_UNICODE_NEXT(ptr, end)                                      \
    574      (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) &&      \
    575         _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                       \
    576        ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
    577        (Py_UCS4)*(ptr)++)
    578 #endif
    579 
    580 #ifdef HAVE_WCHAR_H
    581 
    582 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
    583 # define CONVERT_WCHAR_TO_SURROGATES
    584 #endif
    585 
    586 #ifdef CONVERT_WCHAR_TO_SURROGATES
    587 
    588 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
    589    to convert from UTF32 to UTF16. */
    590 
    591 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
    592                                  Py_ssize_t size)
    593 {
    594     PyUnicodeObject *unicode;
    595     register Py_ssize_t i;
    596     Py_ssize_t alloc;
    597     const wchar_t *orig_w;
    598 
    599     if (w == NULL) {
    600         PyErr_BadInternalCall();
    601         return NULL;
    602     }
    603 
    604     alloc = size;
    605     orig_w = w;
    606     for (i = size; i > 0; i--) {
    607         if (*w > 0xFFFF)
    608             alloc++;
    609         w++;
    610     }
    611     w = orig_w;
    612     unicode = _PyUnicode_New(alloc);
    613     if (!unicode)
    614         return NULL;
    615 
    616     /* Copy the wchar_t data into the new object */
    617     {
    618         register Py_UNICODE *u;
    619         u = PyUnicode_AS_UNICODE(unicode);
    620         for (i = size; i > 0; i--) {
    621             if (*w > 0xFFFF) {
    622                 wchar_t ordinal = *w++;
    623                 ordinal -= 0x10000;
    624                 *u++ = 0xD800 | (ordinal >> 10);
    625                 *u++ = 0xDC00 | (ordinal & 0x3FF);
    626             }
    627             else
    628                 *u++ = *w++;
    629         }
    630     }
    631     return (PyObject *)unicode;
    632 }
    633 
    634 #else
    635 
    636 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
    637                                  Py_ssize_t size)
    638 {
    639     PyUnicodeObject *unicode;
    640 
    641     if (w == NULL) {
    642         PyErr_BadInternalCall();
    643         return NULL;
    644     }
    645 
    646     unicode = _PyUnicode_New(size);
    647     if (!unicode)
    648         return NULL;
    649 
    650     /* Copy the wchar_t data into the new object */
    651 #ifdef HAVE_USABLE_WCHAR_T
    652     memcpy(unicode->str, w, size * sizeof(wchar_t));
    653 #else
    654     {
    655         register Py_UNICODE *u;
    656         register Py_ssize_t i;
    657         u = PyUnicode_AS_UNICODE(unicode);
    658         for (i = size; i > 0; i--)
    659             *u++ = *w++;
    660     }
    661 #endif
    662 
    663     return (PyObject *)unicode;
    664 }
    665 
    666 #endif /* CONVERT_WCHAR_TO_SURROGATES */
    667 
    668 #undef CONVERT_WCHAR_TO_SURROGATES
    669 
    670 static void
    671 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
    672 {
    673     *fmt++ = '%';
    674     if (width) {
    675         if (zeropad)
    676             *fmt++ = '0';
    677         fmt += sprintf(fmt, "%d", width);
    678     }
    679     if (precision)
    680         fmt += sprintf(fmt, ".%d", precision);
    681     if (longflag)
    682         *fmt++ = 'l';
    683     else if (size_tflag) {
    684         char *f = PY_FORMAT_SIZE_T;
    685         while (*f)
    686             *fmt++ = *f++;
    687     }
    688     *fmt++ = c;
    689     *fmt = '\0';
    690 }
    691 
    692 #define appendstring(string) \
    693     do { \
    694         for (copy = string;*copy; copy++) { \
    695             *s++ = (unsigned char)*copy; \
    696         } \
    697     } while (0)
    698 
    699 PyObject *
    700 PyUnicode_FromFormatV(const char *format, va_list vargs)
    701 {
    702     va_list count;
    703     Py_ssize_t callcount = 0;
    704     PyObject **callresults = NULL;
    705     PyObject **callresult = NULL;
    706     Py_ssize_t n = 0;
    707     int width = 0;
    708     int precision = 0;
    709     int zeropad;
    710     const char* f;
    711     Py_UNICODE *s;
    712     PyObject *string;
    713     /* used by sprintf */
    714     char buffer[21];
    715     /* use abuffer instead of buffer, if we need more space
    716      * (which can happen if there's a format specifier with width). */
    717     char *abuffer = NULL;
    718     char *realbuffer;
    719     Py_ssize_t abuffersize = 0;
    720     char fmt[60]; /* should be enough for %0width.precisionld */
    721     const char *copy;
    722 
    723 #ifdef VA_LIST_IS_ARRAY
    724     Py_MEMCPY(count, vargs, sizeof(va_list));
    725 #else
    726 #ifdef  __va_copy
    727     __va_copy(count, vargs);
    728 #else
    729     count = vargs;
    730 #endif
    731 #endif
    732      /* step 1: count the number of %S/%R/%s format specifications
    733       * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
    734       * objects once during step 3 and put the result in an array) */
    735     for (f = format; *f; f++) {
    736          if (*f == '%') {
    737              f++;
    738              while (*f && *f != '%' && !isalpha((unsigned)*f))
    739                  f++;
    740              if (!*f)
    741                  break;
    742              if (*f == 's' || *f=='S' || *f=='R')
    743                  ++callcount;
    744          }
    745     }
    746     /* step 2: allocate memory for the results of
    747      * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
    748     if (callcount) {
    749         callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
    750         if (!callresults) {
    751             PyErr_NoMemory();
    752             return NULL;
    753         }
    754         callresult = callresults;
    755     }
    756     /* step 3: figure out how large a buffer we need */
    757     for (f = format; *f; f++) {
    758         if (*f == '%') {
    759             const char* p = f++;
    760             width = 0;
    761             while (isdigit((unsigned)*f))
    762                 width = (width*10) + *f++ - '0';
    763             precision = 0;
    764             if (*f == '.') {
    765                 f++;
    766                 while (isdigit((unsigned)*f))
    767                     precision = (precision*10) + *f++ - '0';
    768             }
    769 
    770             /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
    771              * they don't affect the amount of space we reserve.
    772              */
    773             if ((*f == 'l' || *f == 'z') &&
    774                 (f[1] == 'd' || f[1] == 'u'))
    775                 ++f;
    776 
    777             switch (*f) {
    778             case 'c':
    779             {
    780                 int ordinal = va_arg(count, int);
    781 #ifdef Py_UNICODE_WIDE
    782                 if (ordinal < 0 || ordinal > 0x10ffff) {
    783                     PyErr_SetString(PyExc_OverflowError,
    784                                     "%c arg not in range(0x110000) "
    785                                     "(wide Python build)");
    786                     goto fail;
    787                 }
    788 #else
    789                 if (ordinal < 0 || ordinal > 0xffff) {
    790                     PyErr_SetString(PyExc_OverflowError,
    791                                     "%c arg not in range(0x10000) "
    792                                     "(narrow Python build)");
    793                     goto fail;
    794                 }
    795 #endif
    796                 /* fall through... */
    797             }
    798             case '%':
    799                 n++;
    800                 break;
    801             case 'd': case 'u': case 'i': case 'x':
    802                 (void) va_arg(count, int);
    803                 if (width < precision)
    804                     width = precision;
    805                 /* 20 bytes is enough to hold a 64-bit
    806                    integer.  Decimal takes the most space.
    807                    This isn't enough for octal.
    808                    If a width is specified we need more
    809                    (which we allocate later). */
    810                 if (width < 20)
    811                     width = 20;
    812                 n += width;
    813                 if (abuffersize < width)
    814                     abuffersize = width;
    815                 break;
    816             case 's':
    817             {
    818                 /* UTF-8 */
    819                 const char *s = va_arg(count, const char*);
    820                 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
    821                 if (!str)
    822                     goto fail;
    823                 n += PyUnicode_GET_SIZE(str);
    824                 /* Remember the str and switch to the next slot */
    825                 *callresult++ = str;
    826                 break;
    827             }
    828             case 'U':
    829             {
    830                 PyObject *obj = va_arg(count, PyObject *);
    831                 assert(obj && PyUnicode_Check(obj));
    832                 n += PyUnicode_GET_SIZE(obj);
    833                 break;
    834             }
    835             case 'V':
    836             {
    837                 PyObject *obj = va_arg(count, PyObject *);
    838                 const char *str = va_arg(count, const char *);
    839                 assert(obj || str);
    840                 assert(!obj || PyUnicode_Check(obj));
    841                 if (obj)
    842                     n += PyUnicode_GET_SIZE(obj);
    843                 else
    844                     n += strlen(str);
    845                 break;
    846             }
    847             case 'S':
    848             {
    849                 PyObject *obj = va_arg(count, PyObject *);
    850                 PyObject *str;
    851                 assert(obj);
    852                 str = PyObject_Str(obj);
    853                 if (!str)
    854                     goto fail;
    855                 n += PyString_GET_SIZE(str);
    856                 /* Remember the str and switch to the next slot */
    857                 *callresult++ = str;
    858                 break;
    859             }
    860             case 'R':
    861             {
    862                 PyObject *obj = va_arg(count, PyObject *);
    863                 PyObject *repr;
    864                 assert(obj);
    865                 repr = PyObject_Repr(obj);
    866                 if (!repr)
    867                     goto fail;
    868                 n += PyUnicode_GET_SIZE(repr);
    869                 /* Remember the repr and switch to the next slot */
    870                 *callresult++ = repr;
    871                 break;
    872             }
    873             case 'p':
    874                 (void) va_arg(count, int);
    875                 /* maximum 64-bit pointer representation:
    876                  * 0xffffffffffffffff
    877                  * so 19 characters is enough.
    878                  * XXX I count 18 -- what's the extra for?
    879                  */
    880                 n += 19;
    881                 break;
    882             default:
    883                 /* if we stumble upon an unknown
    884                    formatting code, copy the rest of
    885                    the format string to the output
    886                    string. (we cannot just skip the
    887                    code, since there's no way to know
    888                    what's in the argument list) */
    889                 n += strlen(p);
    890                 goto expand;
    891             }
    892         } else
    893             n++;
    894     }
    895   expand:
    896     if (abuffersize > 20) {
    897         /* add 1 for sprintf's trailing null byte */
    898         abuffer = PyObject_Malloc(abuffersize + 1);
    899         if (!abuffer) {
    900             PyErr_NoMemory();
    901             goto fail;
    902         }
    903         realbuffer = abuffer;
    904     }
    905     else
    906         realbuffer = buffer;
    907     /* step 4: fill the buffer */
    908     /* Since we've analyzed how much space we need for the worst case,
    909        we don't have to resize the string.
    910        There can be no errors beyond this point. */
    911     string = PyUnicode_FromUnicode(NULL, n);
    912     if (!string)
    913         goto fail;
    914 
    915     s = PyUnicode_AS_UNICODE(string);
    916     callresult = callresults;
    917 
    918     for (f = format; *f; f++) {
    919         if (*f == '%') {
    920             const char* p = f++;
    921             int longflag = 0;
    922             int size_tflag = 0;
    923             zeropad = (*f == '0');
    924             /* parse the width.precision part */
    925             width = 0;
    926             while (isdigit((unsigned)*f))
    927                 width = (width*10) + *f++ - '0';
    928             precision = 0;
    929             if (*f == '.') {
    930                 f++;
    931                 while (isdigit((unsigned)*f))
    932                     precision = (precision*10) + *f++ - '0';
    933             }
    934             /* handle the long flag, but only for %ld and %lu.
    935                others can be added when necessary. */
    936             if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
    937                 longflag = 1;
    938                 ++f;
    939             }
    940             /* handle the size_t flag. */
    941             if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
    942                 size_tflag = 1;
    943                 ++f;
    944             }
    945 
    946             switch (*f) {
    947             case 'c':
    948                 *s++ = va_arg(vargs, int);
    949                 break;
    950             case 'd':
    951                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
    952                 if (longflag)
    953                     sprintf(realbuffer, fmt, va_arg(vargs, long));
    954                 else if (size_tflag)
    955                     sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
    956                 else
    957                     sprintf(realbuffer, fmt, va_arg(vargs, int));
    958                 appendstring(realbuffer);
    959                 break;
    960             case 'u':
    961                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
    962                 if (longflag)
    963                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
    964                 else if (size_tflag)
    965                     sprintf(realbuffer, fmt, va_arg(vargs, size_t));
    966                 else
    967                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
    968                 appendstring(realbuffer);
    969                 break;
    970             case 'i':
    971                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
    972                 sprintf(realbuffer, fmt, va_arg(vargs, int));
    973                 appendstring(realbuffer);
    974                 break;
    975             case 'x':
    976                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
    977                 sprintf(realbuffer, fmt, va_arg(vargs, int));
    978                 appendstring(realbuffer);
    979                 break;
    980             case 's':
    981             {
    982                 /* unused, since we already have the result */
    983                 (void) va_arg(vargs, char *);
    984                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
    985                                 PyUnicode_GET_SIZE(*callresult));
    986                 s += PyUnicode_GET_SIZE(*callresult);
    987                 /* We're done with the unicode()/repr() => forget it */
    988                 Py_DECREF(*callresult);
    989                 /* switch to next unicode()/repr() result */
    990                 ++callresult;
    991                 break;
    992             }
    993             case 'U':
    994             {
    995                 PyObject *obj = va_arg(vargs, PyObject *);
    996                 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
    997                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
    998                 s += size;
    999                 break;
   1000             }
   1001             case 'V':
   1002             {
   1003                 PyObject *obj = va_arg(vargs, PyObject *);
   1004                 const char *str = va_arg(vargs, const char *);
   1005                 if (obj) {
   1006                     Py_ssize_t size = PyUnicode_GET_SIZE(obj);
   1007                     Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
   1008                     s += size;
   1009                 } else {
   1010                     appendstring(str);
   1011                 }
   1012                 break;
   1013             }
   1014             case 'S':
   1015             case 'R':
   1016             {
   1017                 const char *str = PyString_AS_STRING(*callresult);
   1018                 /* unused, since we already have the result */
   1019                 (void) va_arg(vargs, PyObject *);
   1020                 appendstring(str);
   1021                 /* We're done with the unicode()/repr() => forget it */
   1022                 Py_DECREF(*callresult);
   1023                 /* switch to next unicode()/repr() result */
   1024                 ++callresult;
   1025                 break;
   1026             }
   1027             case 'p':
   1028                 sprintf(buffer, "%p", va_arg(vargs, void*));
   1029                 /* %p is ill-defined:  ensure leading 0x. */
   1030                 if (buffer[1] == 'X')
   1031                     buffer[1] = 'x';
   1032                 else if (buffer[1] != 'x') {
   1033                     memmove(buffer+2, buffer, strlen(buffer)+1);
   1034                     buffer[0] = '0';
   1035                     buffer[1] = 'x';
   1036                 }
   1037                 appendstring(buffer);
   1038                 break;
   1039             case '%':
   1040                 *s++ = '%';
   1041                 break;
   1042             default:
   1043                 appendstring(p);
   1044                 goto end;
   1045             }
   1046         } else
   1047             *s++ = *f;
   1048     }
   1049 
   1050   end:
   1051     if (callresults)
   1052         PyObject_Free(callresults);
   1053     if (abuffer)
   1054         PyObject_Free(abuffer);
   1055     PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
   1056     return string;
   1057   fail:
   1058     if (callresults) {
   1059         PyObject **callresult2 = callresults;
   1060         while (callresult2 < callresult) {
   1061             Py_DECREF(*callresult2);
   1062             ++callresult2;
   1063         }
   1064         PyObject_Free(callresults);
   1065     }
   1066     if (abuffer)
   1067         PyObject_Free(abuffer);
   1068     return NULL;
   1069 }
   1070 
   1071 #undef appendstring
   1072 
   1073 PyObject *
   1074 PyUnicode_FromFormat(const char *format, ...)
   1075 {
   1076     PyObject* ret;
   1077     va_list vargs;
   1078 
   1079 #ifdef HAVE_STDARG_PROTOTYPES
   1080     va_start(vargs, format);
   1081 #else
   1082     va_start(vargs);
   1083 #endif
   1084     ret = PyUnicode_FromFormatV(format, vargs);
   1085     va_end(vargs);
   1086     return ret;
   1087 }
   1088 
   1089 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
   1090                                 wchar_t *w,
   1091                                 Py_ssize_t size)
   1092 {
   1093     if (unicode == NULL) {
   1094         PyErr_BadInternalCall();
   1095         return -1;
   1096     }
   1097 
   1098     /* If possible, try to copy the 0-termination as well */
   1099     if (size > PyUnicode_GET_SIZE(unicode))
   1100         size = PyUnicode_GET_SIZE(unicode) + 1;
   1101 
   1102 #ifdef HAVE_USABLE_WCHAR_T
   1103     memcpy(w, unicode->str, size * sizeof(wchar_t));
   1104 #else
   1105     {
   1106         register Py_UNICODE *u;
   1107         register Py_ssize_t i;
   1108         u = PyUnicode_AS_UNICODE(unicode);
   1109         for (i = size; i > 0; i--)
   1110             *w++ = *u++;
   1111     }
   1112 #endif
   1113 
   1114     if (size > PyUnicode_GET_SIZE(unicode))
   1115         return PyUnicode_GET_SIZE(unicode);
   1116     else
   1117         return size;
   1118 }
   1119 
   1120 #endif
   1121 
   1122 PyObject *PyUnicode_FromOrdinal(int ordinal)
   1123 {
   1124     Py_UNICODE s[1];
   1125 
   1126 #ifdef Py_UNICODE_WIDE
   1127     if (ordinal < 0 || ordinal > 0x10ffff) {
   1128         PyErr_SetString(PyExc_ValueError,
   1129                         "unichr() arg not in range(0x110000) "
   1130                         "(wide Python build)");
   1131         return NULL;
   1132     }
   1133 #else
   1134     if (ordinal < 0 || ordinal > 0xffff) {
   1135         PyErr_SetString(PyExc_ValueError,
   1136                         "unichr() arg not in range(0x10000) "
   1137                         "(narrow Python build)");
   1138         return NULL;
   1139     }
   1140 #endif
   1141 
   1142     s[0] = (Py_UNICODE)ordinal;
   1143     return PyUnicode_FromUnicode(s, 1);
   1144 }
   1145 
   1146 PyObject *PyUnicode_FromObject(register PyObject *obj)
   1147 {
   1148     /* XXX Perhaps we should make this API an alias of
   1149        PyObject_Unicode() instead ?! */
   1150     if (PyUnicode_CheckExact(obj)) {
   1151         Py_INCREF(obj);
   1152         return obj;
   1153     }
   1154     if (PyUnicode_Check(obj)) {
   1155         /* For a Unicode subtype that's not a Unicode object,
   1156            return a true Unicode object with the same data. */
   1157         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
   1158                                      PyUnicode_GET_SIZE(obj));
   1159     }
   1160     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
   1161 }
   1162 
   1163 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
   1164                                       const char *encoding,
   1165                                       const char *errors)
   1166 {
   1167     const char *s = NULL;
   1168     Py_ssize_t len;
   1169     PyObject *v;
   1170 
   1171     if (obj == NULL) {
   1172         PyErr_BadInternalCall();
   1173         return NULL;
   1174     }
   1175 
   1176 #if 0
   1177     /* For b/w compatibility we also accept Unicode objects provided
   1178        that no encodings is given and then redirect to
   1179        PyObject_Unicode() which then applies the additional logic for
   1180        Unicode subclasses.
   1181 
   1182        NOTE: This API should really only be used for object which
   1183        represent *encoded* Unicode !
   1184 
   1185     */
   1186     if (PyUnicode_Check(obj)) {
   1187         if (encoding) {
   1188             PyErr_SetString(PyExc_TypeError,
   1189                             "decoding Unicode is not supported");
   1190             return NULL;
   1191         }
   1192         return PyObject_Unicode(obj);
   1193     }
   1194 #else
   1195     if (PyUnicode_Check(obj)) {
   1196         PyErr_SetString(PyExc_TypeError,
   1197                         "decoding Unicode is not supported");
   1198         return NULL;
   1199     }
   1200 #endif
   1201 
   1202     /* Coerce object */
   1203     if (PyString_Check(obj)) {
   1204         s = PyString_AS_STRING(obj);
   1205         len = PyString_GET_SIZE(obj);
   1206     }
   1207     else if (PyByteArray_Check(obj)) {
   1208         /* Python 2.x specific */
   1209         PyErr_Format(PyExc_TypeError,
   1210                      "decoding bytearray is not supported");
   1211         return NULL;
   1212     }
   1213     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
   1214         /* Overwrite the error message with something more useful in
   1215            case of a TypeError. */
   1216         if (PyErr_ExceptionMatches(PyExc_TypeError))
   1217             PyErr_Format(PyExc_TypeError,
   1218                          "coercing to Unicode: need string or buffer, "
   1219                          "%.80s found",
   1220                          Py_TYPE(obj)->tp_name);
   1221         goto onError;
   1222     }
   1223 
   1224     /* Convert to Unicode */
   1225     if (len == 0)
   1226         _Py_RETURN_UNICODE_EMPTY();
   1227 
   1228     v = PyUnicode_Decode(s, len, encoding, errors);
   1229     return v;
   1230 
   1231   onError:
   1232     return NULL;
   1233 }
   1234 
   1235 PyObject *PyUnicode_Decode(const char *s,
   1236                            Py_ssize_t size,
   1237                            const char *encoding,
   1238                            const char *errors)
   1239 {
   1240     PyObject *buffer = NULL, *unicode;
   1241 
   1242     if (encoding == NULL)
   1243         encoding = PyUnicode_GetDefaultEncoding();
   1244 
   1245     /* Shortcuts for common default encodings */
   1246     if (strcmp(encoding, "utf-8") == 0)
   1247         return PyUnicode_DecodeUTF8(s, size, errors);
   1248     else if (strcmp(encoding, "latin-1") == 0)
   1249         return PyUnicode_DecodeLatin1(s, size, errors);
   1250 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
   1251     else if (strcmp(encoding, "mbcs") == 0)
   1252         return PyUnicode_DecodeMBCS(s, size, errors);
   1253 #endif
   1254     else if (strcmp(encoding, "ascii") == 0)
   1255         return PyUnicode_DecodeASCII(s, size, errors);
   1256 
   1257     /* Decode via the codec registry */
   1258     buffer = PyBuffer_FromMemory((void *)s, size);
   1259     if (buffer == NULL)
   1260         goto onError;
   1261     unicode = _PyCodec_DecodeText(buffer, encoding, errors);
   1262     if (unicode == NULL)
   1263         goto onError;
   1264     if (!PyUnicode_Check(unicode)) {
   1265         PyErr_Format(PyExc_TypeError,
   1266                      "decoder did not return an unicode object (type=%.400s)",
   1267                      Py_TYPE(unicode)->tp_name);
   1268         Py_DECREF(unicode);
   1269         goto onError;
   1270     }
   1271     Py_DECREF(buffer);
   1272     return unicode;
   1273 
   1274   onError:
   1275     Py_XDECREF(buffer);
   1276     return NULL;
   1277 }
   1278 
   1279 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
   1280                                     const char *encoding,
   1281                                     const char *errors)
   1282 {
   1283     PyObject *v;
   1284 
   1285     if (!PyUnicode_Check(unicode)) {
   1286         PyErr_BadArgument();
   1287         goto onError;
   1288     }
   1289 
   1290     if (PyErr_WarnPy3k("decoding Unicode is not supported in 3.x", 1) < 0)
   1291         goto onError;
   1292 
   1293     if (encoding == NULL)
   1294         encoding = PyUnicode_GetDefaultEncoding();
   1295 
   1296     /* Decode via the codec registry */
   1297     v = _PyCodec_DecodeText(unicode, encoding, errors);
   1298     if (v == NULL)
   1299         goto onError;
   1300     return v;
   1301 
   1302   onError:
   1303     return NULL;
   1304 }
   1305 
   1306 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
   1307                            Py_ssize_t size,
   1308                            const char *encoding,
   1309                            const char *errors)
   1310 {
   1311     PyObject *v, *unicode;
   1312 
   1313     unicode = PyUnicode_FromUnicode(s, size);
   1314     if (unicode == NULL)
   1315         return NULL;
   1316     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
   1317     Py_DECREF(unicode);
   1318     return v;
   1319 }
   1320 
   1321 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
   1322                                     const char *encoding,
   1323                                     const char *errors)
   1324 {
   1325     PyObject *v;
   1326 
   1327     if (!PyUnicode_Check(unicode)) {
   1328         PyErr_BadArgument();
   1329         goto onError;
   1330     }
   1331 
   1332     if (encoding == NULL)
   1333         encoding = PyUnicode_GetDefaultEncoding();
   1334 
   1335     /* Encode via the codec registry */
   1336     v = _PyCodec_EncodeText(unicode, encoding, errors);
   1337     if (v == NULL)
   1338         goto onError;
   1339     return v;
   1340 
   1341   onError:
   1342     return NULL;
   1343 }
   1344 
   1345 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
   1346                                     const char *encoding,
   1347                                     const char *errors)
   1348 {
   1349     PyObject *v;
   1350 
   1351     if (!PyUnicode_Check(unicode)) {
   1352         PyErr_BadArgument();
   1353         goto onError;
   1354     }
   1355 
   1356     if (encoding == NULL)
   1357         encoding = PyUnicode_GetDefaultEncoding();
   1358 
   1359     /* Shortcuts for common default encodings */
   1360     if (errors == NULL) {
   1361         if (strcmp(encoding, "utf-8") == 0)
   1362             return PyUnicode_AsUTF8String(unicode);
   1363         else if (strcmp(encoding, "latin-1") == 0)
   1364             return PyUnicode_AsLatin1String(unicode);
   1365 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
   1366         else if (strcmp(encoding, "mbcs") == 0)
   1367             return PyUnicode_AsMBCSString(unicode);
   1368 #endif
   1369         else if (strcmp(encoding, "ascii") == 0)
   1370             return PyUnicode_AsASCIIString(unicode);
   1371     }
   1372 
   1373     /* Encode via the codec registry */
   1374     v = _PyCodec_EncodeText(unicode, encoding, errors);
   1375     if (v == NULL)
   1376         goto onError;
   1377     if (!PyString_Check(v)) {
   1378         PyErr_Format(PyExc_TypeError,
   1379                      "encoder did not return a string object (type=%.400s)",
   1380                      Py_TYPE(v)->tp_name);
   1381         Py_DECREF(v);
   1382         goto onError;
   1383     }
   1384     return v;
   1385 
   1386   onError:
   1387     return NULL;
   1388 }
   1389 
   1390 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
   1391                                             const char *errors)
   1392 {
   1393     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
   1394 
   1395     if (v)
   1396         return v;
   1397     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
   1398     if (v && errors == NULL)
   1399         ((PyUnicodeObject *)unicode)->defenc = v;
   1400     return v;
   1401 }
   1402 
   1403 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
   1404 {
   1405     if (!PyUnicode_Check(unicode)) {
   1406         PyErr_BadArgument();
   1407         goto onError;
   1408     }
   1409     return PyUnicode_AS_UNICODE(unicode);
   1410 
   1411   onError:
   1412     return NULL;
   1413 }
   1414 
   1415 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
   1416 {
   1417     if (!PyUnicode_Check(unicode)) {
   1418         PyErr_BadArgument();
   1419         goto onError;
   1420     }
   1421     return PyUnicode_GET_SIZE(unicode);
   1422 
   1423   onError:
   1424     return -1;
   1425 }
   1426 
   1427 const char *PyUnicode_GetDefaultEncoding(void)
   1428 {
   1429     return unicode_default_encoding;
   1430 }
   1431 
   1432 int PyUnicode_SetDefaultEncoding(const char *encoding)
   1433 {
   1434     PyObject *v;
   1435 
   1436     /* Make sure the encoding is valid. As side effect, this also
   1437        loads the encoding into the codec registry cache. */
   1438     v = _PyCodec_Lookup(encoding);
   1439     if (v == NULL)
   1440         goto onError;
   1441     Py_DECREF(v);
   1442     strncpy(unicode_default_encoding,
   1443             encoding,
   1444             sizeof(unicode_default_encoding) - 1);
   1445     return 0;
   1446 
   1447   onError:
   1448     return -1;
   1449 }
   1450 
   1451 /* error handling callback helper:
   1452    build arguments, call the callback and check the arguments,
   1453    if no exception occurred, copy the replacement to the output
   1454    and adjust various state variables.
   1455    return 0 on success, -1 on error
   1456 */
   1457 
   1458 static
   1459 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
   1460                                      const char *encoding, const char *reason,
   1461                                      const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
   1462                                      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
   1463                                      PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
   1464 {
   1465     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
   1466 
   1467     PyObject *restuple = NULL;
   1468     PyObject *repunicode = NULL;
   1469     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
   1470     Py_ssize_t requiredsize;
   1471     Py_ssize_t newpos;
   1472     Py_UNICODE *repptr;
   1473     Py_ssize_t repsize;
   1474     int res = -1;
   1475 
   1476     if (*errorHandler == NULL) {
   1477         *errorHandler = PyCodec_LookupError(errors);
   1478         if (*errorHandler == NULL)
   1479             goto onError;
   1480     }
   1481 
   1482     if (*exceptionObject == NULL) {
   1483         *exceptionObject = PyUnicodeDecodeError_Create(
   1484             encoding, input, insize, *startinpos, *endinpos, reason);
   1485         if (*exceptionObject == NULL)
   1486             goto onError;
   1487     }
   1488     else {
   1489         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
   1490             goto onError;
   1491         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
   1492             goto onError;
   1493         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
   1494             goto onError;
   1495     }
   1496 
   1497     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
   1498     if (restuple == NULL)
   1499         goto onError;
   1500     if (!PyTuple_Check(restuple)) {
   1501         PyErr_SetString(PyExc_TypeError, &argparse[4]);
   1502         goto onError;
   1503     }
   1504     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
   1505         goto onError;
   1506     if (newpos<0)
   1507         newpos = insize+newpos;
   1508     if (newpos<0 || newpos>insize) {
   1509         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
   1510         goto onError;
   1511     }
   1512 
   1513     /* need more space? (at least enough for what we
   1514        have+the replacement+the rest of the string (starting
   1515        at the new input position), so we won't have to check space
   1516        when there are no errors in the rest of the string) */
   1517     repptr = PyUnicode_AS_UNICODE(repunicode);
   1518     repsize = PyUnicode_GET_SIZE(repunicode);
   1519     requiredsize = *outpos;
   1520     if (requiredsize > PY_SSIZE_T_MAX - repsize)
   1521         goto overflow;
   1522     requiredsize += repsize;
   1523     if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
   1524         goto overflow;
   1525     requiredsize += insize - newpos;
   1526     if (requiredsize > outsize) {
   1527         if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
   1528             requiredsize = 2*outsize;
   1529         if (_PyUnicode_Resize(output, requiredsize) < 0)
   1530             goto onError;
   1531         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
   1532     }
   1533     *endinpos = newpos;
   1534     *inptr = input + newpos;
   1535     Py_UNICODE_COPY(*outptr, repptr, repsize);
   1536     *outptr += repsize;
   1537     *outpos += repsize;
   1538     /* we made it! */
   1539     res = 0;
   1540 
   1541   onError:
   1542     Py_XDECREF(restuple);
   1543     return res;
   1544 
   1545   overflow:
   1546     PyErr_SetString(PyExc_OverflowError,
   1547                     "decoded result is too long for a Python string");
   1548     goto onError;
   1549 }
   1550 
   1551 /* --- UTF-7 Codec -------------------------------------------------------- */
   1552 
   1553 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
   1554 
   1555 /* Three simple macros defining base-64. */
   1556 
   /* IS_BASE64(c): true when c is one of the 64 base-64 alphabet
      characters (A-Z, a-z, 0-9, '+', '/').  The argument is evaluated
      more than once, so it must not have side effects. */

   #define IS_BASE64(c)                    \
       ((c) == '+' || (c) == '/' ||        \
        ((c) >= '0' && (c) <= '9') ||      \
        ((c) >= 'a' && (c) <= 'z') ||      \
        ((c) >= 'A' && (c) <= 'Z'))
   1564 
   /* FROM_BASE64(c): the 6-bit value (0..63) of the base-64 character c.
      The caller is expected to have checked c with IS_BASE64(); any
      character outside A-Z, a-z, 0-9 and '+' maps to 63, which is the
      value of '/'.  The argument is evaluated more than once. */

   #define FROM_BASE64(c)                                  \
       ((c) == '+' ? 62 :                                  \
        ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
        ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
        ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
        63)
   1572 
   /* TO_BASE64(n): the base-64 character encoding the low 6 bits of n;
      higher bits are masked off. */

   #define TO_BASE64(n)                    \
       ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
        "abcdefghijklmnopqrstuvwxyz"       \
        "0123456789+/"[(n) & 0x3f])
   1577 
   /* DECODE_DIRECT(c): true when a byte met in UTF-7 input stands for
    * itself.  Decoding is permissive: every value up to 127 is taken
    * literally except '+', which opens a base-64 section.  The argument
    * is evaluated more than once. */

   #define DECODE_DIRECT(c)                                \
       (!((c) > 127 || (c) == '+'))
   1585 
   1586 /* The UTF-7 encoder treats ASCII characters differently according to
   1587  * whether they are Set D, Set O, Whitespace, or special (i.e. none of
   1588  * the above).  See RFC2152.  This array identifies these different
   1589  * sets:
   1590  * 0 : "Set D"
   1591  *     alphanumeric and '(),-./:?
   1592  * 1 : "Set O"
   1593  *     !"#$%&*;<=>@[]^_`{|}
   1594  * 2 : "whitespace"
   1595  *     ht nl cr sp
   1596  * 3 : special (must be base64 encoded)
   1597  *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
   1598  */
   1599 
   /* Category (0 = Set D, 1 = Set O, 2 = whitespace, 3 = special) of each
      7-bit character code, indexed by the code.  Eight entries per row;
      the row comment gives the code range and, for printable rows, the
      characters themselves. */
   static
   char utf7_category[128] = {
       /* 0x00-0x07: nul .. bel           */  3, 3, 3, 3, 3, 3, 3, 3,
       /* 0x08-0x0f: bs ht nl vt np cr so si */  3, 2, 2, 3, 3, 2, 3, 3,
       /* 0x10-0x17: dle .. etb           */  3, 3, 3, 3, 3, 3, 3, 3,
       /* 0x18-0x1f: can .. us            */  3, 3, 3, 3, 3, 3, 3, 3,
       /* 0x20-0x27: sp ! " # $ % & '     */  2, 1, 1, 1, 1, 1, 1, 0,
       /* 0x28-0x2f: ( ) * + , - . /      */  0, 0, 1, 3, 0, 0, 0, 0,
       /* 0x30-0x37: 0 1 2 3 4 5 6 7      */  0, 0, 0, 0, 0, 0, 0, 0,
       /* 0x38-0x3f: 8 9 : ; < = > ?      */  0, 0, 0, 1, 1, 1, 1, 0,
       /* 0x40-0x47: @ A B C D E F G      */  1, 0, 0, 0, 0, 0, 0, 0,
       /* 0x48-0x4f: H I J K L M N O      */  0, 0, 0, 0, 0, 0, 0, 0,
       /* 0x50-0x57: P Q R S T U V W      */  0, 0, 0, 0, 0, 0, 0, 0,
       /* 0x58-0x5f: X Y Z [ \ ] ^ _      */  0, 0, 0, 1, 3, 1, 1, 1,
       /* 0x60-0x67: ` a b c d e f g      */  1, 0, 0, 0, 0, 0, 0, 0,
       /* 0x68-0x6f: h i j k l m n o      */  0, 0, 0, 0, 0, 0, 0, 0,
       /* 0x70-0x77: p q r s t u v w      */  0, 0, 0, 0, 0, 0, 0, 0,
       /* 0x78-0x7f: x y z { | } ~ del    */  0, 0, 0, 1, 1, 1, 3, 3,
   };
   1619 
   /* ENCODE_DIRECT(c, directO, directWS): true when character c should be
    * encoded as itself rather than in base-64.
    *
    *   c         character code; only 1..127 can ever be direct
    *   directO   non-zero to emit Set O characters (category 1) directly
    *   directWS  non-zero to emit whitespace (category 2) directly
    *
    * Set D characters (category 0) are always direct.  RFC2152 makes it
    * clear that the treatment of Set O and whitespace varies between
    * applications, hence the two flags.
    *
    * Every argument is parenthesized in the expansion so that compound
    * flag expressions (e.g. `a || b`) are treated as a unit.  c is
    * evaluated more than once and must not have side effects. */

   #define ENCODE_DIRECT(c, directO, directWS)             \
       ((c) < 128 && (c) > 0 &&                            \
        ((utf7_category[(c)] == 0) ||                      \
         ((directWS) && (utf7_category[(c)] == 2)) ||      \
         ((directO) && (utf7_category[(c)] == 1))))
   1631 
   1632 PyObject *PyUnicode_DecodeUTF7(const char *s,
   1633                                Py_ssize_t size,
   1634                                const char *errors)
   1635 {
   1636     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
   1637 }
   1638 
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * Parameters:
 *   s, size   - the UTF-7 encoded input bytes.
 *   errors    - error-handling name, handed unchanged to
 *               unicode_decode_call_errorhandler() for every malformed
 *               sequence.
 *   consumed  - may be NULL.  If non-NULL, an unfinished shift sequence
 *               at the end of the input is not an error: the output
 *               produced by that sequence is dropped and *consumed is
 *               set to the offset of its opening '+'.  Otherwise
 *               *consumed receives the number of bytes read.
 *
 * Returns the decoded Unicode object, or NULL if the allocation, the
 * error handler or the final resize fails. */

PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    /* Offset of the '+' opening the current shift sequence, or of the
       offending byte for an error outside a shift sequence. */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;
    /* Output position at which the current shift sequence started. */
    Py_UNICODE *shiftOutStart;
    /* base64buffer holds base64bits not-yet-emitted bits in its low end. */
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    /* Pending high surrogate waiting for its low partner; 0 if none. */
    Py_UNICODE surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* The output buffer starts with one slot per input byte and is cut
       back to the real length before returning. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    shiftOutStart = p;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UNICODE outCh = (Py_UNICODE)
                                       (base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
#ifdef Py_UNICODE_WIDE
                            /* combine the pair into one code point */
                            *p++ = (((surrogate & 0x3FF)<<10)
                                    | (outCh & 0x3FF)) + 0x10000;
#else
                            /* narrow build: store the pair as two units */
                            *p++ = surrogate;
                            *p++ = outCh;
#endif
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* no low surrogate followed: emit the lone
                               high surrogate and handle outCh below */
                            *p++ = surrogate;
                            surrogate = 0;
                        }
                    }
                    if (outCh >= 0xD800 && outCh <= 0xDBFF) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        *p++ = outCh;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A high surrogate still pending here never got its
                   partner; it is emitted only if the terminating
                   character is one that decodes as itself. */
                if (surrogate && DECODE_DIRECT(ch))
                    *p++ = surrogate;
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
                /* s was not advanced for any other terminator, so the
                   next iteration processes it outside the shift. */
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                *p++ = '+';
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = p;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            *p++ = ch;
            s++;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Report input range [startinpos, s) to the error handler.  It
           receives the addresses of s, unicode and p and may update
           them; a non-zero result aborts the decode. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                starts, size, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            outpos = p-PyUnicode_AS_UNICODE(unicode);
            endinpos = size;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    starts, size, &startinpos, &endinpos, &exc, &s,
                    &unicode, &outpos, &p))
                goto onError;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* Unfinished shift sequence: discard what it produced and
               report its opening '+' as the resume point. */
            p = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    /* Shrink the result to the number of code units actually written. */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
   1830 
   1831 
   1832 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
   1833                                Py_ssize_t size,
   1834                                int base64SetO,
   1835                                int base64WhiteSpace,
   1836                                const char *errors)
   1837 {
   1838     PyObject *v;
   1839     /* It might be possible to tighten this worst case */
   1840     Py_ssize_t allocated = 8 * size;
   1841     int inShift = 0;
   1842     Py_ssize_t i = 0;
   1843     unsigned int base64bits = 0;
   1844     unsigned long base64buffer = 0;
   1845     char * out;
   1846     char * start;
   1847 
   1848     if (allocated / 8 != size)
   1849         return PyErr_NoMemory();
   1850 
   1851     if (size == 0)
   1852         return PyString_FromStringAndSize(NULL, 0);
   1853 
   1854     v = PyString_FromStringAndSize(NULL, allocated);
   1855     if (v == NULL)
   1856         return NULL;
   1857 
   1858     start = out = PyString_AS_STRING(v);
   1859     for (;i < size; ++i) {
   1860         Py_UNICODE ch = s[i];
   1861 
   1862         if (inShift) {
   1863             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
   1864                 /* shifting out */
   1865                 if (base64bits) { /* output remaining bits */
   1866                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
   1867                     base64buffer = 0;
   1868                     base64bits = 0;
   1869                 }
   1870                 inShift = 0;
   1871                 /* Characters not in the BASE64 set implicitly unshift the sequence
   1872                    so no '-' is required, except if the character is itself a '-' */
   1873                 if (IS_BASE64(ch) || ch == '-') {
   1874                     *out++ = '-';
   1875                 }
   1876                 *out++ = (char) ch;
   1877             }
   1878             else {
   1879                 goto encode_char;
   1880             }
   1881         }
   1882         else { /* not in a shift sequence */
   1883             if (ch == '+') {
   1884                 *out++ = '+';
   1885                         *out++ = '-';
   1886             }
   1887             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
   1888                 *out++ = (char) ch;
   1889             }
   1890             else {
   1891                 *out++ = '+';
   1892                 inShift = 1;
   1893                 goto encode_char;
   1894             }
   1895         }
   1896         continue;
   1897 encode_char:
   1898 #ifdef Py_UNICODE_WIDE
   1899         if (ch >= 0x10000) {
   1900             /* code first surrogate */
   1901             base64bits += 16;
   1902             base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
   1903             while (base64bits >= 6) {
   1904                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
   1905                 base64bits -= 6;
   1906             }
   1907             /* prepare second surrogate */
   1908             ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
   1909         }
   1910 #endif
   1911         base64bits += 16;
   1912         base64buffer = (base64buffer << 16) | ch;
   1913         while (base64bits >= 6) {
   1914             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
   1915             base64bits -= 6;
   1916         }
   1917     }
   1918     if (base64bits)
   1919         *out++= TO_BASE64(base64buffer << (6-base64bits) );
   1920     if (inShift)
   1921         *out++ = '-';
   1922 
   1923     if (_PyString_Resize(&v, out - start))
   1924         return NULL;
   1925     return v;
   1926 }
   1927 
   1928 #undef IS_BASE64
   1929 #undef FROM_BASE64
   1930 #undef TO_BASE64
   1931 #undef DECODE_DIRECT
   1932 #undef ENCODE_DIRECT
   1933 
   1934 /* --- UTF-8 Codec -------------------------------------------------------- */
   1935 
/* Lookup table indexed by the first byte of a UTF-8 sequence.  The
   entries are: 1 for 00-7F, 0 for 80-BF, 0 for C0-C1, 2 for C2-DF,
   3 for E0-EF, 4 for F0-F4 and 0 for F5-FF. */
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
   1957 
   1958 PyObject *PyUnicode_DecodeUTF8(const char *s,
   1959                                Py_ssize_t size,
   1960                                const char *errors)
   1961 {
   1962     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
   1963 }
   1964 
/* Decode the `size` bytes of UTF-8 at `s` into a new Unicode object.
 *
 * Parameters:
 *   s, size   - the encoded input bytes.
 *   errors    - error-handling name, handed unchanged to
 *               unicode_decode_call_errorhandler() for every malformed
 *               sequence.
 *   consumed  - may be NULL.  If non-NULL, a multi-byte sequence cut off
 *               by the end of the input is not an error: decoding stops
 *               in front of it and *consumed receives the number of
 *               bytes decoded.  If NULL, such a sequence is reported as
 *               "unexpected end of data".
 *
 * Returns the decoded Unicode object, or NULL if the allocation, the
 * error handler or the final resize fails. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    int n;      /* sequence length announced by the current lead byte */
    int k;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        /* ASCII fast path */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];

        /* The announced sequence would run past the end of the input. */
        if (s + n > e) {
            if (consumed)
                /* leave s at the lead byte so the incomplete sequence
                   is not counted in *consumed */
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* extend the reported range over the continuation
                   bytes that are actually present */
                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
                    endinpos++;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            errmsg = "invalid start byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* Not expected to happen: bytes below 0x80 were handled by
               the fast path above, and utf8_code_length[] holds no 1
               for any byte >= 0x80. */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            /* XXX: surrogates shouldn't be valid UTF-8!
               see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
               Uncomment the 2 lines below to make them invalid,
               code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
            /* E0 followed by a byte below A0 would decode to a value
               below 0x800, i.e. an overlong form, so it is rejected. */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0)/* ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)*/) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;

                /* if s[1] first two bits are 1 and 0, then the invalid
                   continuation byte is s[2], so increment endinpos by 1,
                   if not, s[1] is invalid and endinpos doesn't need to
                   be incremented. */
                if ((s[1] & 0xC0) == 0x80)
                    endinpos++;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            /* F0 followed by a byte below 90 would decode to a value
               below 0x10000 (overlong); F4 followed by a byte above 8F
               would decode to a value above 0x10FFFF.  Both are
               rejected. */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                /* extend the reported range over each leading
                   continuation byte that has the 10xxxxxx form */
                if ((s[1] & 0xC0) == 0x80) {
                    endinpos++;
                    if ((s[2] & 0xC0) == 0x80)
                        endinpos++;
                }
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#ifdef Py_UNICODE_WIDE
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        /* a complete, valid sequence was decoded: step over it */
        s += n;
        continue;

      utf8Error:
        /* Report input range [startinpos, endinpos) to the error
           handler.  It receives the addresses of s, unicode and p and
           may update them; a non-zero result aborts the decode. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf8", errmsg,
                starts, size, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
    }
    if (consumed)
        *consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
   2144 
   2145 /* Allocation strategy:  if the string is short, convert into a stack buffer
   2146    and allocate exactly as much space needed at the end.  Else allocate the
   2147    maximum possible needed (4 result bytes per Unicode character), and return
   2148    the excess memory at the end.
   2149 */
   2150 PyObject *
   2151 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
   2152                      Py_ssize_t size,
   2153                      const char *errors)
   2154 {
   2155 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
   2156 
   2157     Py_ssize_t i;           /* index into s of next input byte */
   2158     PyObject *v;        /* result string object */
   2159     char *p;            /* next free byte in output buffer */
   2160     Py_ssize_t nallocated;  /* number of result bytes allocated */
   2161     Py_ssize_t nneeded;        /* number of result bytes needed */
   2162     char stackbuf[MAX_SHORT_UNICHARS * 4];
   2163 
   2164     assert(s != NULL);
   2165     assert(size >= 0);
   2166 
   2167     if (size <= MAX_SHORT_UNICHARS) {
   2168         /* Write into the stack buffer; nallocated can't overflow.
   2169          * At the end, we'll allocate exactly as much heap space as it
   2170          * turns out we need.
   2171          */
   2172         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
   2173         v = NULL;   /* will allocate after we're done */
   2174         p = stackbuf;
   2175     }
   2176     else {
   2177         /* Overallocate on the heap, and give the excess back at the end. */
   2178         nallocated = size * 4;
   2179         if (nallocated / 4 != size)  /* overflow! */
   2180             return PyErr_NoMemory();
   2181         v = PyString_FromStringAndSize(NULL, nallocated);
   2182         if (v == NULL)
   2183             return NULL;
   2184         p = PyString_AS_STRING(v);
   2185     }
   2186 
   2187     for (i = 0; i < size;) {
   2188         Py_UCS4 ch = s[i++];
   2189 
   2190         if (ch < 0x80)
   2191             /* Encode ASCII */
   2192             *p++ = (char) ch;
   2193 
   2194         else if (ch < 0x0800) {
   2195             /* Encode Latin-1 */
   2196             *p++ = (char)(0xc0 | (ch >> 6));
   2197             *p++ = (char)(0x80 | (ch & 0x3f));
   2198         }
   2199         else {
   2200             /* Encode UCS2 Unicode ordinals */
   2201             if (ch < 0x10000) {
   2202                 /* Special case: check for high surrogate */
   2203                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
   2204                     Py_UCS4 ch2 = s[i];
   2205                     /* Check for low surrogate and combine the two to
   2206                        form a UCS4 value */
   2207                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
   2208                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
   2209                         i++;
   2210                         goto encodeUCS4;
   2211                     }
   2212                     /* Fall through: handles isolated high surrogates */
   2213                 }
   2214                 *p++ = (char)(0xe0 | (ch >> 12));
   2215                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
   2216                 *p++ = (char)(0x80 | (ch & 0x3f));
   2217                 continue;
   2218             }
   2219           encodeUCS4:
   2220             /* Encode UCS4 Unicode ordinals */
   2221             *p++ = (char)(0xf0 | (ch >> 18));
   2222             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
   2223             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
   2224             *p++ = (char)(0x80 | (ch & 0x3f));
   2225         }
   2226     }
   2227 
   2228     if (v == NULL) {
   2229         /* This was stack allocated. */
   2230         nneeded = p - stackbuf;
   2231         assert(nneeded <= nallocated);
   2232         v = PyString_FromStringAndSize(stackbuf, nneeded);
   2233     }
   2234     else {
   2235         /* Cut back to size actually needed. */
   2236         nneeded = p - PyString_AS_STRING(v);
   2237         assert(nneeded <= nallocated);
   2238         if (_PyString_Resize(&v, nneeded))
   2239             return NULL;
   2240     }
   2241     return v;
   2242 
   2243 #undef MAX_SHORT_UNICHARS
   2244 }
   2245 
   2246 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
   2247 {
   2248     if (!PyUnicode_Check(unicode)) {
   2249         PyErr_BadArgument();
   2250         return NULL;
   2251     }
   2252     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
   2253                                 PyUnicode_GET_SIZE(unicode),
   2254                                 NULL);
   2255 }
   2256 
   2257 /* --- UTF-32 Codec ------------------------------------------------------- */
   2258 
   2259 PyObject *
   2260 PyUnicode_DecodeUTF32(const char *s,
   2261                       Py_ssize_t size,
   2262                       const char *errors,
   2263                       int *byteorder)
   2264 {
   2265     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
   2266 }
   2267 
   2268 PyObject *
   2269 PyUnicode_DecodeUTF32Stateful(const char *s,
   2270                               Py_ssize_t size,
   2271                               const char *errors,
   2272                               int *byteorder,
   2273                               Py_ssize_t *consumed)
   2274 {
   2275     const char *starts = s;
   2276     Py_ssize_t startinpos;
   2277     Py_ssize_t endinpos;
   2278     Py_ssize_t outpos;
   2279     PyUnicodeObject *unicode;
   2280     Py_UNICODE *p;
   2281 #ifndef Py_UNICODE_WIDE
   2282     int pairs = 0;
   2283     const unsigned char *qq;
   2284 #else
   2285     const int pairs = 0;
   2286 #endif
   2287     const unsigned char *q, *e;
   2288     int bo = 0;       /* assume native ordering by default */
   2289     const char *errmsg = "";
   2290     /* Offsets from q for retrieving bytes in the right order. */
   2291 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
   2292     int iorder[] = {0, 1, 2, 3};
   2293 #else
   2294     int iorder[] = {3, 2, 1, 0};
   2295 #endif
   2296     PyObject *errorHandler = NULL;
   2297     PyObject *exc = NULL;
   2298 
   2299     q = (unsigned char *)s;
   2300     e = q + size;
   2301 
   2302     if (byteorder)
   2303         bo = *byteorder;
   2304 
   2305     /* Check for BOM marks (U+FEFF) in the input and adjust current
   2306        byte order setting accordingly. In native mode, the leading BOM
   2307        mark is skipped, in all other modes, it is copied to the output
   2308        stream as-is (giving a ZWNBSP character). */
   2309     if (bo == 0) {
   2310         if (size >= 4) {
   2311             const Py_UCS4 bom = ((unsigned int)q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
   2312                 (q[iorder[1]] << 8) | q[iorder[0]];
   2313 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
   2314             if (bom == 0x0000FEFF) {
   2315                 q += 4;
   2316                 bo = -1;
   2317             }
   2318             else if (bom == 0xFFFE0000) {
   2319                 q += 4;
   2320                 bo = 1;
   2321             }
   2322 #else
   2323             if (bom == 0x0000FEFF) {
   2324                 q += 4;
   2325                 bo = 1;
   2326             }
   2327             else if (bom == 0xFFFE0000) {
   2328                 q += 4;
   2329                 bo = -1;
   2330             }
   2331 #endif
   2332         }
   2333     }
   2334 
   2335     if (bo == -1) {
   2336         /* force LE */
   2337         iorder[0] = 0;
   2338         iorder[1] = 1;
   2339         iorder[2] = 2;
   2340         iorder[3] = 3;
   2341     }
   2342     else if (bo == 1) {
   2343         /* force BE */
   2344         iorder[0] = 3;
   2345         iorder[1] = 2;
   2346         iorder[2] = 1;
   2347         iorder[3] = 0;
   2348     }
   2349 
   2350     /* On narrow builds we split characters outside the BMP into two
   2351        code points => count how much extra space we need. */
   2352 #ifndef Py_UNICODE_WIDE
   2353     for (qq = q; e - qq >= 4; qq += 4)
   2354         if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
   2355             pairs++;
   2356 #endif
   2357 
   2358     /* This might be one to much, because of a BOM */
   2359     unicode = _PyUnicode_New((size+3)/4+pairs);
   2360     if (!unicode)
   2361         return NULL;
   2362     if (size == 0)
   2363         return (PyObject *)unicode;
   2364 
   2365     /* Unpack UTF-32 encoded data */
   2366     p = unicode->str;
   2367 
   2368     while (q < e) {
   2369         Py_UCS4 ch;
   2370         /* remaining bytes at the end? (size should be divisible by 4) */
   2371         if (e-q<4) {
   2372             if (consumed)
   2373                 break;
   2374             errmsg = "truncated data";
   2375             startinpos = ((const char *)q)-starts;
   2376             endinpos = ((const char *)e)-starts;
   2377             goto utf32Error;
   2378             /* The remaining input chars are ignored if the callback
   2379                chooses to skip the input */
   2380         }
   2381         ch = ((unsigned int)q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
   2382             (q[iorder[1]] << 8) | q[iorder[0]];
   2383 
   2384         if (ch >= 0x110000)
   2385         {
   2386             errmsg = "code point not in range(0x110000)";
   2387             startinpos = ((const char *)q)-starts;
   2388             endinpos = startinpos+4;
   2389             goto utf32Error;
   2390         }
   2391 #ifndef Py_UNICODE_WIDE
   2392         if (ch >= 0x10000)
   2393         {
   2394             *p++ = 0xD800 | ((ch-0x10000) >> 10);
   2395             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
   2396         }
   2397         else
   2398 #endif
   2399             *p++ = ch;
   2400         q += 4;
   2401         continue;
   2402       utf32Error:
   2403         outpos = p-PyUnicode_AS_UNICODE(unicode);
   2404         if (unicode_decode_call_errorhandler(
   2405                 errors, &errorHandler,
   2406                 "utf32", errmsg,
   2407                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
   2408                 &unicode, &outpos, &p))
   2409             goto onError;
   2410     }
   2411 
   2412     if (byteorder)
   2413         *byteorder = bo;
   2414 
   2415     if (consumed)
   2416         *consumed = (const char *)q-starts;
   2417 
   2418     /* Adjust length */
   2419     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
   2420         goto onError;
   2421 
   2422     Py_XDECREF(errorHandler);
   2423     Py_XDECREF(exc);
   2424     return (PyObject *)unicode;
   2425 
   2426   onError:
   2427     Py_DECREF(unicode);
   2428     Py_XDECREF(errorHandler);
   2429     Py_XDECREF(exc);
   2430     return NULL;
   2431 }
   2432 
   2433 PyObject *
   2434 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
   2435                       Py_ssize_t size,
   2436                       const char *errors,
   2437                       int byteorder)
   2438 {
   2439     PyObject *v;
   2440     unsigned char *p;
   2441     Py_ssize_t nsize, bytesize;
   2442 #ifndef Py_UNICODE_WIDE
   2443     Py_ssize_t i, pairs;
   2444 #else
   2445     const int pairs = 0;
   2446 #endif
   2447     /* Offsets from p for storing byte pairs in the right order. */
   2448 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
   2449     int iorder[] = {0, 1, 2, 3};
   2450 #else
   2451     int iorder[] = {3, 2, 1, 0};
   2452 #endif
   2453 
   2454 #define STORECHAR(CH)                           \
   2455     do {                                        \
   2456         p[iorder[3]] = ((CH) >> 24) & 0xff;     \
   2457         p[iorder[2]] = ((CH) >> 16) & 0xff;     \
   2458         p[iorder[1]] = ((CH) >> 8) & 0xff;      \
   2459         p[iorder[0]] = (CH) & 0xff;             \
   2460         p += 4;                                 \
   2461     } while(0)
   2462 
   2463     /* In narrow builds we can output surrogate pairs as one code point,
   2464        so we need less space. */
   2465 #ifndef Py_UNICODE_WIDE
   2466     for (i = pairs = 0; i < size-1; i++)
   2467         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
   2468             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
   2469             pairs++;
   2470 #endif
   2471     nsize = (size - pairs + (byteorder == 0));
   2472     bytesize = nsize * 4;
   2473     if (bytesize / 4 != nsize)
   2474         return PyErr_NoMemory();
   2475     v = PyString_FromStringAndSize(NULL, bytesize);
   2476     if (v == NULL)
   2477         return NULL;
   2478 
   2479     p = (unsigned char *)PyString_AS_STRING(v);
   2480     if (byteorder == 0)
   2481         STORECHAR(0xFEFF);
   2482     if (size == 0)
   2483         return v;
   2484 
   2485     if (byteorder == -1) {
   2486         /* force LE */
   2487         iorder[0] = 0;
   2488         iorder[1] = 1;
   2489         iorder[2] = 2;
   2490         iorder[3] = 3;
   2491     }
   2492     else if (byteorder == 1) {
   2493         /* force BE */
   2494         iorder[0] = 3;
   2495         iorder[1] = 2;
   2496         iorder[2] = 1;
   2497         iorder[3] = 0;
   2498     }
   2499 
   2500     while (size-- > 0) {
   2501         Py_UCS4 ch = *s++;
   2502 #ifndef Py_UNICODE_WIDE
   2503         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
   2504             Py_UCS4 ch2 = *s;
   2505             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
   2506                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
   2507                 s++;
   2508                 size--;
   2509             }
   2510         }
   2511 #endif
   2512         STORECHAR(ch);
   2513     }
   2514     return v;
   2515 #undef STORECHAR
   2516 }
   2517 
   2518 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
   2519 {
   2520     if (!PyUnicode_Check(unicode)) {
   2521         PyErr_BadArgument();
   2522         return NULL;
   2523     }
   2524     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
   2525                                  PyUnicode_GET_SIZE(unicode),
   2526                                  NULL,
   2527                                  0);
   2528 }
   2529 
   2530 /* --- UTF-16 Codec ------------------------------------------------------- */
   2531 
   2532 PyObject *
   2533 PyUnicode_DecodeUTF16(const char *s,
   2534                       Py_ssize_t size,
   2535                       const char *errors,
   2536                       int *byteorder)
   2537 {
   2538     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
   2539 }
   2540 
   2541 PyObject *
   2542 PyUnicode_DecodeUTF16Stateful(const char *s,
   2543                               Py_ssize_t size,
   2544                               const char *errors,
   2545                               int *byteorder,
   2546                               Py_ssize_t *consumed)
   2547 {
   2548     const char *starts = s;
   2549     Py_ssize_t startinpos;
   2550     Py_ssize_t endinpos;
   2551     Py_ssize_t outpos;
   2552     PyUnicodeObject *unicode;
   2553     Py_UNICODE *p;
   2554     const unsigned char *q, *e;
   2555     int bo = 0;       /* assume native ordering by default */
   2556     const char *errmsg = "";
   2557     /* Offsets from q for retrieving byte pairs in the right order. */
   2558 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
   2559     int ihi = 1, ilo = 0;
   2560 #else
   2561     int ihi = 0, ilo = 1;
   2562 #endif
   2563     PyObject *errorHandler = NULL;
   2564     PyObject *exc = NULL;
   2565 
   2566     /* Note: size will always be longer than the resulting Unicode
   2567        character count */
   2568     unicode = _PyUnicode_New(size);
   2569     if (!unicode)
   2570         return NULL;
   2571     if (size == 0)
   2572         return (PyObject *)unicode;
   2573 
   2574     /* Unpack UTF-16 encoded data */
   2575     p = unicode->str;
   2576     q = (unsigned char *)s;
   2577     e = q + size;
   2578 
   2579     if (byteorder)
   2580         bo = *byteorder;
   2581 
   2582     /* Check for BOM marks (U+FEFF) in the input and adjust current
   2583        byte order setting accordingly. In native mode, the leading BOM
   2584        mark is skipped, in all other modes, it is copied to the output
   2585        stream as-is (giving a ZWNBSP character). */
   2586     if (bo == 0) {
   2587         if (size >= 2) {
   2588             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
   2589 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
   2590             if (bom == 0xFEFF) {
   2591                 q += 2;
   2592                 bo = -1;
   2593             }
   2594             else if (bom == 0xFFFE) {
   2595                 q += 2;
   2596                 bo = 1;
   2597             }
   2598 #else
   2599             if (bom == 0xFEFF) {
   2600                 q += 2;
   2601                 bo = 1;
   2602             }
   2603             else if (bom == 0xFFFE) {
   2604                 q += 2;
   2605                 bo = -1;
   2606             }
   2607 #endif
   2608         }
   2609     }
   2610 
   2611     if (bo == -1) {
   2612         /* force LE */
   2613         ihi = 1;
   2614         ilo = 0;
   2615     }
   2616     else if (bo == 1) {
   2617         /* force BE */
   2618         ihi = 0;
   2619         ilo = 1;
   2620     }
   2621 
   2622     while (q < e) {
   2623         Py_UNICODE ch;
   2624         /* remaining bytes at the end? (size should be even) */
   2625         if (e-q<2) {
   2626             if (consumed)
   2627                 break;
   2628             errmsg = "truncated data";
   2629             startinpos = ((const char *)q)-starts;
   2630             endinpos = ((const char *)e)-starts;
   2631             goto utf16Error;
   2632             /* The remaining input chars are ignored if the callback
   2633                chooses to skip the input */
   2634         }
   2635         ch = (q[ihi] << 8) | q[ilo];
   2636 
   2637         q += 2;
   2638 
   2639         if (ch < 0xD800 || ch > 0xDFFF) {
   2640             *p++ = ch;
   2641             continue;
   2642         }
   2643 
   2644         /* UTF-16 code pair: */
   2645         if (e - q < 2) {
   2646             q -= 2;
   2647             if (consumed)
   2648                 break;
   2649             errmsg = "unexpected end of data";
   2650             startinpos = ((const char *)q)-starts;
   2651             endinpos = ((const char *)e)-starts;
   2652             goto utf16Error;
   2653         }
   2654         if (0xD800 <= ch && ch <= 0xDBFF) {
   2655             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
   2656             q += 2;
   2657             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
   2658 #ifndef Py_UNICODE_WIDE
   2659                 *p++ = ch;
   2660                 *p++ = ch2;
   2661 #else
   2662                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
   2663 #endif
   2664                 continue;
   2665             }
   2666             else {
   2667                 errmsg = "illegal UTF-16 surrogate";
   2668                 startinpos = (((const char *)q)-4)-starts;
   2669                 endinpos = startinpos+2;
   2670                 goto utf16Error;
   2671             }
   2672 
   2673         }
   2674         errmsg = "illegal encoding";
   2675         startinpos = (((const char *)q)-2)-starts;
   2676         endinpos = startinpos+2;
   2677         /* Fall through to report the error */
   2678 
   2679       utf16Error:
   2680         outpos = p-PyUnicode_AS_UNICODE(unicode);
   2681         if (unicode_decode_call_errorhandler(
   2682                 errors, &errorHandler,
   2683                 "utf16", errmsg,
   2684                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
   2685                 &unicode, &outpos, &p))
   2686             goto onError;
   2687     }
   2688 
   2689     if (byteorder)
   2690         *byteorder = bo;
   2691 
   2692     if (consumed)
   2693         *consumed = (const char *)q-starts;
   2694 
   2695     /* Adjust length */
   2696     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
   2697         goto onError;
   2698 
   2699     Py_XDECREF(errorHandler);
   2700     Py_XDECREF(exc);
   2701     return (PyObject *)unicode;
   2702 
   2703   onError:
   2704     Py_DECREF(unicode);
   2705     Py_XDECREF(errorHandler);
   2706     Py_XDECREF(exc);
   2707     return NULL;
   2708 }
   2709 
   2710 PyObject *
   2711 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
   2712                       Py_ssize_t size,
   2713                       const char *errors,
   2714                       int byteorder)
   2715 {
   2716     PyObject *v;
   2717     unsigned char *p;
   2718     Py_ssize_t nsize, bytesize;
   2719 #ifdef Py_UNICODE_WIDE
   2720     Py_ssize_t i, pairs;
   2721 #else
   2722     const int pairs = 0;
   2723 #endif
   2724     /* Offsets from p for storing byte pairs in the right order. */
   2725 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
   2726     int ihi = 1, ilo = 0;
   2727 #else
   2728     int ihi = 0, ilo = 1;
   2729 #endif
   2730 
   2731 #define STORECHAR(CH)                           \
   2732     do {                                        \
   2733         p[ihi] = ((CH) >> 8) & 0xff;            \
   2734         p[ilo] = (CH) & 0xff;                   \
   2735         p += 2;                                 \
   2736     } while(0)
   2737 
   2738 #ifdef Py_UNICODE_WIDE
   2739     for (i = pairs = 0; i < size; i++)
   2740         if (s[i] >= 0x10000)
   2741             pairs++;
   2742 #endif
   2743     /* 2 * (size + pairs + (byteorder == 0)) */
   2744     if (size > PY_SSIZE_T_MAX ||
   2745         size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
   2746         return PyErr_NoMemory();
   2747     nsize = size + pairs + (byteorder == 0);
   2748     bytesize = nsize * 2;
   2749     if (bytesize / 2 != nsize)
   2750         return PyErr_NoMemory();
   2751     v = PyString_FromStringAndSize(NULL, bytesize);
   2752     if (v == NULL)
   2753         return NULL;
   2754 
   2755     p = (unsigned char *)PyString_AS_STRING(v);
   2756     if (byteorder == 0)
   2757         STORECHAR(0xFEFF);
   2758     if (size == 0)
   2759         return v;
   2760 
   2761     if (byteorder == -1) {
   2762         /* force LE */
   2763         ihi = 1;
   2764         ilo = 0;
   2765     }
   2766     else if (byteorder == 1) {
   2767         /* force BE */
   2768         ihi = 0;
   2769         ilo = 1;
   2770     }
   2771 
   2772     while (size-- > 0) {
   2773         Py_UNICODE ch = *s++;
   2774         Py_UNICODE ch2 = 0;
   2775 #ifdef Py_UNICODE_WIDE
   2776         if (ch >= 0x10000) {
   2777             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
   2778             ch  = 0xD800 | ((ch-0x10000) >> 10);
   2779         }
   2780 #endif
   2781         STORECHAR(ch);
   2782         if (ch2)
   2783             STORECHAR(ch2);
   2784     }
   2785     return v;
   2786 #undef STORECHAR
   2787 }
   2788 
   2789 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
   2790 {
   2791     if (!PyUnicode_Check(unicode)) {
   2792         PyErr_BadArgument();
   2793         return NULL;
   2794     }
   2795     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
   2796                                  PyUnicode_GET_SIZE(unicode),
   2797                                  NULL,
   2798                                  0);
   2799 }
   2800 
   2801 /* --- Unicode Escape Codec ----------------------------------------------- */
   2802 
   2803 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
   2804 
   2805 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
   2806                                         Py_ssize_t size,
   2807                                         const char *errors)
   2808 {
   2809     const char *starts = s;
   2810     Py_ssize_t startinpos;
   2811     Py_ssize_t endinpos;
   2812     Py_ssize_t outpos;
   2813     PyUnicodeObject *v;
   2814     Py_UNICODE *p;
   2815     const char *end;
   2816     char* message;
   2817     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
   2818     PyObject *errorHandler = NULL;
   2819     PyObject *exc = NULL;
   2820 
   2821     /* Escaped strings will always be longer than the resulting
   2822        Unicode string, so we start with size here and then reduce the
   2823        length after conversion to the true value.
   2824        (but if the error callback returns a long replacement string
   2825        we'll have to allocate more space) */
   2826     v = _PyUnicode_New(size);
   2827     if (v == NULL)
   2828         goto onError;
   2829     if (size == 0)
   2830         return (PyObject *)v;
   2831 
   2832     p = PyUnicode_AS_UNICODE(v);
   2833     end = s + size;
   2834 
   2835     while (s < end) {
   2836         unsigned char c;
   2837         Py_UNICODE x;
   2838         int digits;
   2839 
   2840         /* Non-escape characters are interpreted as Unicode ordinals */
   2841         if (*s != '\\') {
   2842             *p++ = (unsigned char) *s++;
   2843             continue;
   2844         }
   2845 
   2846         startinpos = s-starts;
   2847         /* \ - Escapes */
   2848         s++;
   2849         c = *s++;
   2850         if (s > end)
   2851             c = '\0'; /* Invalid after \ */
   2852         switch (c) {
   2853 
   2854             /* \x escapes */
   2855         case '\n': break;
   2856         case '\\': *p++ = '\\'; break;
   2857         case '\'': *p++ = '\''; break;
   2858         case '\"': *p++ = '\"'; break;
   2859         case 'b': *p++ = '\b'; break;
   2860         case 'f': *p++ = '\014'; break; /* FF */
   2861         case 't': *p++ = '\t'; break;
   2862         case 'n': *p++ = '\n'; break;
   2863         case 'r': *p++ = '\r'; break;
   2864         case 'v': *p++ = '\013'; break; /* VT */
   2865         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
   2866 
   2867             /* \OOO (octal) escapes */
   2868         case '0': case '1': case '2': case '3':
   2869         case '4': case '5': case '6': case '7':
   2870             x = s[-1] - '0';
   2871             if (s < end && '0' <= *s && *s <= '7') {
   2872                 x = (x<<3) + *s++ - '0';
   2873                 if (s < end && '0' <= *s && *s <= '7')
   2874                     x = (x<<3) + *s++ - '0';
   2875             }
   2876             *p++ = x;
   2877             break;
   2878 
   2879             /* hex escapes */
   2880             /* \xXX */
   2881         case 'x':
   2882             digits = 2;
   2883             message = "truncated \\xXX escape";
   2884             goto hexescape;
   2885 
   2886             /* \uXXXX */
   2887         case 'u':
   2888             digits = 4;
   2889             message = "truncated \\uXXXX escape";
   2890             goto hexescape;
   2891 
   2892             /* \UXXXXXXXX */
   2893         case 'U':
   2894             digits = 8;
   2895             message = "truncated \\UXXXXXXXX escape";
   2896         hexescape:
   2897             chr = 0;
   2898             if (end - s < digits) {
   2899                 /* count only hex digits */
   2900                 for (; s < end; ++s) {
   2901                     c = (unsigned char)*s;
   2902                     if (!Py_ISXDIGIT(c))
   2903                         goto error;
   2904                 }
   2905                 goto error;
   2906             }
   2907             for (; digits--; ++s) {
   2908                 c = (unsigned char)*s;
   2909                 if (!Py_ISXDIGIT(c))
   2910                     goto error;
   2911                 chr = (chr<<4) & ~0xF;
   2912                 if (c >= '0' && c <= '9')
   2913                     chr += c - '0';
   2914                 else if (c >= 'a' && c <= 'f')
   2915                     chr += 10 + c - 'a';
   2916                 else
   2917                     chr += 10 + c - 'A';
   2918             }
   2919             if (chr == 0xffffffff && PyErr_Occurred())
   2920                 /* _decoding_error will have already written into the
   2921                    target buffer. */
   2922                 break;
   2923         store:
   2924             /* when we get here, chr is a 32-bit unicode character */
   2925             if (chr <= 0xffff)
   2926                 /* UCS-2 character */
   2927                 *p++ = (Py_UNICODE) chr;
   2928             else if (chr <= 0x10ffff) {
   2929                 /* UCS-4 character. Either store directly, or as
   2930                    surrogate pair. */
   2931 #ifdef Py_UNICODE_WIDE
   2932                 *p++ = chr;
   2933 #else
   2934                 chr -= 0x10000L;
   2935                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
   2936                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
   2937 #endif
   2938             } else {
   2939                 message = "illegal Unicode character";
   2940                 goto error;
   2941             }
   2942             break;
   2943 
   2944             /* \N{name} */
   2945         case 'N':
   2946             message = "malformed \\N character escape";
   2947             if (ucnhash_CAPI == NULL) {
   2948                 /* load the unicode data module */
   2949                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
   2950                 if (ucnhash_CAPI == NULL)
   2951                     goto ucnhashError;
   2952             }
   2953             if (*s == '{') {
   2954                 const char *start = s+1;
   2955                 /* look for the closing brace */
   2956                 while (*s != '}' && s < end)
   2957                     s++;
   2958                 if (s > start && s < end && *s == '}') {
   2959                     /* found a name.  look it up in the unicode database */
   2960                     message = "unknown Unicode character name";
   2961                     s++;
   2962                     if (s - start - 1 <= INT_MAX &&
   2963                         ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
   2964                         goto store;
   2965                 }
   2966             }
   2967             goto error;
   2968 
   2969         default:
   2970             if (s > end) {
   2971                 message = "\\ at end of string";
   2972                 s--;
   2973                 goto error;
   2974             }
   2975             else {
   2976                 *p++ = '\\';
   2977                 *p++ = (unsigned char)s[-1];
   2978             }
   2979             break;
   2980         }
   2981         continue;
   2982 
   2983       error:
   2984         endinpos = s-starts;
   2985         outpos = p-PyUnicode_AS_UNICODE(v);
   2986         if (unicode_decode_call_errorhandler(
   2987                 errors, &errorHandler,
   2988                 "unicodeescape", message,
   2989                 starts, size, &startinpos, &endinpos, &exc, &s,
   2990                 &v, &outpos, &p))
   2991             goto onError;
   2992         continue;
   2993     }
   2994     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
   2995         goto onError;
   2996     Py_XDECREF(errorHandler);
   2997     Py_XDECREF(exc);
   2998     return (PyObject *)v;
   2999 
   3000   ucnhashError:
   3001     PyErr_SetString(
   3002         PyExc_UnicodeError,
   3003         "\\N escapes not supported (can't load unicodedata module)"
   3004         );
   3005     Py_XDECREF(v);
   3006     Py_XDECREF(errorHandler);
   3007     Py_XDECREF(exc);
   3008     return NULL;
   3009 
   3010   onError:
   3011     Py_XDECREF(v);
   3012     Py_XDECREF(errorHandler);
   3013     Py_XDECREF(exc);
   3014     return NULL;
   3015 }
   3016 
   3017 /* Return a Unicode-Escape string version of the Unicode object.
   3018 
   3019    If quotes is true, the string is enclosed in u"" or u'' quotes as
   3020    appropriate.
   3021 
   3022 */
   3023 
   3024 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
   3025                                              Py_ssize_t size,
   3026                                              Py_UNICODE ch)
   3027 {
   3028     /* like wcschr, but doesn't stop at NULL characters */
   3029 
   3030     while (size-- > 0) {
   3031         if (*s == ch)
   3032             return s;
   3033         s++;
   3034     }
   3035 
   3036     return NULL;
   3037 }
   3038 
   3039 static
   3040 PyObject *unicodeescape_string(const Py_UNICODE *s,
   3041                                Py_ssize_t size,
   3042                                int quotes)
   3043 {
   3044     PyObject *repr;
   3045     char *p;
   3046 
   3047     static const char *hexdigit = "0123456789abcdef";
   3048 #ifdef Py_UNICODE_WIDE
   3049     const Py_ssize_t expandsize = 10;
   3050 #else
   3051     const Py_ssize_t expandsize = 6;
   3052 #endif
   3053 
   3054     /* XXX(nnorwitz): rather than over-allocating, it would be
   3055        better to choose a different scheme.  Perhaps scan the
   3056        first N-chars of the string and allocate based on that size.
   3057     */
   3058     /* Initial allocation is based on the longest-possible unichr
   3059        escape.
   3060 
   3061        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
   3062        unichr, so in this case it's the longest unichr escape. In
   3063        narrow (UTF-16) builds this is five chars per source unichr
   3064        since there are two unichrs in the surrogate pair, so in narrow
   3065        (UTF-16) builds it's not the longest unichr escape.
   3066 
   3067        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
   3068        so in the narrow (UTF-16) build case it's the longest unichr
   3069        escape.
   3070     */
   3071 
   3072     if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
   3073         return PyErr_NoMemory();
   3074 
   3075     repr = PyString_FromStringAndSize(NULL,
   3076                                       2
   3077                                       + expandsize*size
   3078                                       + 1);
   3079     if (repr == NULL)
   3080         return NULL;
   3081 
   3082     p = PyString_AS_STRING(repr);
   3083 
   3084     if (quotes) {
   3085         *p++ = 'u';
   3086         *p++ = (findchar(s, size, '\'') &&
   3087                 !findchar(s, size, '"')) ? '"' : '\'';
   3088     }
   3089     while (size-- > 0) {
   3090         Py_UNICODE ch = *s++;
   3091 
   3092         /* Escape quotes and backslashes */
   3093         if ((quotes &&
   3094              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
   3095             *p++ = '\\';
   3096             *p++ = (char) ch;
   3097             continue;
   3098         }
   3099 
   3100 #ifdef Py_UNICODE_WIDE
   3101         /* Map 21-bit characters to '\U00xxxxxx' */
   3102         else if (ch >= 0x10000) {
   3103             *p++ = '\\';
   3104             *p++ = 'U';
   3105             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
   3106             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
   3107             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
   3108             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
   3109             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
   3110             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
   3111             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
   3112             *p++ = hexdigit[ch & 0x0000000F];
   3113             continue;
   3114         }
   3115 #else
   3116         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
   3117         else if (ch >= 0xD800 && ch < 0xDC00) {
   3118             Py_UNICODE ch2;
   3119             Py_UCS4 ucs;
   3120 
   3121             ch2 = *s++;
   3122             size--;
   3123             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
   3124                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
   3125                 *p++ = '\\';
   3126                 *p++ = 'U';
   3127                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
   3128                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
   3129                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
   3130                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
   3131                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
   3132                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
   3133                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
   3134                 *p++ = hexdigit[ucs & 0x0000000F];
   3135                 continue;
   3136             }
   3137             /* Fall through: isolated surrogates are copied as-is */
   3138             s--;
   3139             size++;
   3140         }
   3141 #endif
   3142 
   3143         /* Map 16-bit characters to '\uxxxx' */
   3144         if (ch >= 256) {
   3145             *p++ = '\\';
   3146             *p++ = 'u';
   3147             *p++ = hexdigit[(ch >> 12) & 0x000F];
   3148             *p++ = hexdigit[(ch >> 8) & 0x000F];
   3149             *p++ = hexdigit[(ch >> 4) & 0x000F];
   3150             *p++ = hexdigit[ch & 0x000F];
   3151         }
   3152 
   3153         /* Map special whitespace to '\t', \n', '\r' */
   3154         else if (ch == '\t') {
   3155             *p++ = '\\';
   3156             *p++ = 't';
   3157         }
   3158         else if (ch == '\n') {
   3159             *p++ = '\\';
   3160             *p++ = 'n';
   3161         }
   3162         else if (ch == '\r') {
   3163             *p++ = '\\';
   3164             *p++ = 'r';
   3165         }
   3166 
   3167         /* Map non-printable US ASCII to '\xhh' */
   3168         else if (ch < ' ' || ch >= 0x7F) {
   3169             *p++ = '\\';
   3170             *p++ = 'x';
   3171             *p++ = hexdigit[(ch >> 4) & 0x000F];
   3172             *p++ = hexdigit[ch & 0x000F];
   3173         }
   3174 
   3175         /* Copy everything else as-is */
   3176         else
   3177             *p++ = (char) ch;
   3178     }
   3179     if (quotes)
   3180         *p++ = PyString_AS_STRING(repr)[1];
   3181 
   3182     *p = '\0';
   3183     if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
   3184         return NULL;
   3185     return repr;
   3186 }
   3187 
   3188 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
   3189                                         Py_ssize_t size)
   3190 {
   3191     return unicodeescape_string(s, size, 0);
   3192 }
   3193 
   3194 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
   3195 {
   3196     if (!PyUnicode_Check(unicode)) {
   3197         PyErr_BadArgument();
   3198         return NULL;
   3199     }
   3200     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
   3201                                          PyUnicode_GET_SIZE(unicode));
   3202 }
   3203 
   3204 /* --- Raw Unicode Escape Codec ------------------------------------------- */
   3205 
   3206 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
   3207                                            Py_ssize_t size,
   3208                                            const char *errors)
   3209 {
   3210     const char *starts = s;
   3211     Py_ssize_t startinpos;
   3212     Py_ssize_t endinpos;
   3213     Py_ssize_t outpos;
   3214     PyUnicodeObject *v;
   3215     Py_UNICODE *p;
   3216     const char *end;
   3217     const char *bs;
   3218     PyObject *errorHandler = NULL;
   3219     PyObject *exc = NULL;
   3220 
   3221     /* Escaped strings will always be longer than the resulting
   3222        Unicode string, so we start with size here and then reduce the
   3223        length after conversion to the true value. (But decoding error
   3224        handler might have to resize the string) */
   3225     v = _PyUnicode_New(size);
   3226     if (v == NULL)
   3227         goto onError;
   3228     if (size == 0)
   3229         return (PyObject *)v;
   3230     p = PyUnicode_AS_UNICODE(v);
   3231     end = s + size;
   3232     while (s < end) {
   3233         unsigned char c;
   3234         Py_UCS4 x;
   3235         int i;
   3236         int count;
   3237 
   3238         /* Non-escape characters are interpreted as Unicode ordinals */
   3239         if (*s != '\\') {
   3240             *p++ = (unsigned char)*s++;
   3241             continue;
   3242         }
   3243         startinpos = s-starts;
   3244 
   3245         /* \u-escapes are only interpreted iff the number of leading
   3246            backslashes if odd */
   3247         bs = s;
   3248         for (;s < end;) {
   3249             if (*s != '\\')
   3250                 break;
   3251             *p++ = (unsigned char)*s++;
   3252         }
   3253         if (((s - bs) & 1) == 0 ||
   3254             s >= end ||
   3255             (*s != 'u' && *s != 'U')) {
   3256             continue;
   3257         }
   3258         p--;
   3259         count = *s=='u' ? 4 : 8;
   3260         s++;
   3261 
   3262         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
   3263         outpos = p-PyUnicode_AS_UNICODE(v);
   3264         for (x = 0, i = 0; i < count; ++i, ++s) {
   3265             c = (unsigned char)*s;
   3266             if (!isxdigit(c)) {
   3267                 endinpos = s-starts;
   3268                 if (unicode_decode_call_errorhandler(
   3269                         errors, &errorHandler,
   3270                         "rawunicodeescape", "truncated \\uXXXX",
   3271                         starts, size, &startinpos, &endinpos, &exc, &s,
   3272                         &v, &outpos, &p))
   3273                     goto onError;
   3274                 goto nextByte;
   3275             }
   3276             x = (x<<4) & ~0xF;
   3277             if (c >= '0' && c <= '9')
   3278                 x += c - '0';
   3279             else if (c >= 'a' && c <= 'f')
   3280                 x += 10 + c - 'a';
   3281             else
   3282                 x += 10 + c - 'A';
   3283         }
   3284         if (x <= 0xffff)
   3285             /* UCS-2 character */
   3286             *p++ = (Py_UNICODE) x;
   3287         else if (x <= 0x10ffff) {
   3288             /* UCS-4 character. Either store directly, or as
   3289                surrogate pair. */
   3290 #ifdef Py_UNICODE_WIDE
   3291             *p++ = (Py_UNICODE) x;
   3292 #else
   3293             x -= 0x10000L;
   3294             *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
   3295             *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
   3296 #endif
   3297         } else {
   3298             endinpos = s-starts;
   3299             outpos = p-PyUnicode_AS_UNICODE(v);
   3300             if (unicode_decode_call_errorhandler(
   3301                     errors, &errorHandler,
   3302                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
   3303                     starts, size, &startinpos, &endinpos, &exc, &s,
   3304                     &v, &outpos, &p))
   3305                 goto onError;
   3306         }
   3307       nextByte:
   3308         ;
   3309     }
   3310     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
   3311         goto onError;
   3312     Py_XDECREF(errorHandler);
   3313     Py_XDECREF(exc);
   3314     return (PyObject *)v;
   3315 
   3316   onError:
   3317     Py_XDECREF(v);
   3318     Py_XDECREF(errorHandler);
   3319     Py_XDECREF(exc);
   3320     return NULL;
   3321 }
   3322 
   3323 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
   3324                                            Py_ssize_t size)
   3325 {
   3326     PyObject *repr;
   3327     char *p;
   3328     char *q;
   3329 
   3330     static const char *hexdigit = "0123456789abcdef";
   3331 #ifdef Py_UNICODE_WIDE
   3332     const Py_ssize_t expandsize = 10;
   3333 #else
   3334     const Py_ssize_t expandsize = 6;
   3335 #endif
   3336 
   3337     if (size > PY_SSIZE_T_MAX / expandsize)
   3338         return PyErr_NoMemory();
   3339 
   3340     repr = PyString_FromStringAndSize(NULL, expandsize * size);
   3341     if (repr == NULL)
   3342         return NULL;
   3343     if (size == 0)
   3344         return repr;
   3345 
   3346     p = q = PyString_AS_STRING(repr);
   3347     while (size-- > 0) {
   3348         Py_UNICODE ch = *s++;
   3349 #ifdef Py_UNICODE_WIDE
   3350         /* Map 32-bit characters to '\Uxxxxxxxx' */
   3351         if (ch >= 0x10000) {
   3352             *p++ = '\\';
   3353             *p++ = 'U';
   3354             *p++ = hexdigit[(ch >> 28) & 0xf];
   3355             *p++ = hexdigit[(ch >> 24) & 0xf];
   3356             *p++ = hexdigit[(ch >> 20) & 0xf];
   3357             *p++ = hexdigit[(ch >> 16) & 0xf];
   3358             *p++ = hexdigit[(ch >> 12) & 0xf];
   3359             *p++ = hexdigit[(ch >> 8) & 0xf];
   3360             *p++ = hexdigit[(ch >> 4) & 0xf];
   3361             *p++ = hexdigit[ch & 15];
   3362         }
   3363         else
   3364 #else
   3365             /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
   3366             if (ch >= 0xD800 && ch < 0xDC00) {
   3367                 Py_UNICODE ch2;
   3368                 Py_UCS4 ucs;
   3369 
   3370                 ch2 = *s++;
   3371                 size--;
   3372                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
   3373                     ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
   3374                     *p++ = '\\';
   3375                     *p++ = 'U';
   3376                     *p++ = hexdigit[(ucs >> 28) & 0xf];
   3377                     *p++ = hexdigit[(ucs >> 24) & 0xf];
   3378                     *p++ = hexdigit[(ucs >> 20) & 0xf];
   3379                     *p++ = hexdigit[(ucs >> 16) & 0xf];
   3380                     *p++ = hexdigit[(ucs >> 12) & 0xf];
   3381                     *p++ = hexdigit[(ucs >> 8) & 0xf];
   3382                     *p++ = hexdigit[(ucs >> 4) & 0xf];
   3383                     *p++ = hexdigit[ucs & 0xf];
   3384                     continue;
   3385                 }
   3386                 /* Fall through: isolated surrogates are copied as-is */
   3387                 s--;
   3388                 size++;
   3389             }
   3390 #endif
   3391         /* Map 16-bit characters to '\uxxxx' */
   3392         if (ch >= 256) {
   3393             *p++ = '\\';
   3394             *p++ = 'u';
   3395             *p++ = hexdigit[(ch >> 12) & 0xf];
   3396             *p++ = hexdigit[(ch >> 8) & 0xf];
   3397             *p++ = hexdigit[(ch >> 4) & 0xf];
   3398             *p++ = hexdigit[ch & 15];
   3399         }
   3400         /* Copy everything else as-is */
   3401         else
   3402             *p++ = (char) ch;
   3403     }
   3404     *p = '\0';
   3405     if (_PyString_Resize(&repr, p - q))
   3406         return NULL;
   3407     return repr;
   3408 }
   3409 
   3410 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
   3411 {
   3412     if (!PyUnicode_Check(unicode)) {
   3413         PyErr_BadArgument();
   3414         return NULL;
   3415     }
   3416     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
   3417                                             PyUnicode_GET_SIZE(unicode));
   3418 }
   3419 
   3420 /* --- Unicode Internal Codec ------------------------------------------- */
   3421 
   3422 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
   3423                                            Py_ssize_t size,
   3424                                            const char *errors)
   3425 {
   3426     const char *starts = s;
   3427     Py_ssize_t startinpos;
   3428     Py_ssize_t endinpos;
   3429     Py_ssize_t outpos;
   3430     PyUnicodeObject *v;
   3431     Py_UNICODE *p;
   3432     const char *end;
   3433     const char *reason;
   3434     PyObject *errorHandler = NULL;
   3435     PyObject *exc = NULL;
   3436 
   3437 #ifdef Py_UNICODE_WIDE
   3438     Py_UNICODE unimax = PyUnicode_GetMax();
   3439 #endif
   3440 
   3441     /* XXX overflow detection missing */
   3442     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
   3443     if (v == NULL)
   3444         goto onError;
   3445     if (PyUnicode_GetSize((PyObject *)v) == 0)
   3446         return (PyObject *)v;
   3447     p = PyUnicode_AS_UNICODE(v);
   3448     end = s + size;
   3449 
   3450     while (s < end) {
   3451         if (end-s < Py_UNICODE_SIZE) {
   3452             endinpos = end-starts;
   3453             reason = "truncated input";
   3454             goto error;
   3455         }
   3456         memcpy(p, s, sizeof(Py_UNICODE));
   3457 #ifdef Py_UNICODE_WIDE
   3458         /* We have to sanity check the raw data, otherwise doom looms for
   3459            some malformed UCS-4 data. */
   3460         if (*p > unimax || *p < 0) {
   3461             endinpos = s - starts + Py_UNICODE_SIZE;
   3462             reason = "illegal code point (> 0x10FFFF)";
   3463             goto error;
   3464         }
   3465 #endif
   3466         p++;
   3467         s += Py_UNICODE_SIZE;
   3468         continue;
   3469 
   3470   error:
   3471         startinpos = s - starts;
   3472         outpos = p - PyUnicode_AS_UNICODE(v);
   3473         if (unicode_decode_call_errorhandler(
   3474                 errors, &errorHandler,
   3475                 "unicode_internal", reason,
   3476                 starts, size, &startinpos, &endinpos, &exc, &s,
   3477                 &v, &outpos, &p)) {
   3478             goto onError;
   3479         }
   3480     }
   3481 
   3482     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
   3483         goto onError;
   3484     Py_XDECREF(errorHandler);
   3485     Py_XDECREF(exc);
   3486     return (PyObject *)v;
   3487 
   3488   onError:
   3489     Py_XDECREF(v);
   3490     Py_XDECREF(errorHandler);
   3491     Py_XDECREF(exc);
   3492     return NULL;
   3493 }
   3494 
   3495 /* --- Latin-1 Codec ------------------------------------------------------ */
   3496 
   3497 PyObject *PyUnicode_DecodeLatin1(const char *s,
   3498                                  Py_ssize_t size,
   3499                                  const char *errors)
   3500 {
   3501     PyUnicodeObject *v;
   3502     Py_UNICODE *p;
   3503 
   3504     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
   3505     if (size == 1) {
   3506         Py_UNICODE r = *(unsigned char*)s;
   3507         return PyUnicode_FromUnicode(&r, 1);
   3508     }
   3509 
   3510     v = _PyUnicode_New(size);
   3511     if (v == NULL)
   3512         goto onError;
   3513     if (size == 0)
   3514         return (PyObject *)v;
   3515     p = PyUnicode_AS_UNICODE(v);
   3516     while (size-- > 0)
   3517         *p++ = (unsigned char)*s++;
   3518     return (PyObject *)v;
   3519 
   3520   onError:
   3521     Py_XDECREF(v);
   3522     return NULL;
   3523 }
   3524 
   3525 /* create or adjust a UnicodeEncodeError */
   3526 static void make_encode_exception(PyObject **exceptionObject,
   3527                                   const char *encoding,
   3528                                   const Py_UNICODE *unicode, Py_ssize_t size,
   3529                                   Py_ssize_t startpos, Py_ssize_t endpos,
   3530                                   const char *reason)
   3531 {
   3532     if (*exceptionObject == NULL) {
   3533         *exceptionObject = PyUnicodeEncodeError_Create(
   3534             encoding, unicode, size, startpos, endpos, reason);
   3535     }
   3536     else {
   3537         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
   3538             goto onError;
   3539         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
   3540             goto onError;
   3541         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
   3542             goto onError;
   3543         return;
   3544       onError:
   3545         Py_CLEAR(*exceptionObject);
   3546     }
   3547 }
   3548 
   3549 /* raises a UnicodeEncodeError */
   3550 static void raise_encode_exception(PyObject **exceptionObject,
   3551                                    const char *encoding,
   3552                                    const Py_UNICODE *unicode, Py_ssize_t size,
   3553                                    Py_ssize_t startpos, Py_ssize_t endpos,
   3554                                    const char *reason)
   3555 {
   3556     make_encode_exception(exceptionObject,
   3557                           encoding, unicode, size, startpos, endpos, reason);
   3558     if (*exceptionObject != NULL)
   3559         PyCodec_StrictErrors(*exceptionObject);
   3560 }
   3561 
   3562 /* error handling callback helper:
   3563    build arguments, call the callback and check the arguments,
   3564    put the result into newpos and return the replacement string, which
   3565    has to be freed by the caller */
   3566 static PyObject *unicode_encode_call_errorhandler(const char *errors,
   3567                                                   PyObject **errorHandler,
   3568                                                   const char *encoding, const char *reason,
   3569                                                   const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
   3570                                                   Py_ssize_t startpos, Py_ssize_t endpos,
   3571                                                   Py_ssize_t *newpos)
   3572 {
   3573     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
   3574 
   3575     PyObject *restuple;
   3576     PyObject *resunicode;
   3577 
   3578     if (*errorHandler == NULL) {
   3579         *errorHandler = PyCodec_LookupError(errors);
   3580         if (*errorHandler == NULL)
   3581             return NULL;
   3582     }
   3583 
   3584     make_encode_exception(exceptionObject,
   3585                           encoding, unicode, size, startpos, endpos, reason);
   3586     if (*exceptionObject == NULL)
   3587         return NULL;
   3588 
   3589     restuple = PyObject_CallFunctionObjArgs(
   3590         *errorHandler, *exceptionObject, NULL);
   3591     if (restuple == NULL)
   3592         return NULL;
   3593     if (!PyTuple_Check(restuple)) {
   3594         PyErr_SetString(PyExc_TypeError, &argparse[4]);
   3595         Py_DECREF(restuple);
   3596         return NULL;
   3597     }
   3598     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
   3599                           &resunicode, newpos)) {
   3600         Py_DECREF(restuple);
   3601         return NULL;
   3602     }
   3603     if (*newpos<0)
   3604         *newpos = size+*newpos;
   3605     if (*newpos<0 || *newpos>size) {
   3606         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
   3607         Py_DECREF(restuple);
   3608         return NULL;
   3609     }
   3610     Py_INCREF(resunicode);
   3611     Py_DECREF(restuple);
   3612     return resunicode;
   3613 }
   3614 
   3615 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
   3616                                      Py_ssize_t size,
   3617                                      const char *errors,
   3618                                      int limit)
   3619 {
   3620     /* output object */
   3621     PyObject *res;
   3622     /* pointers to the beginning and end+1 of input */
   3623     const Py_UNICODE *startp = p;
   3624     const Py_UNICODE *endp = p + size;
   3625     /* pointer to the beginning of the unencodable characters */
   3626     /* const Py_UNICODE *badp = NULL; */
   3627     /* pointer into the output */
   3628     char *str;
   3629     /* current output position */
   3630     Py_ssize_t respos = 0;
   3631     Py_ssize_t ressize;
   3632     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
   3633     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
   3634     PyObject *errorHandler = NULL;
   3635     PyObject *exc = NULL;
   3636     /* the following variable is used for caching string comparisons
   3637      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
   3638     int known_errorHandler = -1;
   3639 
   3640     /* allocate enough for a simple encoding without
   3641        replacements, if we need more, we'll resize */
   3642     res = PyString_FromStringAndSize(NULL, size);
   3643     if (res == NULL)
   3644         goto onError;
   3645     if (size == 0)
   3646         return res;
   3647     str = PyString_AS_STRING(res);
   3648     ressize = size;
   3649 
   3650     while (p<endp) {
   3651         Py_UNICODE c = *p;
   3652 
   3653         /* can we encode this? */
   3654         if (c<limit) {
   3655             /* no overflow check, because we know that the space is enough */
   3656             *str++ = (char)c;
   3657             ++p;
   3658         }
   3659         else {
   3660             Py_ssize_t unicodepos = p-startp;
   3661             Py_ssize_t requiredsize;
   3662             PyObject *repunicode;
   3663             Py_ssize_t repsize;
   3664             Py_ssize_t newpos;
   3665             Py_ssize_t respos;
   3666             Py_UNICODE *uni2;
   3667             /* startpos for collecting unencodable chars */
   3668             const Py_UNICODE *collstart = p;
   3669             const Py_UNICODE *collend = p;
   3670             /* find all unecodable characters */
   3671             while ((collend < endp) && ((*collend) >= limit))
   3672                 ++collend;
   3673             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
   3674             if (known_errorHandler==-1) {
   3675                 if ((errors==NULL) || (!strcmp(errors, "strict")))
   3676                     known_errorHandler = 1;
   3677                 else if (!strcmp(errors, "replace"))
   3678                     known_errorHandler = 2;
   3679                 else if (!strcmp(errors, "ignore"))
   3680                     known_errorHandler = 3;
   3681                 else if (!strcmp(errors, "xmlcharrefreplace"))
   3682                     known_errorHandler = 4;
   3683                 else
   3684                     known_errorHandler = 0;
   3685             }
   3686             switch (known_errorHandler) {
   3687             case 1: /* strict */
   3688                 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
   3689                 goto onError;
   3690             case 2: /* replace */
   3691                 while (collstart++ < collend)
   3692                     *str++ = '?'; /* fall through */
   3693             case 3: /* ignore */
   3694                 p = collend;
   3695                 break;
   3696             case 4: /* xmlcharrefreplace */
   3697                 respos = str - PyString_AS_STRING(res);
   3698                 /* determine replacement size (temporarily (mis)uses p) */
   3699                 requiredsize = respos;
   3700                 for (p = collstart; p < collend;) {
   3701                     Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
   3702                     Py_ssize_t incr;
   3703                     if (ch < 10)
   3704                         incr = 2+1+1;
   3705                     else if (ch < 100)
   3706                         incr = 2+2+1;
   3707                     else if (ch < 1000)
   3708                         incr = 2+3+1;
   3709                     else if (ch < 10000)
   3710                         incr = 2+4+1;
   3711                     else if (ch < 100000)
   3712                         incr = 2+5+1;
   3713                     else if (ch < 1000000)
   3714                         incr = 2+6+1;
   3715                     else
   3716                         incr = 2+7+1;
   3717                     if (requiredsize > PY_SSIZE_T_MAX - incr)
   3718                         goto overflow;
   3719                     requiredsize += incr;
   3720                 }
   3721                 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
   3722                     goto overflow;
   3723                 requiredsize += endp - collend;
   3724                 if (requiredsize > ressize) {
   3725                     if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
   3726                         requiredsize = 2*ressize;
   3727                     if (_PyString_Resize(&res, requiredsize))
   3728                         goto onError;
   3729                     str = PyString_AS_STRING(res) + respos;
   3730                     ressize = requiredsize;
   3731                 }
   3732                 /* generate replacement (temporarily (mis)uses p) */
   3733                 for (p = collstart; p < collend;) {
   3734                     Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
   3735                     str += sprintf(str, "&#%d;", (int)ch);
   3736                 }
   3737                 p = collend;
   3738                 break;
   3739             default:
   3740                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
   3741                                                               encoding, reason, startp, size, &exc,
   3742                                                               collstart-startp, collend-startp, &newpos);
   3743                 if (repunicode == NULL)
   3744                     goto onError;
   3745                 /* need more space? (at least enough for what we have+the
   3746                    replacement+the rest of the string, so we won't have to
   3747                    check space for encodable characters) */
   3748                 respos = str - PyString_AS_STRING(res);
   3749                 repsize = PyUnicode_GET_SIZE(repunicode);
   3750                 if (respos > PY_SSIZE_T_MAX - repsize)
   3751                     goto overflow;
   3752                 requiredsize = respos + repsize;
   3753                 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
   3754                     goto overflow;
   3755                 requiredsize += endp - collend;
   3756                 if (requiredsize > ressize) {
   3757                     if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
   3758                         requiredsize = 2*ressize;
   3759                     if (_PyString_Resize(&res, requiredsize)) {
   3760                         Py_DECREF(repunicode);
   3761                         goto onError;
   3762                     }
   3763                     str = PyString_AS_STRING(res) + respos;
   3764                     ressize = requiredsize;
   3765                 }
   3766                 /* check if there is anything unencodable in the replacement
   3767                    and copy it to the output */
   3768                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) {
   3769                     c = *uni2;
   3770                     if (c >= limit) {
   3771                         raise_encode_exception(&exc, encoding, startp, size,
   3772                                                unicodepos, unicodepos+1, reason);
   3773                         Py_DECREF(repunicode);
   3774                         goto onError;
   3775                     }
   3776                     *str = (char)c;
   3777                 }
   3778                 p = startp + newpos;
   3779                 Py_DECREF(repunicode);
   3780             }
   3781         }
   3782     }
   3783     /* Resize if we allocated to much */
   3784     respos = str - PyString_AS_STRING(res);
   3785     if (respos < ressize)
   3786         /* If this falls res will be NULL */
   3787         _PyString_Resize(&res, respos);
   3788     Py_XDECREF(errorHandler);
   3789     Py_XDECREF(exc);
   3790     return res;
   3791 
   3792   overflow:
   3793     PyErr_SetString(PyExc_OverflowError,
   3794                     "encoded result is too long for a Python string");
   3795 
   3796   onError:
   3797     Py_XDECREF(res);
   3798     Py_XDECREF(errorHandler);
   3799     Py_XDECREF(exc);
   3800     return NULL;
   3801 }
   3802 
   3803 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
   3804                                  Py_ssize_t size,
   3805                                  const char *errors)
   3806 {
   3807     return unicode_encode_ucs1(p, size, errors, 256);
   3808 }
   3809 
   3810 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
   3811 {
   3812     if (!PyUnicode_Check(unicode)) {
   3813         PyErr_BadArgument();
   3814         return NULL;
   3815     }
   3816     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
   3817                                   PyUnicode_GET_SIZE(unicode),
   3818                                   NULL);
   3819 }
   3820 
   3821 /* --- 7-bit ASCII Codec -------------------------------------------------- */
   3822 
   3823 PyObject *PyUnicode_DecodeASCII(const char *s,
   3824                                 Py_ssize_t size,
   3825                                 const char *errors)
   3826 {
   3827     const char *starts = s;
   3828     PyUnicodeObject *v;
   3829     Py_UNICODE *p;
   3830     Py_ssize_t startinpos;
   3831     Py_ssize_t endinpos;
   3832     Py_ssize_t outpos;
   3833     const char *e;
   3834     PyObject *errorHandler = NULL;
   3835     PyObject *exc = NULL;
   3836 
   3837     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
   3838     if (size == 1 && *(unsigned char*)s < 128) {
   3839         Py_UNICODE r = *(unsigned char*)s;
   3840         return PyUnicode_FromUnicode(&r, 1);
   3841     }
   3842 
   3843     v = _PyUnicode_New(size);
   3844     if (v == NULL)
   3845         goto onError;
   3846     if (size == 0)
   3847         return (PyObject *)v;
   3848     p = PyUnicode_AS_UNICODE(v);
   3849     e = s + size;
   3850     while (s < e) {
   3851         register unsigned char c = (unsigned char)*s;
   3852         if (c < 128) {
   3853             *p++ = c;
   3854             ++s;
   3855         }
   3856         else {
   3857             startinpos = s-starts;
   3858             endinpos = startinpos + 1;
   3859             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
   3860             if (unicode_decode_call_errorhandler(
   3861                     errors, &errorHandler,
   3862                     "ascii", "ordinal not in range(128)",
   3863                     starts, size, &startinpos, &endinpos, &exc, &s,
   3864                     &v, &outpos, &p))
   3865                 goto onError;
   3866         }
   3867     }
   3868     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
   3869         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
   3870             goto onError;
   3871     Py_XDECREF(errorHandler);
   3872     Py_XDECREF(exc);
   3873     return (PyObject *)v;
   3874 
   3875   onError:
   3876     Py_XDECREF(v);
   3877     Py_XDECREF(errorHandler);
   3878     Py_XDECREF(exc);
   3879     return NULL;
   3880 }
   3881 
   3882 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
   3883                                 Py_ssize_t size,
   3884                                 const char *errors)
   3885 {
   3886     return unicode_encode_ucs1(p, size, errors, 128);
   3887 }
   3888 
   3889 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
   3890 {
   3891     if (!PyUnicode_Check(unicode)) {
   3892         PyErr_BadArgument();
   3893         return NULL;
   3894     }
   3895     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
   3896                                  PyUnicode_GET_SIZE(unicode),
   3897                                  NULL);
   3898 }
   3899 
   3900 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
   3901 
   3902 /* --- MBCS codecs for Windows -------------------------------------------- */
   3903 
   3904 #if SIZEOF_INT < SIZEOF_SIZE_T
   3905 #define NEED_RETRY
   3906 #endif
   3907 
   3908 /* XXX This code is limited to "true" double-byte encodings, as
   3909    a) it assumes an incomplete character consists of a single byte, and
   3910    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
   3911    encodings, see IsDBCSLeadByteEx documentation. */
   3912 
   3913 static int is_dbcs_lead_byte(const char *s, int offset)
   3914 {
   3915     const char *curr = s + offset;
   3916 
   3917     if (IsDBCSLeadByte(*curr)) {
   3918         const char *prev = CharPrev(s, curr);
   3919         return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
   3920     }
   3921     return 0;
   3922 }
   3923 
   3924 /*
   3925  * Decode MBCS string into unicode object. If 'final' is set, converts
   3926  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
   3927  */
   3928 static int decode_mbcs(PyUnicodeObject **v,
   3929                        const char *s, /* MBCS string */
   3930                        int size, /* sizeof MBCS string */
   3931                        int final)
   3932 {
   3933     Py_UNICODE *p;
   3934     Py_ssize_t n = 0;
   3935     int usize = 0;
   3936 
   3937     assert(size >= 0);
   3938 
   3939     /* Skip trailing lead-byte unless 'final' is set */
   3940     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
   3941         --size;
   3942 
   3943     /* First get the size of the result */
   3944     if (size > 0) {
   3945         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
   3946         if (usize == 0) {
   3947             PyErr_SetFromWindowsErrWithFilename(0, NULL);
   3948             return -1;
   3949         }
   3950     }
   3951 
   3952     if (*v == NULL) {
   3953         /* Create unicode object */
   3954         *v = _PyUnicode_New(usize);
   3955         if (*v == NULL)
   3956             return -1;
   3957     }
   3958     else {
   3959         /* Extend unicode object */
   3960         n = PyUnicode_GET_SIZE(*v);
   3961         if (_PyUnicode_Resize(v, n + usize) < 0)
   3962             return -1;
   3963     }
   3964 
   3965     /* Do the conversion */
   3966     if (size > 0) {
   3967         p = PyUnicode_AS_UNICODE(*v) + n;
   3968         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
   3969             PyErr_SetFromWindowsErrWithFilename(0, NULL);
   3970             return -1;
   3971         }
   3972     }
   3973 
   3974     return size;
   3975 }
   3976 
   3977 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
   3978                                        Py_ssize_t size,
   3979                                        const char *errors,
   3980                                        Py_ssize_t *consumed)
   3981 {
   3982     PyUnicodeObject *v = NULL;
   3983     int done;
   3984 
   3985     if (consumed)
   3986         *consumed = 0;
   3987 
   3988 #ifdef NEED_RETRY
   3989   retry:
   3990     if (size > INT_MAX)
   3991         done = decode_mbcs(&v, s, INT_MAX, 0);
   3992     else
   3993 #endif
   3994         done = decode_mbcs(&v, s, (int)size, !consumed);
   3995 
   3996     if (done < 0) {
   3997         Py_XDECREF(v);
   3998         return NULL;
   3999     }
   4000 
   4001     if (consumed)
   4002         *consumed += done;
   4003 
   4004 #ifdef NEED_RETRY
   4005     if (size > INT_MAX) {
   4006         s += done;
   4007         size -= done;
   4008         goto retry;
   4009     }
   4010 #endif
   4011 
   4012     return (PyObject *)v;
   4013 }
   4014 
   4015 PyObject *PyUnicode_DecodeMBCS(const char *s,
   4016                                Py_ssize_t size,
   4017                                const char *errors)
   4018 {
   4019     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
   4020 }
   4021 
   4022 /*
   4023  * Convert unicode into string object (MBCS).
   4024  * Returns 0 if succeed, -1 otherwise.
   4025  */
   4026 static int encode_mbcs(PyObject **repr,
   4027                        const Py_UNICODE *p, /* unicode */
   4028                        int size) /* size of unicode */
   4029 {
   4030     int mbcssize = 0;
   4031     Py_ssize_t n = 0;
   4032 
   4033     assert(size >= 0);
   4034 
   4035     /* First get the size of the result */
   4036     if (size > 0) {
   4037         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
   4038         if (mbcssize == 0) {
   4039             PyErr_SetFromWindowsErrWithFilename(0, NULL);
   4040             return -1;
   4041         }
   4042     }
   4043 
   4044     if (*repr == NULL) {
   4045         /* Create string object */
   4046         *repr = PyString_FromStringAndSize(NULL, mbcssize);
   4047         if (*repr == NULL)
   4048             return -1;
   4049     }
   4050     else {
   4051         /* Extend string object */
   4052         n = PyString_Size(*repr);
   4053         if (_PyString_Resize(repr, n + mbcssize) < 0)
   4054             return -1;
   4055     }
   4056 
   4057     /* Do the conversion */
   4058     if (size > 0) {
   4059         char *s = PyString_AS_STRING(*repr) + n;
   4060         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
   4061             PyErr_SetFromWindowsErrWithFilename(0, NULL);
   4062             return -1;
   4063         }
   4064     }
   4065 
   4066     return 0;
   4067 }
   4068 
   4069 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
   4070                                Py_ssize_t size,
   4071                                const char *errors)
   4072 {
   4073     PyObject *repr = NULL;
   4074     int ret;
   4075 
   4076 #ifdef NEED_RETRY
   4077   retry:
   4078     if (size > INT_MAX)
   4079         ret = encode_mbcs(&repr, p, INT_MAX);
   4080     else
   4081 #endif
   4082         ret = encode_mbcs(&repr, p, (int)size);
   4083 
   4084     if (ret < 0) {
   4085         Py_XDECREF(repr);
   4086         return NULL;
   4087     }
   4088 
   4089 #ifdef NEED_RETRY
   4090     if (size > INT_MAX) {
   4091         p += INT_MAX;
   4092         size -= INT_MAX;
   4093         goto retry;
   4094     }
   4095 #endif
   4096 
   4097     return repr;
   4098 }
   4099 
   4100 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
   4101 {
   4102     if (!PyUnicode_Check(unicode)) {
   4103         PyErr_BadArgument();
   4104         return NULL;
   4105     }
   4106     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
   4107                                 PyUnicode_GET_SIZE(unicode),
   4108                                 NULL);
   4109 }
   4110 
   4111 #undef NEED_RETRY
   4112 
   4113 #endif /* MS_WINDOWS */
   4114 
   4115 /* --- Character Mapping Codec -------------------------------------------- */
   4116 
   4117 PyObject *PyUnicode_DecodeCharmap(const char *s,
   4118                                   Py_ssize_t size,
   4119                                   PyObject *mapping,
   4120                                   const char *errors)
   4121 {
   4122     const char *starts = s;
   4123     Py_ssize_t startinpos;
   4124     Py_ssize_t endinpos;
   4125     Py_ssize_t outpos;
   4126     const char *e;
   4127     PyUnicodeObject *v;
   4128     Py_UNICODE *p;
   4129     Py_ssize_t extrachars = 0;
   4130     PyObject *errorHandler = NULL;
   4131     PyObject *exc = NULL;
   4132     Py_UNICODE *mapstring = NULL;
   4133     Py_ssize_t maplen = 0;
   4134 
   4135     /* Default to Latin-1 */
   4136     if (mapping == NULL)
   4137         return PyUnicode_DecodeLatin1(s, size, errors);
   4138 
   4139     v = _PyUnicode_New(size);
   4140     if (v == NULL)
   4141         goto onError;
   4142     if (size == 0)
   4143         return (PyObject *)v;
   4144     p = PyUnicode_AS_UNICODE(v);
   4145     e = s + size;
   4146     if (PyUnicode_CheckExact(mapping)) {
   4147         mapstring = PyUnicode_AS_UNICODE(mapping);
   4148         maplen = PyUnicode_GET_SIZE(mapping);
   4149         while (s < e) {
   4150             unsigned char ch = *s;
   4151             Py_UNICODE x = 0xfffe; /* illegal value */
   4152 
   4153             if (ch < maplen)
   4154                 x = mapstring[ch];
   4155 
   4156             if (x == 0xfffe) {
   4157                 /* undefined mapping */
   4158                 outpos = p-PyUnicode_AS_UNICODE(v);
   4159                 startinpos = s-starts;
   4160                 endinpos = startinpos+1;
   4161                 if (unicode_decode_call_errorhandler(
   4162                         errors, &errorHandler,
   4163                         "charmap", "character maps to <undefined>",
   4164                         starts, size, &startinpos, &endinpos, &exc, &s,
   4165                         &v, &outpos, &p)) {
   4166                     goto onError;
   4167                 }
   4168                 continue;
   4169             }
   4170             *p++ = x;
   4171             ++s;
   4172         }
   4173     }
   4174     else {
   4175         while (s < e) {
   4176             unsigned char ch = *s;
   4177             PyObject *w, *x;
   4178 
   4179             /* Get mapping (char ordinal -> integer, Unicode char or None) */
   4180             w = PyInt_FromLong((long)ch);
   4181             if (w == NULL)
   4182                 goto onError;
   4183             x = PyObject_GetItem(mapping, w);
   4184             Py_DECREF(w);
   4185             if (x == NULL) {
   4186                 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
   4187                     /* No mapping found means: mapping is undefined. */
   4188                     PyErr_Clear();
   4189                     goto Undefined;
   4190                 } else
   4191                     goto onError;
   4192             }
   4193 
   4194             /* Apply mapping */
   4195             if (x == Py_None)
   4196                 goto Undefined;
   4197             if (PyInt_Check(x)) {
   4198                 long value = PyInt_AS_LONG(x);
   4199                 if (value == 0xFFFE)
   4200                     goto Undefined;
   4201                 if (value < 0 || value > 0x10FFFF) {
   4202                     PyErr_SetString(PyExc_TypeError,
   4203                                     "character mapping must be in range(0x110000)");
   4204                     Py_DECREF(x);
   4205                     goto onError;
   4206                 }
   4207 
   4208 #ifndef Py_UNICODE_WIDE
   4209                 if (value > 0xFFFF) {
   4210                     /* see the code for 1-n mapping below */
   4211                     if (extrachars < 2) {
   4212                         /* resize first */
   4213                         Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
   4214                         Py_ssize_t needed = 10 - extrachars;
   4215                         extrachars += needed;
   4216                         /* XXX overflow detection missing */
   4217                         if (_PyUnicode_Resize(&v,
   4218                                               PyUnicode_GET_SIZE(v) + needed) < 0) {
   4219                             Py_DECREF(x);
   4220                             goto onError;
   4221                         }
   4222                         p = PyUnicode_AS_UNICODE(v) + oldpos;
   4223                     }
   4224                     value -= 0x10000;
   4225                     *p++ = 0xD800 | (value >> 10);
   4226                     *p++ = 0xDC00 | (value & 0x3FF);
   4227                     extrachars -= 2;
   4228                 }
   4229                 else
   4230 #endif
   4231                 *p++ = (Py_UNICODE)value;
   4232             }
   4233             else if (PyUnicode_Check(x)) {
   4234                 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
   4235 
   4236                 if (targetsize == 1) {
   4237                     /* 1-1 mapping */
   4238                     Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
   4239                     if (value == 0xFFFE)
   4240                         goto Undefined;
   4241                     *p++ = value;
   4242                 }
   4243                 else if (targetsize > 1) {
   4244                     /* 1-n mapping */
   4245                     if (targetsize > extrachars) {
   4246                         /* resize first */
   4247                         Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
   4248                         Py_ssize_t needed = (targetsize - extrachars) + \
   4249                             (targetsize << 2);
   4250                         extrachars += needed;
   4251                         /* XXX overflow detection missing */
   4252                         if (_PyUnicode_Resize(&v,
   4253                                               PyUnicode_GET_SIZE(v) + needed) < 0) {
   4254                             Py_DECREF(x);
   4255                             goto onError;
   4256                         }
   4257                         p = PyUnicode_AS_UNICODE(v) + oldpos;
   4258                     }
   4259                     Py_UNICODE_COPY(p,
   4260                                     PyUnicode_AS_UNICODE(x),
   4261                                     targetsize);
   4262                     p += targetsize;
   4263                     extrachars -= targetsize;
   4264                 }
   4265                 /* 1-0 mapping: skip the character */
   4266             }
   4267             else {
   4268                 /* wrong return value */
   4269                 PyErr_SetString(PyExc_TypeError,
   4270                                 "character mapping must return integer, None or unicode");
   4271                 Py_DECREF(x);
   4272                 goto onError;
   4273             }
   4274             Py_DECREF(x);
   4275             ++s;
   4276             continue;
   4277 Undefined:
   4278             /* undefined mapping */
   4279             Py_XDECREF(x);
   4280             outpos = p-PyUnicode_AS_UNICODE(v);
   4281             startinpos = s-starts;
   4282             endinpos = startinpos+1;
   4283             if (unicode_decode_call_errorhandler(
   4284                     errors, &errorHandler,
   4285                     "charmap", "character maps to <undefined>",
   4286                     starts, size, &startinpos, &endinpos, &exc, &s,
   4287                     &v, &outpos, &p)) {
   4288                 goto onError;
   4289             }
   4290         }
   4291     }
   4292     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
   4293         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
   4294             goto onError;
   4295     Py_XDECREF(errorHandler);
   4296     Py_XDECREF(exc);
   4297     return (PyObject *)v;
   4298 
   4299   onError:
   4300     Py_XDECREF(errorHandler);
   4301     Py_XDECREF(exc);
   4302     Py_XDECREF(v);
   4303     return NULL;
   4304 }
   4305 
   4306 /* Charmap encoding: the lookup table */
   4307 
   4308 struct encoding_map{
   4309     PyObject_HEAD
   4310     unsigned char level1[32];
   4311     int count2, count3;
   4312     unsigned char level23[1];
   4313 };
   4314 
   4315 static PyObject*
   4316 encoding_map_size(PyObject *obj, PyObject* args)
   4317 {
   4318     struct encoding_map *map = (struct encoding_map*)obj;
   4319     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
   4320                           128*map->count3);
   4321 }
   4322 
   4323 static PyMethodDef encoding_map_methods[] = {
   4324     {"size", encoding_map_size, METH_NOARGS,
   4325      PyDoc_STR("Return the size (in bytes) of this object") },
   4326     { 0 }
   4327 };
   4328 
   4329 static void
   4330 encoding_map_dealloc(PyObject* o)
   4331 {
   4332     PyObject_FREE(o);
   4333 }
   4334 
   4335 static PyTypeObject EncodingMapType = {
   4336     PyVarObject_HEAD_INIT(NULL, 0)
   4337     "EncodingMap",          /*tp_name*/
   4338     sizeof(struct encoding_map),   /*tp_basicsize*/
   4339     0,                      /*tp_itemsize*/
   4340     /* methods */
   4341     encoding_map_dealloc,   /*tp_dealloc*/
   4342     0,                      /*tp_print*/
   4343     0,                      /*tp_getattr*/
   4344     0,                      /*tp_setattr*/
   4345     0,                      /*tp_compare*/
   4346     0,                      /*tp_repr*/
   4347     0,                      /*tp_as_number*/
   4348     0,                      /*tp_as_sequence*/
   4349     0,                      /*tp_as_mapping*/
   4350     0,                      /*tp_hash*/
   4351     0,                      /*tp_call*/
   4352     0,                      /*tp_str*/
   4353     0,                      /*tp_getattro*/
   4354     0,                      /*tp_setattro*/
   4355     0,                      /*tp_as_buffer*/
   4356     Py_TPFLAGS_DEFAULT,     /*tp_flags*/
   4357     0,                      /*tp_doc*/
   4358     0,                      /*tp_traverse*/
   4359     0,                      /*tp_clear*/
   4360     0,                      /*tp_richcompare*/
   4361     0,                      /*tp_weaklistoffset*/
   4362     0,                      /*tp_iter*/
   4363     0,                      /*tp_iternext*/
   4364     encoding_map_methods,   /*tp_methods*/
   4365     0,                      /*tp_members*/
   4366     0,                      /*tp_getset*/
   4367     0,                      /*tp_base*/
   4368     0,                      /*tp_dict*/
   4369     0,                      /*tp_descr_get*/
   4370     0,                      /*tp_descr_set*/
   4371     0,                      /*tp_dictoffset*/
   4372     0,                      /*tp_init*/
   4373     0,                      /*tp_alloc*/
   4374     0,                      /*tp_new*/
   4375     0,                      /*tp_free*/
   4376     0,                      /*tp_is_gc*/
   4377 };
   4378 
   4379 PyObject*
   4380 PyUnicode_BuildEncodingMap(PyObject* string)
   4381 {
   4382     Py_UNICODE *decode;
   4383     PyObject *result;
   4384     struct encoding_map *mresult;
   4385     int i;
   4386     int need_dict = 0;
   4387     unsigned char level1[32];
   4388     unsigned char level2[512];
   4389     unsigned char *mlevel1, *mlevel2, *mlevel3;
   4390     int count2 = 0, count3 = 0;
   4391 
   4392     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
   4393         PyErr_BadArgument();
   4394         return NULL;
   4395     }
   4396     decode = PyUnicode_AS_UNICODE(string);
   4397     memset(level1, 0xFF, sizeof level1);
   4398     memset(level2, 0xFF, sizeof level2);
   4399 
   4400     /* If there isn't a one-to-one mapping of NULL to \0,
   4401        or if there are non-BMP characters, we need to use
   4402        a mapping dictionary. */
   4403     if (decode[0] != 0)
   4404         need_dict = 1;
   4405     for (i = 1; i < 256; i++) {
   4406         int l1, l2;
   4407         if (decode[i] == 0
   4408 #ifdef Py_UNICODE_WIDE
   4409             || decode[i] > 0xFFFF
   4410 #endif
   4411             ) {
   4412             need_dict = 1;
   4413             break;
   4414         }
   4415         if (decode[i] == 0xFFFE)
   4416             /* unmapped character */
   4417             continue;
   4418         l1 = decode[i] >> 11;
   4419         l2 = decode[i] >> 7;
   4420         if (level1[l1] == 0xFF)
   4421             level1[l1] = count2++;
   4422         if (level2[l2] == 0xFF)
   4423             level2[l2] = count3++;
   4424     }
   4425 
   4426     if (count2 >= 0xFF || count3 >= 0xFF)
   4427         need_dict = 1;
   4428 
   4429     if (need_dict) {
   4430         PyObject *result = PyDict_New();
   4431         PyObject *key, *value;
   4432         if (!result)
   4433             return NULL;
   4434         for (i = 0; i < 256; i++) {
   4435             value = NULL;
   4436             key = PyInt_FromLong(decode[i]);
   4437             value = PyInt_FromLong(i);
   4438             if (!key || !value)
   4439                 goto failed1;
   4440             if (PyDict_SetItem(result, key, value) == -1)
   4441                 goto failed1;
   4442             Py_DECREF(key);
   4443             Py_DECREF(value);
   4444         }
   4445         return result;
   4446       failed1:
   4447         Py_XDECREF(key);
   4448         Py_XDECREF(value);
   4449         Py_DECREF(result);
   4450         return NULL;
   4451     }
   4452 
   4453     /* Create a three-level trie */
   4454     result = PyObject_MALLOC(sizeof(struct encoding_map) +
   4455                              16*count2 + 128*count3 - 1);
   4456     if (!result)
   4457         return PyErr_NoMemory();
   4458     PyObject_Init(result, &EncodingMapType);
   4459     mresult = (struct encoding_map*)result;
   4460     mresult->count2 = count2;
   4461     mresult->count3 = count3;
   4462     mlevel1 = mresult->level1;
   4463     mlevel2 = mresult->level23;
   4464     mlevel3 = mresult->level23 + 16*count2;
   4465     memcpy(mlevel1, level1, 32);
   4466     memset(mlevel2, 0xFF, 16*count2);
   4467     memset(mlevel3, 0, 128*count3);
   4468     count3 = 0;
   4469     for (i = 1; i < 256; i++) {
   4470         int o1, o2, o3, i2, i3;
   4471         if (decode[i] == 0xFFFE)
   4472             /* unmapped character */
   4473             continue;
   4474         o1 = decode[i]>>11;
   4475         o2 = (decode[i]>>7) & 0xF;
   4476         i2 = 16*mlevel1[o1] + o2;
   4477         if (mlevel2[i2] == 0xFF)
   4478             mlevel2[i2] = count3++;
   4479         o3 = decode[i] & 0x7F;
   4480         i3 = 128*mlevel2[i2] + o3;
   4481         mlevel3[i3] = i;
   4482     }
   4483     return result;
   4484 }
   4485 
   4486 static int
   4487 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
   4488 {
   4489     struct encoding_map *map = (struct encoding_map*)mapping;
   4490     int l1 = c>>11;
   4491     int l2 = (c>>7) & 0xF;
   4492     int l3 = c & 0x7F;
   4493     int i;
   4494 
   4495 #ifdef Py_UNICODE_WIDE
   4496     if (c > 0xFFFF) {
   4497         return -1;
   4498     }
   4499 #endif
   4500     if (c == 0)
   4501         return 0;
   4502     /* level 1*/
   4503     i = map->level1[l1];
   4504     if (i == 0xFF) {
   4505         return -1;
   4506     }
   4507     /* level 2*/
   4508     i = map->level23[16*i+l2];
   4509     if (i == 0xFF) {
   4510         return -1;
   4511     }
   4512     /* level 3 */
   4513     i = map->level23[16*map->count2 + 128*i + l3];
   4514     if (i == 0) {
   4515         return -1;
   4516     }
   4517     return i;
   4518 }
   4519 
   4520 /* Lookup the character ch in the mapping. If the character
   4521    can't be found, Py_None is returned (or NULL, if another
   4522    error occurred). */
   4523 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
   4524 {
   4525     PyObject *w = PyInt_FromLong((long)c);
   4526     PyObject *x;
   4527 
   4528     if (w == NULL)
   4529         return NULL;
   4530     x = PyObject_GetItem(mapping, w);
   4531     Py_DECREF(w);
   4532     if (x == NULL) {
   4533         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
   4534             /* No mapping found means: mapping is undefined. */
   4535             PyErr_Clear();
   4536             x = Py_None;
   4537             Py_INCREF(x);
   4538             return x;
   4539         } else
   4540             return NULL;
   4541     }
   4542     else if (x == Py_None)
   4543         return x;
   4544     else if (PyInt_Check(x)) {
   4545         long value = PyInt_AS_LONG(x);
   4546         if (value < 0 || value > 255) {
   4547             PyErr_SetString(PyExc_TypeError,
   4548                             "character mapping must be in range(256)");
   4549             Py_DECREF(x);
   4550             return NULL;
   4551         }
   4552         return x;
   4553     }
   4554     else if (PyString_Check(x))
   4555         return x;
   4556     else {
   4557         /* wrong return value */
   4558         PyErr_SetString(PyExc_TypeError,
   4559                         "character mapping must return integer, None or str");
   4560         Py_DECREF(x);
   4561         return NULL;
   4562     }
   4563 }
   4564 
   4565 static int
   4566 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
   4567 {
   4568     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
   4569     /* exponentially overallocate to minimize reallocations */
   4570     if (requiredsize < 2*outsize)
   4571         requiredsize = 2*outsize;
   4572     if (_PyString_Resize(outobj, requiredsize)) {
   4573         return 0;
   4574     }
   4575     return 1;
   4576 }
   4577 
   4578 typedef enum charmapencode_result {
   4579     enc_SUCCESS, enc_FAILED, enc_EXCEPTION
   4580 }charmapencode_result;
   4581 /* lookup the character, put the result in the output string and adjust
   4582    various state variables. Reallocate the output string if not enough
   4583    space is available. Return a new reference to the object that
   4584    was put in the output buffer, or Py_None, if the mapping was undefined
   4585    (in which case no character was written) or NULL, if a
   4586    reallocation error occurred. The caller must decref the result */
   4587 static
   4588 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
   4589                                           PyObject **outobj, Py_ssize_t *outpos)
   4590 {
   4591     PyObject *rep;
   4592     char *outstart;
   4593     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
   4594 
   4595     if (Py_TYPE(mapping) == &EncodingMapType) {
   4596         int res = encoding_map_lookup(c, mapping);
   4597         Py_ssize_t requiredsize = *outpos+1;
   4598         if (res == -1)
   4599             return enc_FAILED;
   4600         if (outsize<requiredsize)
   4601             if (!charmapencode_resize(outobj, outpos, requiredsize))
   4602                 return enc_EXCEPTION;
   4603         outstart = PyString_AS_STRING(*outobj);
   4604         outstart[(*outpos)++] = (char)res;
   4605         return enc_SUCCESS;
   4606     }
   4607 
   4608     rep = charmapencode_lookup(c, mapping);
   4609     if (rep==NULL)
   4610         return enc_EXCEPTION;
   4611     else if (rep==Py_None) {
   4612         Py_DECREF(rep);
   4613         return enc_FAILED;
   4614     } else {
   4615         if (PyInt_Check(rep)) {
   4616             Py_ssize_t requiredsize = *outpos+1;
   4617             if (outsize<requiredsize)
   4618                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
   4619                     Py_DECREF(rep);
   4620                     return enc_EXCEPTION;
   4621                 }
   4622             outstart = PyString_AS_STRING(*outobj);
   4623             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
   4624         }
   4625         else {
   4626             const char *repchars = PyString_AS_STRING(rep);
   4627             Py_ssize_t repsize = PyString_GET_SIZE(rep);
   4628             Py_ssize_t requiredsize = *outpos+repsize;
   4629             if (outsize<requiredsize)
   4630                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
   4631                     Py_DECREF(rep);
   4632                     return enc_EXCEPTION;
   4633                 }
   4634             outstart = PyString_AS_STRING(*outobj);
   4635             memcpy(outstart + *outpos, repchars, repsize);
   4636             *outpos += repsize;
   4637         }
   4638     }
   4639     Py_DECREF(rep);
   4640     return enc_SUCCESS;
   4641 }
   4642 
   4643 /* handle an error in PyUnicode_EncodeCharmap
   4644    Return 0 on success, -1 on error */
   4645 static
   4646 int charmap_encoding_error(
   4647     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
   4648     PyObject **exceptionObject,
   4649     int *known_errorHandler, PyObject **errorHandler, const char *errors,
   4650     PyObject **res, Py_ssize_t *respos)
   4651 {
   4652     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
   4653     Py_ssize_t repsize;
   4654     Py_ssize_t newpos;
   4655     Py_UNICODE *uni2;
   4656     /* startpos for collecting unencodable chars */
   4657     Py_ssize_t collstartpos = *inpos;
   4658     Py_ssize_t collendpos = *inpos+1;
   4659     Py_ssize_t collpos;
   4660     char *encoding = "charmap";
   4661     char *reason = "character maps to <undefined>";
   4662     charmapencode_result x;
   4663 
   4664     /* find all unencodable characters */
   4665     while (collendpos < size) {
   4666         PyObject *rep;
   4667         if (Py_TYPE(mapping) == &EncodingMapType) {
   4668             int res = encoding_map_lookup(p[collendpos], mapping);
   4669             if (res != -1)
   4670                 break;
   4671             ++collendpos;
   4672             continue;
   4673         }
   4674 
   4675         rep = charmapencode_lookup(p[collendpos], mapping);
   4676         if (rep==NULL)
   4677             return -1;
   4678         else if (rep!=Py_None) {
   4679             Py_DECREF(rep);
   4680             break;
   4681         }
   4682         Py_DECREF(rep);
   4683         ++collendpos;
   4684     }
   4685     /* cache callback name lookup
   4686      * (if not done yet, i.e. it's the first error) */
   4687     if (*known_errorHandler==-1) {
   4688         if ((errors==NULL) || (!strcmp(errors, "strict")))
   4689             *known_errorHandler = 1;
   4690         else if (!strcmp(errors, "replace"))
   4691             *known_errorHandler = 2;
   4692         else if (!strcmp(errors, "ignore"))
   4693             *known_errorHandler = 3;
   4694         else if (!strcmp(errors, "xmlcharrefreplace"))
   4695             *known_errorHandler = 4;
   4696         else
   4697             *known_errorHandler = 0;
   4698     }
   4699     switch (*known_errorHandler) {
   4700     case 1: /* strict */
   4701         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
   4702         return -1;
   4703     case 2: /* replace */
   4704         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
   4705             x = charmapencode_output('?', mapping, res, respos);
   4706             if (x==enc_EXCEPTION) {
   4707                 return -1;
   4708             }
   4709             else if (x==enc_FAILED) {
   4710                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
   4711                 return -1;
   4712             }
   4713         }
   4714         /* fall through */
   4715     case 3: /* ignore */
   4716         *inpos = collendpos;
   4717         break;
   4718     case 4: /* xmlcharrefreplace */
   4719         /* generate replacement */
   4720         for (collpos = collstartpos; collpos < collendpos;) {
   4721             char buffer[2+29+1+1];
   4722             char *cp;
   4723             Py_UCS4 ch = p[collpos++];
   4724 #ifndef Py_UNICODE_WIDE
   4725             if ((0xD800 <= ch && ch <= 0xDBFF) &&
   4726                 (collpos < collendpos) &&
   4727                 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
   4728                 ch = ((((ch & 0x03FF) << 10) |
   4729                        ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
   4730             }
   4731 #endif
   4732             sprintf(buffer, "&#%d;", (int)ch);
   4733             for (cp = buffer; *cp; ++cp) {
   4734                 x = charmapencode_output(*cp, mapping, res, respos);
   4735                 if (x==enc_EXCEPTION)
   4736                     return -1;
   4737                 else if (x==enc_FAILED) {
   4738                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
   4739                     return -1;
   4740                 }
   4741             }
   4742         }
   4743         *inpos = collendpos;
   4744         break;
   4745     default:
   4746         repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
   4747                                                       encoding, reason, p, size, exceptionObject,
   4748                                                       collstartpos, collendpos, &newpos);
   4749         if (repunicode == NULL)
   4750             return -1;
   4751         /* generate replacement  */
   4752         repsize = PyUnicode_GET_SIZE(repunicode);
   4753         for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
   4754             x = charmapencode_output(*uni2, mapping, res, respos);
   4755             if (x==enc_EXCEPTION) {
   4756                 return -1;
   4757             }
   4758             else if (x==enc_FAILED) {
   4759                 Py_DECREF(repunicode);
   4760                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
   4761                 return -1;
   4762             }
   4763         }
   4764         *inpos = newpos;
   4765         Py_DECREF(repunicode);
   4766     }
   4767     return 0;
   4768 }
   4769 
   4770 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
   4771                                   Py_ssize_t size,
   4772                                   PyObject *mapping,
   4773                                   const char *errors)
   4774 {
   4775     /* output object */
   4776     PyObject *res = NULL;
   4777     /* current input position */
   4778     Py_ssize_t inpos = 0;
   4779     /* current output position */
   4780     Py_ssize_t respos = 0;
   4781     PyObject *errorHandler = NULL;
   4782     PyObject *exc = NULL;
   4783     /* the following variable is used for caching string comparisons
   4784      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
   4785      * 3=ignore, 4=xmlcharrefreplace */
   4786     int known_errorHandler = -1;
   4787 
   4788     /* Default to Latin-1 */
   4789     if (mapping == NULL)
   4790         return PyUnicode_EncodeLatin1(p, size, errors);
   4791 
   4792     /* allocate enough for a simple encoding without
   4793        replacements, if we need more, we'll resize */
   4794     res = PyString_FromStringAndSize(NULL, size);
   4795     if (res == NULL)
   4796         goto onError;
   4797     if (size == 0)
   4798         return res;
   4799 
   4800     while (inpos<size) {
   4801         /* try to encode it */
   4802         charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
   4803         if (x==enc_EXCEPTION) /* error */
   4804             goto onError;
   4805         if (x==enc_FAILED) { /* unencodable character */
   4806             if (charmap_encoding_error(p, size, &inpos, mapping,
   4807                                        &exc,
   4808                                        &known_errorHandler, &errorHandler, errors,
   4809                                        &res, &respos)) {
   4810                 goto onError;
   4811             }
   4812         }
   4813         else
   4814             /* done with this character => adjust input position */
   4815             ++inpos;
   4816     }
   4817 
   4818     /* Resize if we allocated to much */
   4819     if (respos<PyString_GET_SIZE(res)) {
   4820         if (_PyString_Resize(&res, respos))
   4821             goto onError;
   4822     }
   4823     Py_XDECREF(exc);
   4824     Py_XDECREF(errorHandler);
   4825     return res;
   4826 
   4827   onError:
   4828     Py_XDECREF(res);
   4829     Py_XDECREF(exc);
   4830     Py_XDECREF(errorHandler);
   4831     return NULL;
   4832 }
   4833 
   4834 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
   4835                                     PyObject *mapping)
   4836 {
   4837     if (!PyUnicode_Check(unicode) || mapping == NULL) {
   4838         PyErr_BadArgument();
   4839         return NULL;
   4840     }
   4841     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
   4842                                    PyUnicode_GET_SIZE(unicode),
   4843                                    mapping,
   4844                                    NULL);
   4845 }
   4846 
   4847 /* create or adjust a UnicodeTranslateError */
   4848 static void make_translate_exception(PyObject **exceptionObject,
   4849                                      const Py_UNICODE *unicode, Py_ssize_t size,
   4850                                      Py_ssize_t startpos, Py_ssize_t endpos,
   4851                                      const char *reason)
   4852 {
   4853     if (*exceptionObject == NULL) {
   4854         *exceptionObject = PyUnicodeTranslateError_Create(
   4855             unicode, size, startpos, endpos, reason);
   4856     }
   4857     else {
   4858         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
   4859             goto onError;
   4860         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
   4861             goto onError;
   4862         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
   4863             goto onError;
   4864         return;
   4865       onError:
   4866         Py_CLEAR(*exceptionObject);
   4867     }
   4868 }
   4869 
   4870 /* raises a UnicodeTranslateError */
   4871 static void raise_translate_exception(PyObject **exceptionObject,
   4872                                       const Py_UNICODE *unicode, Py_ssize_t size,
   4873                                       Py_ssize_t startpos, Py_ssize_t endpos,
   4874                                       const char *reason)
   4875 {
   4876     make_translate_exception(exceptionObject,
   4877                              unicode, size, startpos, endpos, reason);
   4878     if (*exceptionObject != NULL)
   4879         PyCodec_StrictErrors(*exceptionObject);
   4880 }
   4881 
   4882 /* error handling callback helper:
   4883    build arguments, call the callback and check the arguments,
   4884    put the result into newpos and return the replacement string, which
   4885    has to be freed by the caller */
   4886 static PyObject *unicode_translate_call_errorhandler(const char *errors,
   4887                                                      PyObject **errorHandler,
   4888                                                      const char *reason,
   4889                                                      const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
   4890                                                      Py_ssize_t startpos, Py_ssize_t endpos,
   4891                                                      Py_ssize_t *newpos)
   4892 {
   4893     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
   4894 
   4895     Py_ssize_t i_newpos;
   4896     PyObject *restuple;
   4897     PyObject *resunicode;
   4898 
   4899     if (*errorHandler == NULL) {
   4900         *errorHandler = PyCodec_LookupError(errors);
   4901         if (*errorHandler == NULL)
   4902             return NULL;
   4903     }
   4904 
   4905     make_translate_exception(exceptionObject,
   4906                              unicode, size, startpos, endpos, reason);
   4907     if (*exceptionObject == NULL)
   4908         return NULL;
   4909 
   4910     restuple = PyObject_CallFunctionObjArgs(
   4911         *errorHandler, *exceptionObject, NULL);
   4912     if (restuple == NULL)
   4913         return NULL;
   4914     if (!PyTuple_Check(restuple)) {
   4915         PyErr_SetString(PyExc_TypeError, &argparse[4]);
   4916         Py_DECREF(restuple);
   4917         return NULL;
   4918     }
   4919     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
   4920                           &resunicode, &i_newpos)) {
   4921         Py_DECREF(restuple);
   4922         return NULL;
   4923     }
   4924     if (i_newpos<0)
   4925         *newpos = size+i_newpos;
   4926     else
   4927         *newpos = i_newpos;
   4928     if (*newpos<0 || *newpos>size) {
   4929         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
   4930         Py_DECREF(restuple);
   4931         return NULL;
   4932     }
   4933     Py_INCREF(resunicode);
   4934     Py_DECREF(restuple);
   4935     return resunicode;
   4936 }
   4937 
   4938 /* Lookup the character ch in the mapping and put the result in result,
   4939    which must be decrefed by the caller.
   4940    Return 0 on success, -1 on error */
   4941 static
   4942 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
   4943 {
   4944     PyObject *w = PyInt_FromLong((long)c);
   4945     PyObject *x;
   4946 
   4947     if (w == NULL)
   4948         return -1;
   4949     x = PyObject_GetItem(mapping, w);
   4950     Py_DECREF(w);
   4951     if (x == NULL) {
   4952         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
   4953             /* No mapping found means: use 1:1 mapping. */
   4954             PyErr_Clear();
   4955             *result = NULL;
   4956             return 0;
   4957         } else
   4958             return -1;
   4959     }
   4960     else if (x == Py_None) {
   4961         *result = x;
   4962         return 0;
   4963     }
   4964     else if (PyInt_Check(x)) {
   4965         long value = PyInt_AS_LONG(x);
   4966         long max = PyUnicode_GetMax();
   4967         if (value < 0 || value > max) {
   4968             PyErr_Format(PyExc_TypeError,
   4969                          "character mapping must be in range(0x%lx)", max+1);
   4970             Py_DECREF(x);
   4971             return -1;
   4972         }
   4973         *result = x;
   4974         return 0;
   4975     }
   4976     else if (PyUnicode_Check(x)) {
   4977         *result = x;
   4978         return 0;
   4979     }
   4980     else {
   4981         /* wrong return value */
   4982         PyErr_SetString(PyExc_TypeError,
   4983                         "character mapping must return integer, None or unicode");
   4984         Py_DECREF(x);
   4985         return -1;
   4986     }
   4987 }
   4988 /* ensure that *outobj is at least requiredsize characters long,
   4989    if not reallocate and adjust various state variables.
   4990    Return 0 on success, -1 on error */
   4991 static
   4992 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
   4993                                Py_ssize_t requiredsize)
   4994 {
   4995     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
   4996     if (requiredsize > oldsize) {
   4997         /* remember old output position */
   4998         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
   4999         /* exponentially overallocate to minimize reallocations */
   5000         if (requiredsize < 2 * oldsize)
   5001             requiredsize = 2 * oldsize;
   5002         if (PyUnicode_Resize(outobj, requiredsize) < 0)
   5003             return -1;
   5004         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
   5005     }
   5006     return 0;
   5007 }
   5008 /* lookup the character, put the result in the output string and adjust
   5009    various state variables. Return a new reference to the object that
   5010    was put in the output buffer in *result, or Py_None, if the mapping was
   5011    undefined (in which case no character was written).
   5012    The called must decref result.
   5013    Return 0 on success, -1 on error. */
   5014 static
   5015 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
   5016                             Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
   5017                             PyObject **res)
   5018 {
   5019     if (charmaptranslate_lookup(*curinp, mapping, res))
   5020         return -1;
   5021     if (*res==NULL) {
   5022         /* not found => default to 1:1 mapping */
   5023         *(*outp)++ = *curinp;
   5024     }
   5025     else if (*res==Py_None)
   5026         ;
   5027     else if (PyInt_Check(*res)) {
   5028         /* no overflow check, because we know that the space is enough */
   5029         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
   5030     }
   5031     else if (PyUnicode_Check(*res)) {
   5032         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
   5033         if (repsize==1) {
   5034             /* no overflow check, because we know that the space is enough */
   5035             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
   5036         }
   5037         else if (repsize!=0) {
   5038             /* more than one character */
   5039             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
   5040                 (insize - (curinp-startinp)) +
   5041                 repsize - 1;
   5042             if (charmaptranslate_makespace(outobj, outp, requiredsize))
   5043                 return -1;
   5044             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
   5045             *outp += repsize;
   5046         }
   5047     }
   5048     else
   5049         return -1;
   5050     return 0;
   5051 }
   5052 
   5053 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
   5054                                      Py_ssize_t size,
   5055                                      PyObject *mapping,
   5056                                      const char *errors)
   5057 {
   5058     /* output object */
   5059     PyObject *res = NULL;
   5060     /* pointers to the beginning and end+1 of input */
   5061     const Py_UNICODE *startp = p;
   5062     const Py_UNICODE *endp = p + size;
   5063     /* pointer into the output */
   5064     Py_UNICODE *str;
   5065     /* current output position */
   5066     Py_ssize_t respos = 0;
   5067     char *reason = "character maps to <undefined>";
   5068     PyObject *errorHandler = NULL;
   5069     PyObject *exc = NULL;
   5070     /* the following variable is used for caching string comparisons
   5071      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
   5072      * 3=ignore, 4=xmlcharrefreplace */
   5073     int known_errorHandler = -1;
   5074 
   5075     if (mapping == NULL) {
   5076         PyErr_BadArgument();
   5077         return NULL;
   5078     }
   5079 
   5080     /* allocate enough for a simple 1:1 translation without
   5081        replacements, if we need more, we'll resize */
   5082     res = PyUnicode_FromUnicode(NULL, size);
   5083     if (res == NULL)
   5084         goto onError;
   5085     if (size == 0)
   5086         return res;
   5087     str = PyUnicode_AS_UNICODE(res);
   5088 
   5089     while (p<endp) {
   5090         /* try to encode it */
   5091         PyObject *x = NULL;
   5092         if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
   5093             Py_XDECREF(x);
   5094             goto onError;
   5095         }
   5096         Py_XDECREF(x);
   5097         if (x!=Py_None) /* it worked => adjust input pointer */
   5098             ++p;
   5099         else { /* untranslatable character */
   5100             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
   5101             Py_ssize_t repsize;
   5102             Py_ssize_t newpos;
   5103             Py_UNICODE *uni2;
   5104             /* startpos for collecting untranslatable chars */
   5105             const Py_UNICODE *collstart = p;
   5106             const Py_UNICODE *collend = p+1;
   5107             const Py_UNICODE *coll;
   5108 
   5109             /* find all untranslatable characters */
   5110             while (collend < endp) {
   5111                 if (charmaptranslate_lookup(*collend, mapping, &x))
   5112                     goto onError;
   5113                 Py_XDECREF(x);
   5114                 if (x!=Py_None)
   5115                     break;
   5116                 ++collend;
   5117             }
   5118             /* cache callback name lookup
   5119              * (if not done yet, i.e. it's the first error) */
   5120             if (known_errorHandler==-1) {
   5121                 if ((errors==NULL) || (!strcmp(errors, "strict")))
   5122                     known_errorHandler = 1;
   5123                 else if (!strcmp(errors, "replace"))
   5124                     known_errorHandler = 2;
   5125                 else if (!strcmp(errors, "ignore"))
   5126                     known_errorHandler = 3;
   5127                 else if (!strcmp(errors, "xmlcharrefreplace"))
   5128                     known_errorHandler = 4;
   5129                 else
   5130                     known_errorHandler = 0;
   5131             }
   5132             switch (known_errorHandler) {
   5133             case 1: /* strict */
   5134                 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
   5135                 goto onError;
   5136             case 2: /* replace */
   5137                 /* No need to check for space, this is a 1:1 replacement */
   5138                 for (coll = collstart; coll<collend; ++coll)
   5139                     *str++ = '?';
   5140                 /* fall through */
   5141             case 3: /* ignore */
   5142                 p = collend;
   5143                 break;
   5144             case 4: /* xmlcharrefreplace */
   5145                 /* generate replacement (temporarily (mis)uses p) */
   5146                 for (p = collstart; p < collend;) {
   5147                     char buffer[2+29+1+1];
   5148                     char *cp;
   5149                     Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
   5150                     sprintf(buffer, "&#%d;", (int)ch);
   5151                     if (charmaptranslate_makespace(&res, &str,
   5152                                                    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
   5153                         goto onError;
   5154                     for (cp = buffer; *cp; ++cp)
   5155                         *str++ = *cp;
   5156                 }
   5157                 p = collend;
   5158                 break;
   5159             default:
   5160                 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
   5161                                                                  reason, startp, size, &exc,
   5162                                                                  collstart-startp, collend-startp, &newpos);
   5163                 if (repunicode == NULL)
   5164                     goto onError;
   5165                 /* generate replacement  */
   5166                 repsize = PyUnicode_GET_SIZE(repunicode);
   5167                 if (charmaptranslate_makespace(&res, &str,
   5168                                                (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
   5169                     Py_DECREF(repunicode);
   5170                     goto onError;
   5171                 }
   5172                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
   5173                     *str++ = *uni2;
   5174                 p = startp + newpos;
   5175                 Py_DECREF(repunicode);
   5176             }
   5177         }
   5178     }
   5179     /* Resize if we allocated to much */
   5180     respos = str-PyUnicode_AS_UNICODE(res);
   5181     if (respos<PyUnicode_GET_SIZE(res)) {
   5182         if (PyUnicode_Resize(&res, respos) < 0)
   5183             goto onError;
   5184     }
   5185     Py_XDECREF(exc);
   5186     Py_XDECREF(errorHandler);
   5187     return res;
   5188 
   5189   onError:
   5190     Py_XDECREF(res);
   5191     Py_XDECREF(exc);
   5192     Py_XDECREF(errorHandler);
   5193     return NULL;
   5194 }
   5195 
   5196 PyObject *PyUnicode_Translate(PyObject *str,
   5197                               PyObject *mapping,
   5198                               const char *errors)
   5199 {
   5200     PyObject *result;
   5201 
   5202     str = PyUnicode_FromObject(str);
   5203     if (str == NULL)
   5204         goto onError;
   5205     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
   5206                                         PyUnicode_GET_SIZE(str),
   5207                                         mapping,
   5208                                         errors);
   5209     Py_DECREF(str);
   5210     return result;
   5211 
   5212   onError:
   5213     Py_XDECREF(str);
   5214     return NULL;
   5215 }
   5216 
   5217 /* --- Decimal Encoder ---------------------------------------------------- */
   5218 
   5219 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
   5220                             Py_ssize_t length,
   5221                             char *output,
   5222                             const char *errors)
   5223 {
   5224     Py_UNICODE *p, *end;
   5225     PyObject *errorHandler = NULL;
   5226     PyObject *exc = NULL;
   5227     const char *encoding = "decimal";
   5228     const char *reason = "invalid decimal Unicode string";
   5229     /* the following variable is used for caching string comparisons
   5230      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
   5231     int known_errorHandler = -1;
   5232 
   5233     if (output == NULL) {
   5234         PyErr_BadArgument();
   5235         return -1;
   5236     }
   5237 
   5238     p = s;
   5239     end = s + length;
   5240     while (p < end) {
   5241         register Py_UNICODE ch = *p;
   5242         int decimal;
   5243         PyObject *repunicode;
   5244         Py_ssize_t repsize;
   5245         Py_ssize_t newpos;
   5246         Py_UNICODE *uni2;
   5247         Py_UNICODE *collstart;
   5248         Py_UNICODE *collend;
   5249 
   5250         if (Py_UNICODE_ISSPACE(ch)) {
   5251             *output++ = ' ';
   5252             ++p;
   5253             continue;
   5254         }
   5255         decimal = Py_UNICODE_TODECIMAL(ch);
   5256         if (decimal >= 0) {
   5257             *output++ = '0' + decimal;
   5258             ++p;
   5259             continue;
   5260         }
   5261         if (0 < ch && ch < 256) {
   5262             *output++ = (char)ch;
   5263             ++p;
   5264             continue;
   5265         }
   5266         /* All other characters are considered unencodable */
   5267         collstart = p;
   5268         for (collend = p+1; collend < end; collend++) {
   5269             if ((0 < *collend && *collend < 256) ||
   5270                 Py_UNICODE_ISSPACE(*collend) ||
   5271                 0 <= Py_UNICODE_TODECIMAL(*collend))
   5272                 break;
   5273         }
   5274         /* cache callback name lookup
   5275          * (if not done yet, i.e. it's the first error) */
   5276         if (known_errorHandler==-1) {
   5277             if ((errors==NULL) || (!strcmp(errors, "strict")))
   5278                 known_errorHandler = 1;
   5279             else if (!strcmp(errors, "replace"))
   5280                 known_errorHandler = 2;
   5281             else if (!strcmp(errors, "ignore"))
   5282                 known_errorHandler = 3;
   5283             else if (!strcmp(errors, "xmlcharrefreplace"))
   5284                 known_errorHandler = 4;
   5285             else
   5286                 known_errorHandler = 0;
   5287         }
   5288         switch (known_errorHandler) {
   5289         case 1: /* strict */
   5290             raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
   5291             goto onError;
   5292         case 2: /* replace */
   5293             for (p = collstart; p < collend; ++p)
   5294                 *output++ = '?';
   5295             /* fall through */
   5296         case 3: /* ignore */
   5297             p = collend;
   5298             break;
   5299         case 4: /* xmlcharrefreplace */
   5300             /* generate replacement (temporarily (mis)uses p) */
   5301             for (p = collstart; p < collend;) {
   5302                 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
   5303                 output += sprintf(output, "&#%d;", ch);
   5304             }
   5305             p = collend;
   5306             break;
   5307         default:
   5308             repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
   5309                                                           encoding, reason, s, length, &exc,
   5310                                                           collstart-s, collend-s, &newpos);
   5311             if (repunicode == NULL)
   5312                 goto onError;
   5313             /* generate replacement  */
   5314             repsize = PyUnicode_GET_SIZE(repunicode);
   5315             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
   5316                 Py_UNICODE ch = *uni2;
   5317                 if (Py_UNICODE_ISSPACE(ch))
   5318                     *output++ = ' ';
   5319                 else {
   5320                     decimal = Py_UNICODE_TODECIMAL(ch);
   5321                     if (decimal >= 0)
   5322                         *output++ = '0' + decimal;
   5323                     else if (0 < ch && ch < 256)
   5324                         *output++ = (char)ch;
   5325                     else {
   5326                         Py_DECREF(repunicode);
   5327                         raise_encode_exception(&exc, encoding,
   5328                                                s, length, collstart-s, collend-s, reason);
   5329                         goto onError;
   5330                     }
   5331                 }
   5332             }
   5333             p = s + newpos;
   5334             Py_DECREF(repunicode);
   5335         }
   5336     }
   5337     /* 0-terminate the output string */
   5338     *output++ = '\0';
   5339     Py_XDECREF(exc);
   5340     Py_XDECREF(errorHandler);
   5341     return 0;
   5342 
   5343   onError:
   5344     Py_XDECREF(exc);
   5345     Py_XDECREF(errorHandler);
   5346     return -1;
   5347 }
   5348 
   5349 /* --- Helpers ------------------------------------------------------------ */
   5350 
   5351 #include "stringlib/unicodedefs.h"
   5352 #include "stringlib/fastsearch.h"
   5353 
   5354 #include "stringlib/count.h"
   5355 #include "stringlib/find.h"
   5356 #include "stringlib/partition.h"
   5357 #include "stringlib/split.h"
   5358 
   5359 /* helper macro to fixup start/end slice values */
   5360 #define ADJUST_INDICES(start, end, len)         \
   5361     if (end > len)                              \
   5362         end = len;                              \
   5363     else if (end < 0) {                         \
   5364         end += len;                             \
   5365         if (end < 0)                            \
   5366             end = 0;                            \
   5367     }                                           \
   5368     if (start < 0) {                            \
   5369         start += len;                           \
   5370         if (start < 0)                          \
   5371             start = 0;                          \
   5372     }
   5373 
   5374 Py_ssize_t PyUnicode_Count(PyObject *str,
   5375                            PyObject *substr,
   5376                            Py_ssize_t start,
   5377                            Py_ssize_t end)
   5378 {
   5379     Py_ssize_t result;
   5380     PyUnicodeObject* str_obj;
   5381     PyUnicodeObject* sub_obj;
   5382 
   5383     str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
   5384     if (!str_obj)
   5385         return -1;
   5386     sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
   5387     if (!sub_obj) {
   5388         Py_DECREF(str_obj);
   5389         return -1;
   5390     }
   5391 
   5392     ADJUST_INDICES(start, end, str_obj->length);
   5393     result = stringlib_count(
   5394         str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
   5395         PY_SSIZE_T_MAX
   5396         );
   5397 
   5398     Py_DECREF(sub_obj);
   5399     Py_DECREF(str_obj);
   5400 
   5401     return result;
   5402 }
   5403 
   5404 Py_ssize_t PyUnicode_Find(PyObject *str,
   5405                           PyObject *sub,
   5406                           Py_ssize_t start,
   5407                           Py_ssize_t end,
   5408                           int direction)
   5409 {
   5410     Py_ssize_t result;
   5411 
   5412     str = PyUnicode_FromObject(str);
   5413     if (!str)
   5414         return -2;
   5415     sub = PyUnicode_FromObject(sub);
   5416     if (!sub) {
   5417         Py_DECREF(str);
   5418         return -2;
   5419     }
   5420 
   5421     if (direction > 0)
   5422         result = stringlib_find_slice(
   5423             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
   5424             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
   5425             start, end
   5426             );
   5427     else
   5428         result = stringlib_rfind_slice(
   5429             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
   5430             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
   5431             start, end
   5432             );
   5433 
   5434     Py_DECREF(str);
   5435     Py_DECREF(sub);
   5436 
   5437     return result;
   5438 }
   5439 
   5440 static
   5441 int tailmatch(PyUnicodeObject *self,
   5442               PyUnicodeObject *substring,
   5443               Py_ssize_t start,
   5444               Py_ssize_t end,
   5445               int direction)
   5446 {
   5447     if (substring->length == 0)
   5448         return 1;
   5449 
   5450     ADJUST_INDICES(start, end, self->length);
   5451     end -= substring->length;
   5452     if (end < start)
   5453         return 0;
   5454 
   5455     if (direction > 0) {
   5456         if (Py_UNICODE_MATCH(self, end, substring))
   5457             return 1;
   5458     } else {
   5459         if (Py_UNICODE_MATCH(self, start, substring))
   5460             return 1;
   5461     }
   5462 
   5463     return 0;
   5464 }
   5465 
   5466 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
   5467                                PyObject *substr,
   5468                                Py_ssize_t start,
   5469                                Py_ssize_t end,
   5470                                int direction)
   5471 {
   5472     Py_ssize_t result;
   5473 
   5474     str = PyUnicode_FromObject(str);
   5475     if (str == NULL)
   5476         return -1;
   5477     substr = PyUnicode_FromObject(substr);
   5478     if (substr == NULL) {
   5479         Py_DECREF(str);
   5480         return -1;
   5481     }
   5482 
   5483     result = tailmatch((PyUnicodeObject *)str,
   5484                        (PyUnicodeObject *)substr,
   5485                        start, end, direction);
   5486     Py_DECREF(str);
   5487     Py_DECREF(substr);
   5488     return result;
   5489 }
   5490 
   5491 /* Apply fixfct filter to the Unicode object self and return a
   5492    reference to the modified object */
   5493 
   5494 static
   5495 PyObject *fixup(PyUnicodeObject *self,
   5496                 int (*fixfct)(PyUnicodeObject *s))
   5497 {
   5498 
   5499     PyUnicodeObject *u;
   5500 
   5501     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
   5502     if (u == NULL)
   5503         return NULL;
   5504 
   5505     Py_UNICODE_COPY(u->str, self->str, self->length);
   5506 
   5507     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
   5508         /* fixfct should return TRUE if it modified the buffer. If
   5509            FALSE, return a reference to the original buffer instead
   5510            (to save space, not time) */
   5511         Py_INCREF(self);
   5512         Py_DECREF(u);
   5513         return (PyObject*) self;
   5514     }
   5515     return (PyObject*) u;
   5516 }
   5517 
   5518 static
   5519 int fixupper(PyUnicodeObject *self)
   5520 {
   5521     Py_ssize_t len = self->length;
   5522     Py_UNICODE *s = self->str;
   5523     int status = 0;
   5524 
   5525     while (len-- > 0) {
   5526         register Py_UNICODE ch;
   5527 
   5528         ch = Py_UNICODE_TOUPPER(*s);
   5529         if (ch != *s) {
   5530             status = 1;
   5531             *s = ch;
   5532         }
   5533         s++;
   5534     }
   5535 
   5536     return status;
   5537 }
   5538 
   5539 static
   5540 int fixlower(PyUnicodeObject *self)
   5541 {
   5542     Py_ssize_t len = self->length;
   5543     Py_UNICODE *s = self->str;
   5544     int status = 0;
   5545 
   5546     while (len-- > 0) {
   5547         register Py_UNICODE ch;
   5548 
   5549         ch = Py_UNICODE_TOLOWER(*s);
   5550         if (ch != *s) {
   5551             status = 1;
   5552             *s = ch;
   5553         }
   5554         s++;
   5555     }
   5556 
   5557     return status;
   5558 }
   5559 
   5560 static
   5561 int fixswapcase(PyUnicodeObject *self)
   5562 {
   5563     Py_ssize_t len = self->length;
   5564     Py_UNICODE *s = self->str;
   5565     int status = 0;
   5566 
   5567     while (len-- > 0) {
   5568         if (Py_UNICODE_ISUPPER(*s)) {
   5569             *s = Py_UNICODE_TOLOWER(*s);
   5570             status = 1;
   5571         } else if (Py_UNICODE_ISLOWER(*s)) {
   5572             *s = Py_UNICODE_TOUPPER(*s);
   5573             status = 1;
   5574         }
   5575         s++;
   5576     }
   5577 
   5578     return status;
   5579 }
   5580 
   5581 static
   5582 int fixcapitalize(PyUnicodeObject *self)
   5583 {
   5584     Py_ssize_t len = self->length;
   5585     Py_UNICODE *s = self->str;
   5586     int status = 0;
   5587 
   5588     if (len == 0)
   5589         return 0;
   5590     if (!Py_UNICODE_ISUPPER(*s)) {
   5591         *s = Py_UNICODE_TOUPPER(*s);
   5592         status = 1;
   5593     }
   5594     s++;
   5595     while (--len > 0) {
   5596         if (!Py_UNICODE_ISLOWER(*s)) {
   5597             *s = Py_UNICODE_TOLOWER(*s);
   5598             status = 1;
   5599         }
   5600         s++;
   5601     }
   5602     return status;
   5603 }
   5604 
   5605 static
   5606 int fixtitle(PyUnicodeObject *self)
   5607 {
   5608     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   5609     register Py_UNICODE *e;
   5610     int previous_is_cased;
   5611 
   5612     /* Shortcut for single character strings */
   5613     if (PyUnicode_GET_SIZE(self) == 1) {
   5614         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
   5615         if (*p != ch) {
   5616             *p = ch;
   5617             return 1;
   5618         }
   5619         else
   5620             return 0;
   5621     }
   5622 
   5623     e = p + PyUnicode_GET_SIZE(self);
   5624     previous_is_cased = 0;
   5625     for (; p < e; p++) {
   5626         register const Py_UNICODE ch = *p;
   5627 
   5628         if (previous_is_cased)
   5629             *p = Py_UNICODE_TOLOWER(ch);
   5630         else
   5631             *p = Py_UNICODE_TOTITLE(ch);
   5632 
   5633         if (Py_UNICODE_ISLOWER(ch) ||
   5634             Py_UNICODE_ISUPPER(ch) ||
   5635             Py_UNICODE_ISTITLE(ch))
   5636             previous_is_cased = 1;
   5637         else
   5638             previous_is_cased = 0;
   5639     }
   5640     return 1;
   5641 }
   5642 
   5643 PyObject *
   5644 PyUnicode_Join(PyObject *separator, PyObject *seq)
   5645 {
   5646     PyObject *internal_separator = NULL;
   5647     const Py_UNICODE blank = ' ';
   5648     const Py_UNICODE *sep = &blank;
   5649     Py_ssize_t seplen = 1;
   5650     PyUnicodeObject *res = NULL; /* the result */
   5651     Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
   5652     Py_ssize_t res_used;         /* # used bytes */
   5653     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
   5654     PyObject *fseq;          /* PySequence_Fast(seq) */
   5655     Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
   5656     PyObject *item;
   5657     Py_ssize_t i;
   5658 
   5659     fseq = PySequence_Fast(seq, "can only join an iterable");
   5660     if (fseq == NULL) {
   5661         return NULL;
   5662     }
   5663 
   5664     /* Grrrr.  A codec may be invoked to convert str objects to
   5665      * Unicode, and so it's possible to call back into Python code
   5666      * during PyUnicode_FromObject(), and so it's possible for a sick
   5667      * codec to change the size of fseq (if seq is a list).  Therefore
   5668      * we have to keep refetching the size -- can't assume seqlen
   5669      * is invariant.
   5670      */
   5671     seqlen = PySequence_Fast_GET_SIZE(fseq);
   5672     /* If empty sequence, return u"". */
   5673     if (seqlen == 0) {
   5674         res = _PyUnicode_New(0);  /* empty sequence; return u"" */
   5675         goto Done;
   5676     }
   5677     /* If singleton sequence with an exact Unicode, return that. */
   5678     if (seqlen == 1) {
   5679         item = PySequence_Fast_GET_ITEM(fseq, 0);
   5680         if (PyUnicode_CheckExact(item)) {
   5681             Py_INCREF(item);
   5682             res = (PyUnicodeObject *)item;
   5683             goto Done;
   5684         }
   5685     }
   5686 
   5687     /* At least two items to join, or one that isn't exact Unicode. */
   5688     if (seqlen > 1) {
   5689         /* Set up sep and seplen -- they're needed. */
   5690         if (separator == NULL) {
   5691             sep = &blank;
   5692             seplen = 1;
   5693         }
   5694         else {
   5695             internal_separator = PyUnicode_FromObject(separator);
   5696             if (internal_separator == NULL)
   5697                 goto onError;
   5698             sep = PyUnicode_AS_UNICODE(internal_separator);
   5699             seplen = PyUnicode_GET_SIZE(internal_separator);
   5700             /* In case PyUnicode_FromObject() mutated seq. */
   5701             seqlen = PySequence_Fast_GET_SIZE(fseq);
   5702         }
   5703     }
   5704 
   5705     /* Get space. */
   5706     res = _PyUnicode_New(res_alloc);
   5707     if (res == NULL)
   5708         goto onError;
   5709     res_p = PyUnicode_AS_UNICODE(res);
   5710     res_used = 0;
   5711 
   5712     for (i = 0; i < seqlen; ++i) {
   5713         Py_ssize_t itemlen;
   5714         Py_ssize_t new_res_used;
   5715 
   5716         item = PySequence_Fast_GET_ITEM(fseq, i);
   5717         /* Convert item to Unicode. */
   5718         if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
   5719             PyErr_Format(PyExc_TypeError,
   5720                          "sequence item %zd: expected string or Unicode,"
   5721                          " %.80s found",
   5722                          i, Py_TYPE(item)->tp_name);
   5723             goto onError;
   5724         }
   5725         item = PyUnicode_FromObject(item);
   5726         if (item == NULL)
   5727             goto onError;
   5728         /* We own a reference to item from here on. */
   5729 
   5730         /* In case PyUnicode_FromObject() mutated seq. */
   5731         seqlen = PySequence_Fast_GET_SIZE(fseq);
   5732 
   5733         /* Make sure we have enough space for the separator and the item. */
   5734         itemlen = PyUnicode_GET_SIZE(item);
   5735         new_res_used = res_used + itemlen;
   5736         if (new_res_used < 0)
   5737             goto Overflow;
   5738         if (i < seqlen - 1) {
   5739             new_res_used += seplen;
   5740             if (new_res_used < 0)
   5741                 goto Overflow;
   5742         }
   5743         if (new_res_used > res_alloc) {
   5744             /* double allocated size until it's big enough */
   5745             do {
   5746                 res_alloc += res_alloc;
   5747                 if (res_alloc <= 0)
   5748                     goto Overflow;
   5749             } while (new_res_used > res_alloc);
   5750             if (_PyUnicode_Resize(&res, res_alloc) < 0) {
   5751                 Py_DECREF(item);
   5752                 goto onError;
   5753             }
   5754             res_p = PyUnicode_AS_UNICODE(res) + res_used;
   5755         }
   5756 
   5757         /* Copy item, and maybe the separator. */
   5758         Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
   5759         res_p += itemlen;
   5760         if (i < seqlen - 1) {
   5761             Py_UNICODE_COPY(res_p, sep, seplen);
   5762             res_p += seplen;
   5763         }
   5764         Py_DECREF(item);
   5765         res_used = new_res_used;
   5766     }
   5767 
   5768     /* Shrink res to match the used area; this probably can't fail,
   5769      * but it's cheap to check.
   5770      */
   5771     if (_PyUnicode_Resize(&res, res_used) < 0)
   5772         goto onError;
   5773 
   5774   Done:
   5775     Py_XDECREF(internal_separator);
   5776     Py_DECREF(fseq);
   5777     return (PyObject *)res;
   5778 
   5779   Overflow:
   5780     PyErr_SetString(PyExc_OverflowError,
   5781                     "join() result is too long for a Python string");
   5782     Py_DECREF(item);
   5783     /* fall through */
   5784 
   5785   onError:
   5786     Py_XDECREF(internal_separator);
   5787     Py_DECREF(fseq);
   5788     Py_XDECREF(res);
   5789     return NULL;
   5790 }
   5791 
   5792 static
   5793 PyUnicodeObject *pad(PyUnicodeObject *self,
   5794                      Py_ssize_t left,
   5795                      Py_ssize_t right,
   5796                      Py_UNICODE fill)
   5797 {
   5798     PyUnicodeObject *u;
   5799 
   5800     if (left < 0)
   5801         left = 0;
   5802     if (right < 0)
   5803         right = 0;
   5804 
   5805     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
   5806         Py_INCREF(self);
   5807         return self;
   5808     }
   5809 
   5810     if (left > PY_SSIZE_T_MAX - self->length ||
   5811         right > PY_SSIZE_T_MAX - (left + self->length)) {
   5812         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
   5813         return NULL;
   5814     }
   5815     u = _PyUnicode_New(left + self->length + right);
   5816     if (u) {
   5817         if (left)
   5818             Py_UNICODE_FILL(u->str, fill, left);
   5819         Py_UNICODE_COPY(u->str + left, self->str, self->length);
   5820         if (right)
   5821             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
   5822     }
   5823 
   5824     return u;
   5825 }
   5826 
   5827 PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
   5828 {
   5829     PyObject *list;
   5830 
   5831     string = PyUnicode_FromObject(string);
   5832     if (string == NULL)
   5833         return NULL;
   5834 
   5835     list = stringlib_splitlines(
   5836         (PyObject*) string, PyUnicode_AS_UNICODE(string),
   5837         PyUnicode_GET_SIZE(string), keepends);
   5838 
   5839     Py_DECREF(string);
   5840     return list;
   5841 }
   5842 
   5843 static
   5844 PyObject *split(PyUnicodeObject *self,
   5845                 PyUnicodeObject *substring,
   5846                 Py_ssize_t maxcount)
   5847 {
   5848     if (maxcount < 0)
   5849         maxcount = PY_SSIZE_T_MAX;
   5850 
   5851     if (substring == NULL)
   5852         return stringlib_split_whitespace(
   5853             (PyObject*) self,  self->str, self->length, maxcount
   5854             );
   5855 
   5856     return stringlib_split(
   5857         (PyObject*) self,  self->str, self->length,
   5858         substring->str, substring->length,
   5859         maxcount
   5860         );
   5861 }
   5862 
   5863 static
   5864 PyObject *rsplit(PyUnicodeObject *self,
   5865                  PyUnicodeObject *substring,
   5866                  Py_ssize_t maxcount)
   5867 {
   5868     if (maxcount < 0)
   5869         maxcount = PY_SSIZE_T_MAX;
   5870 
   5871     if (substring == NULL)
   5872         return stringlib_rsplit_whitespace(
   5873             (PyObject*) self,  self->str, self->length, maxcount
   5874             );
   5875 
   5876     return stringlib_rsplit(
   5877         (PyObject*) self,  self->str, self->length,
   5878         substring->str, substring->length,
   5879         maxcount
   5880         );
   5881 }
   5882 
   5883 static
   5884 PyObject *replace(PyUnicodeObject *self,
   5885                   PyUnicodeObject *str1,
   5886                   PyUnicodeObject *str2,
   5887                   Py_ssize_t maxcount)
   5888 {
   5889     PyUnicodeObject *u;
   5890 
   5891     if (maxcount < 0)
   5892         maxcount = PY_SSIZE_T_MAX;
   5893     else if (maxcount == 0 || self->length == 0)
   5894         goto nothing;
   5895 
   5896     if (str1->length == str2->length) {
   5897         Py_ssize_t i;
   5898         /* same length */
   5899         if (str1->length == 0)
   5900             goto nothing;
   5901         if (str1->length == 1) {
   5902             /* replace characters */
   5903             Py_UNICODE u1, u2;
   5904             if (!findchar(self->str, self->length, str1->str[0]))
   5905                 goto nothing;
   5906             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
   5907             if (!u)
   5908                 return NULL;
   5909             Py_UNICODE_COPY(u->str, self->str, self->length);
   5910             u1 = str1->str[0];
   5911             u2 = str2->str[0];
   5912             for (i = 0; i < u->length; i++)
   5913                 if (u->str[i] == u1) {
   5914                     if (--maxcount < 0)
   5915                         break;
   5916                     u->str[i] = u2;
   5917                 }
   5918         } else {
   5919             i = stringlib_find(
   5920                 self->str, self->length, str1->str, str1->length, 0
   5921                 );
   5922             if (i < 0)
   5923                 goto nothing;
   5924             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
   5925             if (!u)
   5926                 return NULL;
   5927             Py_UNICODE_COPY(u->str, self->str, self->length);
   5928 
   5929             /* change everything in-place, starting with this one */
   5930             Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
   5931             i += str1->length;
   5932 
   5933             while ( --maxcount > 0) {
   5934                 i = stringlib_find(self->str+i, self->length-i,
   5935                                    str1->str, str1->length,
   5936                                    i);
   5937                 if (i == -1)
   5938                     break;
   5939                 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
   5940                 i += str1->length;
   5941             }
   5942         }
   5943     } else {
   5944 
   5945         Py_ssize_t n, i, j;
   5946         Py_ssize_t product, new_size, delta;
   5947         Py_UNICODE *p;
   5948 
   5949         /* replace strings */
   5950         n = stringlib_count(self->str, self->length, str1->str, str1->length,
   5951                             maxcount);
   5952         if (n == 0)
   5953             goto nothing;
   5954         /* new_size = self->length + n * (str2->length - str1->length)); */
   5955         delta = (str2->length - str1->length);
   5956         if (delta == 0) {
   5957             new_size = self->length;
   5958         } else {
   5959             product = n * (str2->length - str1->length);
   5960             if ((product / (str2->length - str1->length)) != n) {
   5961                 PyErr_SetString(PyExc_OverflowError,
   5962                                 "replace string is too long");
   5963                 return NULL;
   5964             }
   5965             new_size = self->length + product;
   5966             if (new_size < 0) {
   5967                 PyErr_SetString(PyExc_OverflowError,
   5968                                 "replace string is too long");
   5969                 return NULL;
   5970             }
   5971         }
   5972         u = _PyUnicode_New(new_size);
   5973         if (!u)
   5974             return NULL;
   5975         i = 0;
   5976         p = u->str;
   5977         if (str1->length > 0) {
   5978             while (n-- > 0) {
   5979                 /* look for next match */
   5980                 j = stringlib_find(self->str+i, self->length-i,
   5981                                    str1->str, str1->length,
   5982                                    i);
   5983                 if (j == -1)
   5984                     break;
   5985                 else if (j > i) {
   5986                     /* copy unchanged part [i:j] */
   5987                     Py_UNICODE_COPY(p, self->str+i, j-i);
   5988                     p += j - i;
   5989                 }
   5990                 /* copy substitution string */
   5991                 if (str2->length > 0) {
   5992                     Py_UNICODE_COPY(p, str2->str, str2->length);
   5993                     p += str2->length;
   5994                 }
   5995                 i = j + str1->length;
   5996             }
   5997             if (i < self->length)
   5998                 /* copy tail [i:] */
   5999                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
   6000         } else {
   6001             /* interleave */
   6002             while (n > 0) {
   6003                 Py_UNICODE_COPY(p, str2->str, str2->length);
   6004                 p += str2->length;
   6005                 if (--n <= 0)
   6006                     break;
   6007                 *p++ = self->str[i++];
   6008             }
   6009             Py_UNICODE_COPY(p, self->str+i, self->length-i);
   6010         }
   6011     }
   6012     return (PyObject *) u;
   6013 
   6014   nothing:
   6015     /* nothing to replace; return original string (when possible) */
   6016     if (PyUnicode_CheckExact(self)) {
   6017         Py_INCREF(self);
   6018         return (PyObject *) self;
   6019     }
   6020     return PyUnicode_FromUnicode(self->str, self->length);
   6021 }
   6022 
   6023 /* --- Unicode Object Methods --------------------------------------------- */
   6024 
   6025 PyDoc_STRVAR(title__doc__,
   6026              "S.title() -> unicode\n\
   6027 \n\
   6028 Return a titlecased version of S, i.e. words start with title case\n\
   6029 characters, all remaining cased characters have lower case.");
   6030 
   6031 static PyObject*
   6032 unicode_title(PyUnicodeObject *self)
   6033 {
   6034     return fixup(self, fixtitle);
   6035 }
   6036 
   6037 PyDoc_STRVAR(capitalize__doc__,
   6038              "S.capitalize() -> unicode\n\
   6039 \n\
   6040 Return a capitalized version of S, i.e. make the first character\n\
   6041 have upper case and the rest lower case.");
   6042 
   6043 static PyObject*
   6044 unicode_capitalize(PyUnicodeObject *self)
   6045 {
   6046     return fixup(self, fixcapitalize);
   6047 }
   6048 
   6049 #if 0
   6050 PyDoc_STRVAR(capwords__doc__,
   6051              "S.capwords() -> unicode\n\
   6052 \n\
   6053 Apply .capitalize() to all words in S and return the result with\n\
   6054 normalized whitespace (all whitespace strings are replaced by ' ').");
   6055 
   6056 static PyObject*
   6057 unicode_capwords(PyUnicodeObject *self)
   6058 {
   6059     PyObject *list;
   6060     PyObject *item;
   6061     Py_ssize_t i;
   6062 
   6063     /* Split into words */
   6064     list = split(self, NULL, -1);
   6065     if (!list)
   6066         return NULL;
   6067 
   6068     /* Capitalize each word */
   6069     for (i = 0; i < PyList_GET_SIZE(list); i++) {
   6070         item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
   6071                      fixcapitalize);
   6072         if (item == NULL)
   6073             goto onError;
   6074         Py_DECREF(PyList_GET_ITEM(list, i));
   6075         PyList_SET_ITEM(list, i, item);
   6076     }
   6077 
   6078     /* Join the words to form a new string */
   6079     item = PyUnicode_Join(NULL, list);
   6080 
   6081   onError:
   6082     Py_DECREF(list);
   6083     return (PyObject *)item;
   6084 }
   6085 #endif
   6086 
   6087 /* Argument converter.  Coerces to a single unicode character */
   6088 
   6089 static int
   6090 convert_uc(PyObject *obj, void *addr)
   6091 {
   6092     Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
   6093     PyObject *uniobj;
   6094     Py_UNICODE *unistr;
   6095 
   6096     uniobj = PyUnicode_FromObject(obj);
   6097     if (uniobj == NULL) {
   6098         PyErr_SetString(PyExc_TypeError,
   6099                         "The fill character cannot be converted to Unicode");
   6100         return 0;
   6101     }
   6102     if (PyUnicode_GET_SIZE(uniobj) != 1) {
   6103         PyErr_SetString(PyExc_TypeError,
   6104                         "The fill character must be exactly one character long");
   6105         Py_DECREF(uniobj);
   6106         return 0;
   6107     }
   6108     unistr = PyUnicode_AS_UNICODE(uniobj);
   6109     *fillcharloc = unistr[0];
   6110     Py_DECREF(uniobj);
   6111     return 1;
   6112 }
   6113 
   6114 PyDoc_STRVAR(center__doc__,
   6115              "S.center(width[, fillchar]) -> unicode\n\
   6116 \n\
   6117 Return S centered in a Unicode string of length width. Padding is\n\
   6118 done using the specified fill character (default is a space)");
   6119 
   6120 static PyObject *
   6121 unicode_center(PyUnicodeObject *self, PyObject *args)
   6122 {
   6123     Py_ssize_t marg, left;
   6124     Py_ssize_t width;
   6125     Py_UNICODE fillchar = ' ';
   6126 
   6127     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
   6128         return NULL;
   6129 
   6130     if (self->length >= width && PyUnicode_CheckExact(self)) {
   6131         Py_INCREF(self);
   6132         return (PyObject*) self;
   6133     }
   6134 
   6135     marg = width - self->length;
   6136     left = marg / 2 + (marg & width & 1);
   6137 
   6138     return (PyObject*) pad(self, left, marg - left, fillchar);
   6139 }
   6140 
   6141 #if 0
   6142 
   6143 /* This code should go into some future Unicode collation support
   6144    module. The basic comparison should compare ordinals on a naive
   6145    basis (this is what Java does and thus Jython too). */
   6146 
   6147 /* speedy UTF-16 code point order comparison */
   6148 /* gleaned from: */
   6149 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
   6150 
   6151 static short utf16Fixup[32] =
   6152 {
   6153     0, 0, 0, 0, 0, 0, 0, 0,
   6154     0, 0, 0, 0, 0, 0, 0, 0,
   6155     0, 0, 0, 0, 0, 0, 0, 0,
   6156     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
   6157 };
   6158 
   6159 static int
   6160 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
   6161 {
   6162     Py_ssize_t len1, len2;
   6163 
   6164     Py_UNICODE *s1 = str1->str;
   6165     Py_UNICODE *s2 = str2->str;
   6166 
   6167     len1 = str1->length;
   6168     len2 = str2->length;
   6169 
   6170     while (len1 > 0 && len2 > 0) {
   6171         Py_UNICODE c1, c2;
   6172 
   6173         c1 = *s1++;
   6174         c2 = *s2++;
   6175 
   6176         if (c1 > (1<<11) * 26)
   6177             c1 += utf16Fixup[c1>>11];
   6178         if (c2 > (1<<11) * 26)
   6179             c2 += utf16Fixup[c2>>11];
   6180         /* now c1 and c2 are in UTF-32-compatible order */
   6181 
   6182         if (c1 != c2)
   6183             return (c1 < c2) ? -1 : 1;
   6184 
   6185         len1--; len2--;
   6186     }
   6187 
   6188     return (len1 < len2) ? -1 : (len1 != len2);
   6189 }
   6190 
   6191 #else
   6192 
   6193 static int
   6194 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
   6195 {
   6196     register Py_ssize_t len1, len2;
   6197 
   6198     Py_UNICODE *s1 = str1->str;
   6199     Py_UNICODE *s2 = str2->str;
   6200 
   6201     len1 = str1->length;
   6202     len2 = str2->length;
   6203 
   6204     while (len1 > 0 && len2 > 0) {
   6205         Py_UNICODE c1, c2;
   6206 
   6207         c1 = *s1++;
   6208         c2 = *s2++;
   6209 
   6210         if (c1 != c2)
   6211             return (c1 < c2) ? -1 : 1;
   6212 
   6213         len1--; len2--;
   6214     }
   6215 
   6216     return (len1 < len2) ? -1 : (len1 != len2);
   6217 }
   6218 
   6219 #endif
   6220 
   6221 int PyUnicode_Compare(PyObject *left,
   6222                       PyObject *right)
   6223 {
   6224     PyUnicodeObject *u = NULL, *v = NULL;
   6225     int result;
   6226 
   6227     /* Coerce the two arguments */
   6228     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
   6229     if (u == NULL)
   6230         goto onError;
   6231     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
   6232     if (v == NULL)
   6233         goto onError;
   6234 
   6235     /* Shortcut for empty or interned objects */
   6236     if (v == u) {
   6237         Py_DECREF(u);
   6238         Py_DECREF(v);
   6239         return 0;
   6240     }
   6241 
   6242     result = unicode_compare(u, v);
   6243 
   6244     Py_DECREF(u);
   6245     Py_DECREF(v);
   6246     return result;
   6247 
   6248   onError:
   6249     Py_XDECREF(u);
   6250     Py_XDECREF(v);
   6251     return -1;
   6252 }
   6253 
   6254 PyObject *PyUnicode_RichCompare(PyObject *left,
   6255                                 PyObject *right,
   6256                                 int op)
   6257 {
   6258     int result;
   6259 
   6260     result = PyUnicode_Compare(left, right);
   6261     if (result == -1 && PyErr_Occurred())
   6262         goto onError;
   6263 
   6264     /* Convert the return value to a Boolean */
   6265     switch (op) {
   6266     case Py_EQ:
   6267         result = (result == 0);
   6268         break;
   6269     case Py_NE:
   6270         result = (result != 0);
   6271         break;
   6272     case Py_LE:
   6273         result = (result <= 0);
   6274         break;
   6275     case Py_GE:
   6276         result = (result >= 0);
   6277         break;
   6278     case Py_LT:
   6279         result = (result == -1);
   6280         break;
   6281     case Py_GT:
   6282         result = (result == 1);
   6283         break;
   6284     }
   6285     return PyBool_FromLong(result);
   6286 
   6287   onError:
   6288 
   6289     /* Standard case
   6290 
   6291        Type errors mean that PyUnicode_FromObject() could not convert
   6292        one of the arguments (usually the right hand side) to Unicode,
   6293        ie. we can't handle the comparison request. However, it is
   6294        possible that the other object knows a comparison method, which
   6295        is why we return Py_NotImplemented to give the other object a
   6296        chance.
   6297 
   6298     */
   6299     if (PyErr_ExceptionMatches(PyExc_TypeError)) {
   6300         PyErr_Clear();
   6301         Py_INCREF(Py_NotImplemented);
   6302         return Py_NotImplemented;
   6303     }
   6304     if (op != Py_EQ && op != Py_NE)
   6305         return NULL;
   6306 
   6307     /* Equality comparison.
   6308 
   6309        This is a special case: we silence any PyExc_UnicodeDecodeError
   6310        and instead turn it into a PyErr_UnicodeWarning.
   6311 
   6312     */
   6313     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
   6314         return NULL;
   6315     PyErr_Clear();
   6316     if (PyErr_Warn(PyExc_UnicodeWarning,
   6317                    (op == Py_EQ) ?
   6318                    "Unicode equal comparison "
   6319                    "failed to convert both arguments to Unicode - "
   6320                    "interpreting them as being unequal" :
   6321                    "Unicode unequal comparison "
   6322                    "failed to convert both arguments to Unicode - "
   6323                    "interpreting them as being unequal"
   6324             ) < 0)
   6325         return NULL;
   6326     result = (op == Py_NE);
   6327     return PyBool_FromLong(result);
   6328 }
   6329 
   6330 int PyUnicode_Contains(PyObject *container,
   6331                        PyObject *element)
   6332 {
   6333     PyObject *str, *sub;
   6334     int result;
   6335 
   6336     /* Coerce the two arguments */
   6337     sub = PyUnicode_FromObject(element);
   6338     if (!sub) {
   6339         return -1;
   6340     }
   6341 
   6342     str = PyUnicode_FromObject(container);
   6343     if (!str) {
   6344         Py_DECREF(sub);
   6345         return -1;
   6346     }
   6347 
   6348     result = stringlib_contains_obj(str, sub);
   6349 
   6350     Py_DECREF(str);
   6351     Py_DECREF(sub);
   6352 
   6353     return result;
   6354 }
   6355 
   6356 /* Concat to string or Unicode object giving a new Unicode object. */
   6357 
   6358 PyObject *PyUnicode_Concat(PyObject *left,
   6359                            PyObject *right)
   6360 {
   6361     PyUnicodeObject *u = NULL, *v = NULL, *w;
   6362 
   6363     /* Coerce the two arguments */
   6364     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
   6365     if (u == NULL)
   6366         goto onError;
   6367     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
   6368     if (v == NULL)
   6369         goto onError;
   6370 
   6371     /* Shortcuts */
   6372     if (v == unicode_empty) {
   6373         Py_DECREF(v);
   6374         return (PyObject *)u;
   6375     }
   6376     if (u == unicode_empty) {
   6377         Py_DECREF(u);
   6378         return (PyObject *)v;
   6379     }
   6380 
   6381     if (u->length > PY_SSIZE_T_MAX - v->length) {
   6382         PyErr_SetString(PyExc_OverflowError,
   6383                         "strings are too large to concat");
   6384         goto onError;
   6385     }
   6386 
   6387     /* Concat the two Unicode strings */
   6388     w = _PyUnicode_New(u->length + v->length);
   6389     if (w == NULL)
   6390         goto onError;
   6391     Py_UNICODE_COPY(w->str, u->str, u->length);
   6392     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
   6393 
   6394     Py_DECREF(u);
   6395     Py_DECREF(v);
   6396     return (PyObject *)w;
   6397 
   6398   onError:
   6399     Py_XDECREF(u);
   6400     Py_XDECREF(v);
   6401     return NULL;
   6402 }
   6403 
   6404 PyDoc_STRVAR(count__doc__,
   6405              "S.count(sub[, start[, end]]) -> int\n\
   6406 \n\
   6407 Return the number of non-overlapping occurrences of substring sub in\n\
   6408 Unicode string S[start:end].  Optional arguments start and end are\n\
   6409 interpreted as in slice notation.");
   6410 
   6411 static PyObject *
   6412 unicode_count(PyUnicodeObject *self, PyObject *args)
   6413 {
   6414     PyUnicodeObject *substring;
   6415     Py_ssize_t start = 0;
   6416     Py_ssize_t end = PY_SSIZE_T_MAX;
   6417     PyObject *result;
   6418 
   6419     if (!stringlib_parse_args_finds_unicode("count", args, &substring,
   6420                                             &start, &end))
   6421         return NULL;
   6422 
   6423     ADJUST_INDICES(start, end, self->length);
   6424     result = PyInt_FromSsize_t(
   6425         stringlib_count(self->str + start, end - start,
   6426                         substring->str, substring->length,
   6427                         PY_SSIZE_T_MAX)
   6428         );
   6429 
   6430     Py_DECREF(substring);
   6431 
   6432     return result;
   6433 }
   6434 
   6435 PyDoc_STRVAR(encode__doc__,
   6436              "S.encode([encoding[,errors]]) -> string or unicode\n\
   6437 \n\
   6438 Encodes S using the codec registered for encoding. encoding defaults\n\
   6439 to the default encoding. errors may be given to set a different error\n\
   6440 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
   6441 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
   6442 'xmlcharrefreplace' as well as any other name registered with\n\
   6443 codecs.register_error that can handle UnicodeEncodeErrors.");
   6444 
   6445 static PyObject *
   6446 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
   6447 {
   6448     static char *kwlist[] = {"encoding", "errors", 0};
   6449     char *encoding = NULL;
   6450     char *errors = NULL;
   6451     PyObject *v;
   6452 
   6453     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
   6454                                      kwlist, &encoding, &errors))
   6455         return NULL;
   6456     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
   6457     if (v == NULL)
   6458         goto onError;
   6459     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
   6460         PyErr_Format(PyExc_TypeError,
   6461                      "encoder did not return a string/unicode object "
   6462                      "(type=%.400s)",
   6463                      Py_TYPE(v)->tp_name);
   6464         Py_DECREF(v);
   6465         return NULL;
   6466     }
   6467     return v;
   6468 
   6469   onError:
   6470     return NULL;
   6471 }
   6472 
   6473 PyDoc_STRVAR(decode__doc__,
   6474              "S.decode([encoding[,errors]]) -> string or unicode\n\
   6475 \n\
   6476 Decodes S using the codec registered for encoding. encoding defaults\n\
   6477 to the default encoding. errors may be given to set a different error\n\
   6478 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
   6479 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
   6480 as well as any other name registered with codecs.register_error that is\n\
   6481 able to handle UnicodeDecodeErrors.");
   6482 
   6483 static PyObject *
   6484 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
   6485 {
   6486     static char *kwlist[] = {"encoding", "errors", 0};
   6487     char *encoding = NULL;
   6488     char *errors = NULL;
   6489     PyObject *v;
   6490 
   6491     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
   6492                                      kwlist, &encoding, &errors))
   6493         return NULL;
   6494     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
   6495     if (v == NULL)
   6496         goto onError;
   6497     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
   6498         PyErr_Format(PyExc_TypeError,
   6499                      "decoder did not return a string/unicode object "
   6500                      "(type=%.400s)",
   6501                      Py_TYPE(v)->tp_name);
   6502         Py_DECREF(v);
   6503         return NULL;
   6504     }
   6505     return v;
   6506 
   6507   onError:
   6508     return NULL;
   6509 }
   6510 
   6511 PyDoc_STRVAR(expandtabs__doc__,
   6512              "S.expandtabs([tabsize]) -> unicode\n\
   6513 \n\
   6514 Return a copy of S where all tab characters are expanded using spaces.\n\
   6515 If tabsize is not given, a tab size of 8 characters is assumed.");
   6516 
   6517 static PyObject*
   6518 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
   6519 {
   6520     Py_UNICODE *e;
   6521     Py_UNICODE *p;
   6522     Py_UNICODE *q;
   6523     Py_UNICODE *qe;
   6524     Py_ssize_t i, j, incr;
   6525     PyUnicodeObject *u;
   6526     int tabsize = 8;
   6527 
   6528     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
   6529         return NULL;
   6530 
   6531     /* First pass: determine size of output string */
   6532     i = 0; /* chars up to and including most recent \n or \r */
   6533     j = 0; /* chars since most recent \n or \r (use in tab calculations) */
   6534     e = self->str + self->length; /* end of input */
   6535     for (p = self->str; p < e; p++)
   6536         if (*p == '\t') {
   6537             if (tabsize > 0) {
   6538                 incr = tabsize - (j % tabsize); /* cannot overflow */
   6539                 if (j > PY_SSIZE_T_MAX - incr)
   6540                     goto overflow1;
   6541                 j += incr;
   6542             }
   6543         }
   6544         else {
   6545             if (j > PY_SSIZE_T_MAX - 1)
   6546                 goto overflow1;
   6547             j++;
   6548             if (*p == '\n' || *p == '\r') {
   6549                 if (i > PY_SSIZE_T_MAX - j)
   6550                     goto overflow1;
   6551                 i += j;
   6552                 j = 0;
   6553             }
   6554         }
   6555 
   6556     if (i > PY_SSIZE_T_MAX - j)
   6557         goto overflow1;
   6558 
   6559     /* Second pass: create output string and fill it */
   6560     u = _PyUnicode_New(i + j);
   6561     if (!u)
   6562         return NULL;
   6563 
   6564     j = 0; /* same as in first pass */
   6565     q = u->str; /* next output char */
   6566     qe = u->str + u->length; /* end of output */
   6567 
   6568     for (p = self->str; p < e; p++)
   6569         if (*p == '\t') {
   6570             if (tabsize > 0) {
   6571                 i = tabsize - (j % tabsize);
   6572                 j += i;
   6573                 while (i--) {
   6574                     if (q >= qe)
   6575                         goto overflow2;
   6576                     *q++ = ' ';
   6577                 }
   6578             }
   6579         }
   6580         else {
   6581             if (q >= qe)
   6582                 goto overflow2;
   6583             *q++ = *p;
   6584             j++;
   6585             if (*p == '\n' || *p == '\r')
   6586                 j = 0;
   6587         }
   6588 
   6589     return (PyObject*) u;
   6590 
   6591   overflow2:
   6592     Py_DECREF(u);
   6593   overflow1:
   6594     PyErr_SetString(PyExc_OverflowError, "new string is too long");
   6595     return NULL;
   6596 }
   6597 
   6598 PyDoc_STRVAR(find__doc__,
   6599              "S.find(sub [,start [,end]]) -> int\n\
   6600 \n\
   6601 Return the lowest index in S where substring sub is found,\n\
   6602 such that sub is contained within S[start:end].  Optional\n\
   6603 arguments start and end are interpreted as in slice notation.\n\
   6604 \n\
   6605 Return -1 on failure.");
   6606 
   6607 static PyObject *
   6608 unicode_find(PyUnicodeObject *self, PyObject *args)
   6609 {
   6610     PyUnicodeObject *substring;
   6611     Py_ssize_t start;
   6612     Py_ssize_t end;
   6613     Py_ssize_t result;
   6614 
   6615     if (!stringlib_parse_args_finds_unicode("find", args, &substring,
   6616                                             &start, &end))
   6617         return NULL;
   6618 
   6619     result = stringlib_find_slice(
   6620         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
   6621         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
   6622         start, end
   6623         );
   6624 
   6625     Py_DECREF(substring);
   6626 
   6627     return PyInt_FromSsize_t(result);
   6628 }
   6629 
   6630 static PyObject *
   6631 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
   6632 {
   6633     if (index < 0 || index >= self->length) {
   6634         PyErr_SetString(PyExc_IndexError, "string index out of range");
   6635         return NULL;
   6636     }
   6637 
   6638     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
   6639 }
   6640 
   6641 static long
   6642 unicode_hash(PyUnicodeObject *self)
   6643 {
   6644     /* Since Unicode objects compare equal to their ASCII string
   6645        counterparts, they should use the individual character values
   6646        as basis for their hash value.  This is needed to assure that
   6647        strings and Unicode objects behave in the same way as
   6648        dictionary keys. */
   6649 
   6650     register Py_ssize_t len;
   6651     register Py_UNICODE *p;
   6652     register long x;
   6653 
   6654 #ifdef Py_DEBUG
   6655     assert(_Py_HashSecret_Initialized);
   6656 #endif
   6657     if (self->hash != -1)
   6658         return self->hash;
   6659     len = PyUnicode_GET_SIZE(self);
   6660     /*
   6661       We make the hash of the empty string be 0, rather than using
   6662       (prefix ^ suffix), since this slightly obfuscates the hash secret
   6663     */
   6664     if (len == 0) {
   6665         self->hash = 0;
   6666         return 0;
   6667     }
   6668     p = PyUnicode_AS_UNICODE(self);
   6669     x = _Py_HashSecret.prefix;
   6670     x ^= *p << 7;
   6671     while (--len >= 0)
   6672         x = (1000003*x) ^ *p++;
   6673     x ^= PyUnicode_GET_SIZE(self);
   6674     x ^= _Py_HashSecret.suffix;
   6675     if (x == -1)
   6676         x = -2;
   6677     self->hash = x;
   6678     return x;
   6679 }
   6680 
   6681 PyDoc_STRVAR(index__doc__,
   6682              "S.index(sub [,start [,end]]) -> int\n\
   6683 \n\
   6684 Like S.find() but raise ValueError when the substring is not found.");
   6685 
   6686 static PyObject *
   6687 unicode_index(PyUnicodeObject *self, PyObject *args)
   6688 {
   6689     Py_ssize_t result;
   6690     PyUnicodeObject *substring;
   6691     Py_ssize_t start;
   6692     Py_ssize_t end;
   6693 
   6694     if (!stringlib_parse_args_finds_unicode("index", args, &substring,
   6695                                             &start, &end))
   6696         return NULL;
   6697 
   6698     result = stringlib_find_slice(
   6699         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
   6700         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
   6701         start, end
   6702         );
   6703 
   6704     Py_DECREF(substring);
   6705 
   6706     if (result < 0) {
   6707         PyErr_SetString(PyExc_ValueError, "substring not found");
   6708         return NULL;
   6709     }
   6710 
   6711     return PyInt_FromSsize_t(result);
   6712 }
   6713 
   6714 PyDoc_STRVAR(islower__doc__,
   6715              "S.islower() -> bool\n\
   6716 \n\
   6717 Return True if all cased characters in S are lowercase and there is\n\
   6718 at least one cased character in S, False otherwise.");
   6719 
   6720 static PyObject*
   6721 unicode_islower(PyUnicodeObject *self)
   6722 {
   6723     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6724     register const Py_UNICODE *e;
   6725     int cased;
   6726 
   6727     /* Shortcut for single character strings */
   6728     if (PyUnicode_GET_SIZE(self) == 1)
   6729         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
   6730 
   6731     /* Special case for empty strings */
   6732     if (PyUnicode_GET_SIZE(self) == 0)
   6733         return PyBool_FromLong(0);
   6734 
   6735     e = p + PyUnicode_GET_SIZE(self);
   6736     cased = 0;
   6737     for (; p < e; p++) {
   6738         register const Py_UNICODE ch = *p;
   6739 
   6740         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
   6741             return PyBool_FromLong(0);
   6742         else if (!cased && Py_UNICODE_ISLOWER(ch))
   6743             cased = 1;
   6744     }
   6745     return PyBool_FromLong(cased);
   6746 }
   6747 
   6748 PyDoc_STRVAR(isupper__doc__,
   6749              "S.isupper() -> bool\n\
   6750 \n\
   6751 Return True if all cased characters in S are uppercase and there is\n\
   6752 at least one cased character in S, False otherwise.");
   6753 
   6754 static PyObject*
   6755 unicode_isupper(PyUnicodeObject *self)
   6756 {
   6757     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6758     register const Py_UNICODE *e;
   6759     int cased;
   6760 
   6761     /* Shortcut for single character strings */
   6762     if (PyUnicode_GET_SIZE(self) == 1)
   6763         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
   6764 
   6765     /* Special case for empty strings */
   6766     if (PyUnicode_GET_SIZE(self) == 0)
   6767         return PyBool_FromLong(0);
   6768 
   6769     e = p + PyUnicode_GET_SIZE(self);
   6770     cased = 0;
   6771     for (; p < e; p++) {
   6772         register const Py_UNICODE ch = *p;
   6773 
   6774         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
   6775             return PyBool_FromLong(0);
   6776         else if (!cased && Py_UNICODE_ISUPPER(ch))
   6777             cased = 1;
   6778     }
   6779     return PyBool_FromLong(cased);
   6780 }
   6781 
   6782 PyDoc_STRVAR(istitle__doc__,
   6783              "S.istitle() -> bool\n\
   6784 \n\
   6785 Return True if S is a titlecased string and there is at least one\n\
   6786 character in S, i.e. upper- and titlecase characters may only\n\
   6787 follow uncased characters and lowercase characters only cased ones.\n\
   6788 Return False otherwise.");
   6789 
   6790 static PyObject*
   6791 unicode_istitle(PyUnicodeObject *self)
   6792 {
   6793     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6794     register const Py_UNICODE *e;
   6795     int cased, previous_is_cased;
   6796 
   6797     /* Shortcut for single character strings */
   6798     if (PyUnicode_GET_SIZE(self) == 1)
   6799         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
   6800                                (Py_UNICODE_ISUPPER(*p) != 0));
   6801 
   6802     /* Special case for empty strings */
   6803     if (PyUnicode_GET_SIZE(self) == 0)
   6804         return PyBool_FromLong(0);
   6805 
   6806     e = p + PyUnicode_GET_SIZE(self);
   6807     cased = 0;
   6808     previous_is_cased = 0;
   6809     for (; p < e; p++) {
   6810         register const Py_UNICODE ch = *p;
   6811 
   6812         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
   6813             if (previous_is_cased)
   6814                 return PyBool_FromLong(0);
   6815             previous_is_cased = 1;
   6816             cased = 1;
   6817         }
   6818         else if (Py_UNICODE_ISLOWER(ch)) {
   6819             if (!previous_is_cased)
   6820                 return PyBool_FromLong(0);
   6821             previous_is_cased = 1;
   6822             cased = 1;
   6823         }
   6824         else
   6825             previous_is_cased = 0;
   6826     }
   6827     return PyBool_FromLong(cased);
   6828 }
   6829 
   6830 PyDoc_STRVAR(isspace__doc__,
   6831              "S.isspace() -> bool\n\
   6832 \n\
   6833 Return True if all characters in S are whitespace\n\
   6834 and there is at least one character in S, False otherwise.");
   6835 
   6836 static PyObject*
   6837 unicode_isspace(PyUnicodeObject *self)
   6838 {
   6839     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6840     register const Py_UNICODE *e;
   6841 
   6842     /* Shortcut for single character strings */
   6843     if (PyUnicode_GET_SIZE(self) == 1 &&
   6844         Py_UNICODE_ISSPACE(*p))
   6845         return PyBool_FromLong(1);
   6846 
   6847     /* Special case for empty strings */
   6848     if (PyUnicode_GET_SIZE(self) == 0)
   6849         return PyBool_FromLong(0);
   6850 
   6851     e = p + PyUnicode_GET_SIZE(self);
   6852     for (; p < e; p++) {
   6853         if (!Py_UNICODE_ISSPACE(*p))
   6854             return PyBool_FromLong(0);
   6855     }
   6856     return PyBool_FromLong(1);
   6857 }
   6858 
   6859 PyDoc_STRVAR(isalpha__doc__,
   6860              "S.isalpha() -> bool\n\
   6861 \n\
   6862 Return True if all characters in S are alphabetic\n\
   6863 and there is at least one character in S, False otherwise.");
   6864 
   6865 static PyObject*
   6866 unicode_isalpha(PyUnicodeObject *self)
   6867 {
   6868     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6869     register const Py_UNICODE *e;
   6870 
   6871     /* Shortcut for single character strings */
   6872     if (PyUnicode_GET_SIZE(self) == 1 &&
   6873         Py_UNICODE_ISALPHA(*p))
   6874         return PyBool_FromLong(1);
   6875 
   6876     /* Special case for empty strings */
   6877     if (PyUnicode_GET_SIZE(self) == 0)
   6878         return PyBool_FromLong(0);
   6879 
   6880     e = p + PyUnicode_GET_SIZE(self);
   6881     for (; p < e; p++) {
   6882         if (!Py_UNICODE_ISALPHA(*p))
   6883             return PyBool_FromLong(0);
   6884     }
   6885     return PyBool_FromLong(1);
   6886 }
   6887 
   6888 PyDoc_STRVAR(isalnum__doc__,
   6889              "S.isalnum() -> bool\n\
   6890 \n\
   6891 Return True if all characters in S are alphanumeric\n\
   6892 and there is at least one character in S, False otherwise.");
   6893 
   6894 static PyObject*
   6895 unicode_isalnum(PyUnicodeObject *self)
   6896 {
   6897     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6898     register const Py_UNICODE *e;
   6899 
   6900     /* Shortcut for single character strings */
   6901     if (PyUnicode_GET_SIZE(self) == 1 &&
   6902         Py_UNICODE_ISALNUM(*p))
   6903         return PyBool_FromLong(1);
   6904 
   6905     /* Special case for empty strings */
   6906     if (PyUnicode_GET_SIZE(self) == 0)
   6907         return PyBool_FromLong(0);
   6908 
   6909     e = p + PyUnicode_GET_SIZE(self);
   6910     for (; p < e; p++) {
   6911         if (!Py_UNICODE_ISALNUM(*p))
   6912             return PyBool_FromLong(0);
   6913     }
   6914     return PyBool_FromLong(1);
   6915 }
   6916 
   6917 PyDoc_STRVAR(isdecimal__doc__,
   6918              "S.isdecimal() -> bool\n\
   6919 \n\
   6920 Return True if there are only decimal characters in S,\n\
   6921 False otherwise.");
   6922 
   6923 static PyObject*
   6924 unicode_isdecimal(PyUnicodeObject *self)
   6925 {
   6926     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6927     register const Py_UNICODE *e;
   6928 
   6929     /* Shortcut for single character strings */
   6930     if (PyUnicode_GET_SIZE(self) == 1 &&
   6931         Py_UNICODE_ISDECIMAL(*p))
   6932         return PyBool_FromLong(1);
   6933 
   6934     /* Special case for empty strings */
   6935     if (PyUnicode_GET_SIZE(self) == 0)
   6936         return PyBool_FromLong(0);
   6937 
   6938     e = p + PyUnicode_GET_SIZE(self);
   6939     for (; p < e; p++) {
   6940         if (!Py_UNICODE_ISDECIMAL(*p))
   6941             return PyBool_FromLong(0);
   6942     }
   6943     return PyBool_FromLong(1);
   6944 }
   6945 
   6946 PyDoc_STRVAR(isdigit__doc__,
   6947              "S.isdigit() -> bool\n\
   6948 \n\
   6949 Return True if all characters in S are digits\n\
   6950 and there is at least one character in S, False otherwise.");
   6951 
   6952 static PyObject*
   6953 unicode_isdigit(PyUnicodeObject *self)
   6954 {
   6955     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6956     register const Py_UNICODE *e;
   6957 
   6958     /* Shortcut for single character strings */
   6959     if (PyUnicode_GET_SIZE(self) == 1 &&
   6960         Py_UNICODE_ISDIGIT(*p))
   6961         return PyBool_FromLong(1);
   6962 
   6963     /* Special case for empty strings */
   6964     if (PyUnicode_GET_SIZE(self) == 0)
   6965         return PyBool_FromLong(0);
   6966 
   6967     e = p + PyUnicode_GET_SIZE(self);
   6968     for (; p < e; p++) {
   6969         if (!Py_UNICODE_ISDIGIT(*p))
   6970             return PyBool_FromLong(0);
   6971     }
   6972     return PyBool_FromLong(1);
   6973 }
   6974 
   6975 PyDoc_STRVAR(isnumeric__doc__,
   6976              "S.isnumeric() -> bool\n\
   6977 \n\
   6978 Return True if there are only numeric characters in S,\n\
   6979 False otherwise.");
   6980 
   6981 static PyObject*
   6982 unicode_isnumeric(PyUnicodeObject *self)
   6983 {
   6984     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6985     register const Py_UNICODE *e;
   6986 
   6987     /* Shortcut for single character strings */
   6988     if (PyUnicode_GET_SIZE(self) == 1 &&
   6989         Py_UNICODE_ISNUMERIC(*p))
   6990         return PyBool_FromLong(1);
   6991 
   6992     /* Special case for empty strings */
   6993     if (PyUnicode_GET_SIZE(self) == 0)
   6994         return PyBool_FromLong(0);
   6995 
   6996     e = p + PyUnicode_GET_SIZE(self);
   6997     for (; p < e; p++) {
   6998         if (!Py_UNICODE_ISNUMERIC(*p))
   6999             return PyBool_FromLong(0);
   7000     }
   7001     return PyBool_FromLong(1);
   7002 }
   7003 
PyDoc_STRVAR(join__doc__,
             "S.join(iterable) -> unicode\n\
\n\
Return a string which is the concatenation of the strings in the\n\
iterable.  The separator between elements is S.");

/* unicode.join(): thin method wrapper that forwards both arguments
   unchanged to PyUnicode_Join() and returns its result. */
static PyObject*
unicode_join(PyObject *self, PyObject *data)
{
    return PyUnicode_Join(self, data);
}
   7015 
/* Return the value of the object's length field (self->length). */
static Py_ssize_t
unicode_length(PyUnicodeObject *self)
{
    return self->length;
}
   7021 
   7022 PyDoc_STRVAR(ljust__doc__,
   7023              "S.ljust(width[, fillchar]) -> int\n\
   7024 \n\
   7025 Return S left-justified in a Unicode string of length width. Padding is\n\
   7026 done using the specified fill character (default is a space).");
   7027 
   7028 static PyObject *
   7029 unicode_ljust(PyUnicodeObject *self, PyObject *args)
   7030 {
   7031     Py_ssize_t width;
   7032     Py_UNICODE fillchar = ' ';
   7033 
   7034     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
   7035         return NULL;
   7036 
   7037     if (self->length >= width && PyUnicode_CheckExact(self)) {
   7038         Py_INCREF(self);
   7039         return (PyObject*) self;
   7040     }
   7041 
   7042     return (PyObject*) pad(self, 0, width - self->length, fillchar);
   7043 }
   7044 
PyDoc_STRVAR(lower__doc__,
             "S.lower() -> unicode\n\
\n\
Return a copy of the string S converted to lowercase.");

/* unicode.lower(): delegates to fixup() with the fixlower callback and
   returns whatever fixup() returns. */
static PyObject*
unicode_lower(PyUnicodeObject *self)
{
    return fixup(self, fixlower);
}
   7055 
/* Strip-type selectors used by the strip helpers below: they choose which
   end(s) of the string get trimmed and double as indices into
   stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* One PyArg_ParseTuple() format string per strip type; the text after the
   ':' is the method name. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip type i: skips the 3-character "|O:" prefix of the
   corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
   7064 
   7065 /* externally visible for str.strip(unicode) */
   7066 PyObject *
   7067 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
   7068 {
   7069     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
   7070     Py_ssize_t len = PyUnicode_GET_SIZE(self);
   7071     Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
   7072     Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
   7073     Py_ssize_t i, j;
   7074 
   7075     BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
   7076 
   7077     i = 0;
   7078     if (striptype != RIGHTSTRIP) {
   7079         while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
   7080             i++;
   7081         }
   7082     }
   7083 
   7084     j = len;
   7085     if (striptype != LEFTSTRIP) {
   7086         do {
   7087             j--;
   7088         } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
   7089         j++;
   7090     }
   7091 
   7092     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
   7093         Py_INCREF(self);
   7094         return (PyObject*)self;
   7095     }
   7096     else
   7097         return PyUnicode_FromUnicode(s+i, j-i);
   7098 }
   7099 
   7100 
   7101 static PyObject *
   7102 do_strip(PyUnicodeObject *self, int striptype)
   7103 {
   7104     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
   7105     Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
   7106 
   7107     i = 0;
   7108     if (striptype != RIGHTSTRIP) {
   7109         while (i < len && Py_UNICODE_ISSPACE(s[i])) {
   7110             i++;
   7111         }
   7112     }
   7113 
   7114     j = len;
   7115     if (striptype != LEFTSTRIP) {
   7116         do {
   7117             j--;
   7118         } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
   7119         j++;
   7120     }
   7121 
   7122     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
   7123         Py_INCREF(self);
   7124         return (PyObject*)self;
   7125     }
   7126     else
   7127         return PyUnicode_FromUnicode(s+i, j-i);
   7128 }
   7129 
   7130 
   7131 static PyObject *
   7132 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
   7133 {
   7134     PyObject *sep = NULL;
   7135 
   7136     if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
   7137         return NULL;
   7138 
   7139     if (sep != NULL && sep != Py_None) {
   7140         if (PyUnicode_Check(sep))
   7141             return _PyUnicode_XStrip(self, striptype, sep);
   7142         else if (PyString_Check(sep)) {
   7143             PyObject *res;
   7144             sep = PyUnicode_FromObject(sep);
   7145             if (sep==NULL)
   7146                 return NULL;
   7147             res = _PyUnicode_XStrip(self, striptype, sep);
   7148             Py_DECREF(sep);
   7149             return res;
   7150         }
   7151         else {
   7152             PyErr_Format(PyExc_TypeError,
   7153                          "%s arg must be None, unicode or str",
   7154                          STRIPNAME(striptype));
   7155             return NULL;
   7156         }
   7157     }
   7158 
   7159     return do_strip(self, striptype);
   7160 }
   7161 
   7162 
   7163 PyDoc_STRVAR(strip__doc__,
   7164              "S.strip([chars]) -> unicode\n\
   7165 \n\
   7166 Return a copy of the string S with leading and trailing\n\
   7167 whitespace removed.\n\
   7168 If chars is given and not None, remove characters in chars instead.\n\
   7169 If chars is a str, it will be converted to unicode before stripping");
   7170 
   7171 static PyObject *
   7172 unicode_strip(PyUnicodeObject *self, PyObject *args)
   7173 {
   7174     if (PyTuple_GET_SIZE(args) == 0)
   7175         return do_strip(self, BOTHSTRIP); /* Common case */
   7176     else
   7177         return do_argstrip(self, BOTHSTRIP, args);
   7178 }
   7179 
   7180 
   7181 PyDoc_STRVAR(lstrip__doc__,
   7182              "S.lstrip([chars]) -> unicode\n\
   7183 \n\
   7184 Return a copy of the string S with leading whitespace removed.\n\
   7185 If chars is given and not None, remove characters in chars instead.\n\
   7186 If chars is a str, it will be converted to unicode before stripping");
   7187 
   7188 static PyObject *
   7189 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
   7190 {
   7191     if (PyTuple_GET_SIZE(args) == 0)
   7192         return do_strip(self, LEFTSTRIP); /* Common case */
   7193     else
   7194         return do_argstrip(self, LEFTSTRIP, args);
   7195 }
   7196 
   7197 
   7198 PyDoc_STRVAR(rstrip__doc__,
   7199              "S.rstrip([chars]) -> unicode\n\
   7200 \n\
   7201 Return a copy of the string S with trailing whitespace removed.\n\
   7202 If chars is given and not None, remove characters in chars instead.\n\
   7203 If chars is a str, it will be converted to unicode before stripping");
   7204 
   7205 static PyObject *
   7206 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
   7207 {
   7208     if (PyTuple_GET_SIZE(args) == 0)
   7209         return do_strip(self, RIGHTSTRIP); /* Common case */
   7210     else
   7211         return do_argstrip(self, RIGHTSTRIP, args);
   7212 }
   7213 
   7214 
   7215 static PyObject*
   7216 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
   7217 {
   7218     PyUnicodeObject *u;
   7219     Py_UNICODE *p;
   7220     Py_ssize_t nchars;
   7221     size_t nbytes;
   7222 
   7223     if (len < 0)
   7224         len = 0;
   7225 
   7226     if (len == 1 && PyUnicode_CheckExact(str)) {
   7227         /* no repeat, return original string */
   7228         Py_INCREF(str);
   7229         return (PyObject*) str;
   7230     }
   7231 
   7232     /* ensure # of chars needed doesn't overflow Py_ssize_t and # of bytes
   7233      * needed doesn't overflow size_t
   7234      */
   7235     if (len && str->length > PY_SSIZE_T_MAX / len) {
   7236         PyErr_SetString(PyExc_OverflowError,
   7237                         "repeated string is too long");
   7238         return NULL;
   7239     }
   7240     nchars = len * str->length;
   7241     nbytes = ((size_t)nchars + 1u) * sizeof(Py_UNICODE);
   7242     if (nbytes / sizeof(Py_UNICODE) != ((size_t)nchars + 1u)) {
   7243         PyErr_SetString(PyExc_OverflowError,
   7244                         "repeated string is too long");
   7245         return NULL;
   7246     }
   7247     u = _PyUnicode_New(nchars);
   7248     if (!u)
   7249         return NULL;
   7250 
   7251     p = u->str;
   7252 
   7253     if (str->length == 1 && len > 0) {
   7254         Py_UNICODE_FILL(p, str->str[0], len);
   7255     } else {
   7256         Py_ssize_t done = 0; /* number of characters copied this far */
   7257         if (done < nchars) {
   7258             Py_UNICODE_COPY(p, str->str, str->length);
   7259             done = str->length;
   7260         }
   7261         while (done < nchars) {
   7262             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
   7263             Py_UNICODE_COPY(p+done, p, n);
   7264             done += n;
   7265         }
   7266     }
   7267 
   7268     return (PyObject*) u;
   7269 }
   7270 
   7271 PyObject *PyUnicode_Replace(PyObject *obj,
   7272                             PyObject *subobj,
   7273                             PyObject *replobj,
   7274                             Py_ssize_t maxcount)
   7275 {
   7276     PyObject *self;
   7277     PyObject *str1;
   7278     PyObject *str2;
   7279     PyObject *result;
   7280 
   7281     self = PyUnicode_FromObject(obj);
   7282     if (self == NULL)
   7283         return NULL;
   7284     str1 = PyUnicode_FromObject(subobj);
   7285     if (str1 == NULL) {
   7286         Py_DECREF(self);
   7287         return NULL;
   7288     }
   7289     str2 = PyUnicode_FromObject(replobj);
   7290     if (str2 == NULL) {
   7291         Py_DECREF(self);
   7292         Py_DECREF(str1);
   7293         return NULL;
   7294     }
   7295     result = replace((PyUnicodeObject *)self,
   7296                      (PyUnicodeObject *)str1,
   7297                      (PyUnicodeObject *)str2,
   7298                      maxcount);
   7299     Py_DECREF(self);
   7300     Py_DECREF(str1);
   7301     Py_DECREF(str2);
   7302     return result;
   7303 }
   7304 
   7305 PyDoc_STRVAR(replace__doc__,
   7306              "S.replace(old, new[, count]) -> unicode\n\
   7307 \n\
   7308 Return a copy of S with all occurrences of substring\n\
   7309 old replaced by new.  If the optional argument count is\n\
   7310 given, only the first count occurrences are replaced.");
   7311 
   7312 static PyObject*
   7313 unicode_replace(PyUnicodeObject *self, PyObject *args)
   7314 {
   7315     PyUnicodeObject *str1;
   7316     PyUnicodeObject *str2;
   7317     Py_ssize_t maxcount = -1;
   7318     PyObject *result;
   7319 
   7320     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
   7321         return NULL;
   7322     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
   7323     if (str1 == NULL)
   7324         return NULL;
   7325     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
   7326     if (str2 == NULL) {
   7327         Py_DECREF(str1);
   7328         return NULL;
   7329     }
   7330 
   7331     result = replace(self, str1, str2, maxcount);
   7332 
   7333     Py_DECREF(str1);
   7334     Py_DECREF(str2);
   7335     return result;
   7336 }
   7337 
/* repr() implementation: passes the object's character buffer and size
   to unicodeescape_string() with 1 as the third argument and returns its
   result.
   NOTE(review): the third argument presumably asks for the quoted u'...'
   form -- confirm against unicodeescape_string(). */
static
PyObject *unicode_repr(PyObject *unicode)
{
    return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
                                PyUnicode_GET_SIZE(unicode),
                                1);
}
   7345 
   7346 PyDoc_STRVAR(rfind__doc__,
   7347              "S.rfind(sub [,start [,end]]) -> int\n\
   7348 \n\
   7349 Return the highest index in S where substring sub is found,\n\
   7350 such that sub is contained within S[start:end].  Optional\n\
   7351 arguments start and end are interpreted as in slice notation.\n\
   7352 \n\
   7353 Return -1 on failure.");
   7354 
   7355 static PyObject *
   7356 unicode_rfind(PyUnicodeObject *self, PyObject *args)
   7357 {
   7358     PyUnicodeObject *substring;
   7359     Py_ssize_t start;
   7360     Py_ssize_t end;
   7361     Py_ssize_t result;
   7362 
   7363     if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
   7364                                             &start, &end))
   7365         return NULL;
   7366 
   7367     result = stringlib_rfind_slice(
   7368         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
   7369         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
   7370         start, end
   7371         );
   7372 
   7373     Py_DECREF(substring);
   7374 
   7375     return PyInt_FromSsize_t(result);
   7376 }
   7377 
   7378 PyDoc_STRVAR(rindex__doc__,
   7379              "S.rindex(sub [,start [,end]]) -> int\n\
   7380 \n\
   7381 Like S.rfind() but raise ValueError when the substring is not found.");
   7382 
   7383 static PyObject *
   7384 unicode_rindex(PyUnicodeObject *self, PyObject *args)
   7385 {
   7386     PyUnicodeObject *substring;
   7387     Py_ssize_t start;
   7388     Py_ssize_t end;
   7389     Py_ssize_t result;
   7390 
   7391     if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
   7392                                             &start, &end))
   7393         return NULL;
   7394 
   7395     result = stringlib_rfind_slice(
   7396         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
   7397         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
   7398         start, end
   7399         );
   7400 
   7401     Py_DECREF(substring);
   7402 
   7403     if (result < 0) {
   7404         PyErr_SetString(PyExc_ValueError, "substring not found");
   7405         return NULL;
   7406     }
   7407     return PyInt_FromSsize_t(result);
   7408 }
   7409 
   7410 PyDoc_STRVAR(rjust__doc__,
   7411              "S.rjust(width[, fillchar]) -> unicode\n\
   7412 \n\
   7413 Return S right-justified in a Unicode string of length width. Padding is\n\
   7414 done using the specified fill character (default is a space).");
   7415 
   7416 static PyObject *
   7417 unicode_rjust(PyUnicodeObject *self, PyObject *args)
   7418 {
   7419     Py_ssize_t width;
   7420     Py_UNICODE fillchar = ' ';
   7421 
   7422     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
   7423         return NULL;
   7424 
   7425     if (self->length >= width && PyUnicode_CheckExact(self)) {
   7426         Py_INCREF(self);
   7427         return (PyObject*) self;
   7428     }
   7429 
   7430     return (PyObject*) pad(self, width - self->length, 0, fillchar);
   7431 }
   7432 
   7433 static PyObject*
   7434 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
   7435 {
   7436     /* standard clamping */
   7437     if (start < 0)
   7438         start = 0;
   7439     if (end < 0)
   7440         end = 0;
   7441     if (end > self->length)
   7442         end = self->length;
   7443     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
   7444         /* full slice, return original string */
   7445         Py_INCREF(self);
   7446         return (PyObject*) self;
   7447     }
   7448     if (start > end)
   7449         start = end;
   7450     /* copy slice */
   7451     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
   7452                                              end - start);
   7453 }
   7454 
   7455 PyObject *PyUnicode_Split(PyObject *s,
   7456                           PyObject *sep,
   7457                           Py_ssize_t maxsplit)
   7458 {
   7459     PyObject *result;
   7460 
   7461     s = PyUnicode_FromObject(s);
   7462     if (s == NULL)
   7463         return NULL;
   7464     if (sep != NULL) {
   7465         sep = PyUnicode_FromObject(sep);
   7466         if (sep == NULL) {
   7467             Py_DECREF(s);
   7468             return NULL;
   7469         }
   7470     }
   7471 
   7472     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
   7473 
   7474     Py_DECREF(s);
   7475     Py_XDECREF(sep);
   7476     return result;
   7477 }
   7478 
   7479 PyDoc_STRVAR(split__doc__,
   7480              "S.split([sep [,maxsplit]]) -> list of strings\n\
   7481 \n\
   7482 Return a list of the words in S, using sep as the\n\
   7483 delimiter string.  If maxsplit is given, at most maxsplit\n\
   7484 splits are done. If sep is not specified or is None, any\n\
   7485 whitespace string is a separator and empty strings are\n\
   7486 removed from the result.");
   7487 
   7488 static PyObject*
   7489 unicode_split(PyUnicodeObject *self, PyObject *args)
   7490 {
   7491     PyObject *substring = Py_None;
   7492     Py_ssize_t maxcount = -1;
   7493 
   7494     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
   7495         return NULL;
   7496 
   7497     if (substring == Py_None)
   7498         return split(self, NULL, maxcount);
   7499     else if (PyUnicode_Check(substring))
   7500         return split(self, (PyUnicodeObject *)substring, maxcount);
   7501     else
   7502         return PyUnicode_Split((PyObject *)self, substring, maxcount);
   7503 }
   7504 
   7505 PyObject *
   7506 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
   7507 {
   7508     PyObject* str_obj;
   7509     PyObject* sep_obj;
   7510     PyObject* out;
   7511 
   7512     str_obj = PyUnicode_FromObject(str_in);
   7513     if (!str_obj)
   7514         return NULL;
   7515     sep_obj = PyUnicode_FromObject(sep_in);
   7516     if (!sep_obj) {
   7517         Py_DECREF(str_obj);
   7518         return NULL;
   7519     }
   7520 
   7521     out = stringlib_partition(
   7522         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
   7523         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
   7524         );
   7525 
   7526     Py_DECREF(sep_obj);
   7527     Py_DECREF(str_obj);
   7528 
   7529     return out;
   7530 }
   7531 
   7532 
   7533 PyObject *
   7534 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
   7535 {
   7536     PyObject* str_obj;
   7537     PyObject* sep_obj;
   7538     PyObject* out;
   7539 
   7540     str_obj = PyUnicode_FromObject(str_in);
   7541     if (!str_obj)
   7542         return NULL;
   7543     sep_obj = PyUnicode_FromObject(sep_in);
   7544     if (!sep_obj) {
   7545         Py_DECREF(str_obj);
   7546         return NULL;
   7547     }
   7548 
   7549     out = stringlib_rpartition(
   7550         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
   7551         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
   7552         );
   7553 
   7554     Py_DECREF(sep_obj);
   7555     Py_DECREF(str_obj);
   7556 
   7557     return out;
   7558 }
   7559 
   7560 PyDoc_STRVAR(partition__doc__,
   7561              "S.partition(sep) -> (head, sep, tail)\n\
   7562 \n\
   7563 Search for the separator sep in S, and return the part before it,\n\
   7564 the separator itself, and the part after it.  If the separator is not\n\
   7565 found, return S and two empty strings.");
   7566 
   7567 static PyObject*
   7568 unicode_partition(PyUnicodeObject *self, PyObject *separator)
   7569 {
   7570     return PyUnicode_Partition((PyObject *)self, separator);
   7571 }
   7572 
   7573 PyDoc_STRVAR(rpartition__doc__,
   7574              "S.rpartition(sep) -> (head, sep, tail)\n\
   7575 \n\
   7576 Search for the separator sep in S, starting at the end of S, and return\n\
   7577 the part before it, the separator itself, and the part after it.  If the\n\
   7578 separator is not found, return two empty strings and S.");
   7579 
   7580 static PyObject*
   7581 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
   7582 {
   7583     return PyUnicode_RPartition((PyObject *)self, separator);
   7584 }
   7585 
   7586 PyObject *PyUnicode_RSplit(PyObject *s,
   7587                            PyObject *sep,
   7588                            Py_ssize_t maxsplit)
   7589 {
   7590     PyObject *result;
   7591 
   7592     s = PyUnicode_FromObject(s);
   7593     if (s == NULL)
   7594         return NULL;
   7595     if (sep != NULL) {
   7596         sep = PyUnicode_FromObject(sep);
   7597         if (sep == NULL) {
   7598             Py_DECREF(s);
   7599             return NULL;
   7600         }
   7601     }
   7602 
   7603     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
   7604 
   7605     Py_DECREF(s);
   7606     Py_XDECREF(sep);
   7607     return result;
   7608 }
   7609 
   7610 PyDoc_STRVAR(rsplit__doc__,
   7611              "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
   7612 \n\
   7613 Return a list of the words in S, using sep as the\n\
   7614 delimiter string, starting at the end of the string and\n\
   7615 working to the front.  If maxsplit is given, at most maxsplit\n\
   7616 splits are done. If sep is not specified, any whitespace string\n\
   7617 is a separator.");
   7618 
   7619 static PyObject*
   7620 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
   7621 {
   7622     PyObject *substring = Py_None;
   7623     Py_ssize_t maxcount = -1;
   7624 
   7625     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
   7626         return NULL;
   7627 
   7628     if (substring == Py_None)
   7629         return rsplit(self, NULL, maxcount);
   7630     else if (PyUnicode_Check(substring))
   7631         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
   7632     else
   7633         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
   7634 }
   7635 
   7636 PyDoc_STRVAR(splitlines__doc__,
   7637              "S.splitlines(keepends=False) -> list of strings\n\
   7638 \n\
   7639 Return a list of the lines in S, breaking at line boundaries.\n\
   7640 Line breaks are not included in the resulting list unless keepends\n\
   7641 is given and true.");
   7642 
   7643 static PyObject*
   7644 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
   7645 {
   7646     int keepends = 0;
   7647 
   7648     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
   7649         return NULL;
   7650 
   7651     return PyUnicode_Splitlines((PyObject *)self, keepends);
   7652 }
   7653 
   7654 static
   7655 PyObject *unicode_str(PyUnicodeObject *self)
   7656 {
   7657     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
   7658 }
   7659 
   7660 PyDoc_STRVAR(swapcase__doc__,
   7661              "S.swapcase() -> unicode\n\
   7662 \n\
   7663 Return a copy of S with uppercase characters converted to lowercase\n\
   7664 and vice versa.");
   7665 
   7666 static PyObject*
   7667 unicode_swapcase(PyUnicodeObject *self)
   7668 {
   7669     return fixup(self, fixswapcase);
   7670 }
   7671 
   7672 PyDoc_STRVAR(translate__doc__,
   7673              "S.translate(table) -> unicode\n\
   7674 \n\
   7675 Return a copy of the string S, where all characters have been mapped\n\
   7676 through the given translation table, which must be a mapping of\n\
   7677 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
   7678 Unmapped characters are left untouched. Characters mapped to None\n\
   7679 are deleted.");
   7680 
   7681 static PyObject*
   7682 unicode_translate(PyUnicodeObject *self, PyObject *table)
   7683 {
   7684     return PyUnicode_TranslateCharmap(self->str,
   7685                                       self->length,
   7686                                       table,
   7687                                       "ignore");
   7688 }
   7689 
   7690 PyDoc_STRVAR(upper__doc__,
   7691              "S.upper() -> unicode\n\
   7692 \n\
   7693 Return a copy of S converted to uppercase.");
   7694 
   7695 static PyObject*
   7696 unicode_upper(PyUnicodeObject *self)
   7697 {
   7698     return fixup(self, fixupper);
   7699 }
   7700 
   7701 PyDoc_STRVAR(zfill__doc__,
   7702              "S.zfill(width) -> unicode\n\
   7703 \n\
   7704 Pad a numeric string S with zeros on the left, to fill a field\n\
   7705 of the specified width. The string S is never truncated.");
   7706 
   7707 static PyObject *
   7708 unicode_zfill(PyUnicodeObject *self, PyObject *args)
   7709 {
   7710     Py_ssize_t fill;
   7711     PyUnicodeObject *u;
   7712 
   7713     Py_ssize_t width;
   7714     if (!PyArg_ParseTuple(args, "n:zfill", &width))
   7715         return NULL;
   7716 
   7717     if (self->length >= width) {
   7718         if (PyUnicode_CheckExact(self)) {
   7719             Py_INCREF(self);
   7720             return (PyObject*) self;
   7721         }
   7722         else
   7723             return PyUnicode_FromUnicode(
   7724                 PyUnicode_AS_UNICODE(self),
   7725                 PyUnicode_GET_SIZE(self)
   7726                 );
   7727     }
   7728 
   7729     fill = width - self->length;
   7730 
   7731     u = pad(self, fill, 0, '0');
   7732 
   7733     if (u == NULL)
   7734         return NULL;
   7735 
   7736     if (u->str[fill] == '+' || u->str[fill] == '-') {
   7737         /* move sign to beginning of string */
   7738         u->str[0] = u->str[fill];
   7739         u->str[fill] = '0';
   7740     }
   7741 
   7742     return (PyObject*) u;
   7743 }
   7744 
#if 0
/* Compiled out.  Returns the value of numfree as a Python int; the
   matching "freelistsize" method-table entry is likewise under #if 0
   and marked as a debugging aid. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(numfree);
}
#endif
   7752 
   7753 PyDoc_STRVAR(startswith__doc__,
   7754              "S.startswith(prefix[, start[, end]]) -> bool\n\
   7755 \n\
   7756 Return True if S starts with the specified prefix, False otherwise.\n\
   7757 With optional start, test S beginning at that position.\n\
   7758 With optional end, stop comparing S at that position.\n\
   7759 prefix can also be a tuple of strings to try.");
   7760 
   7761 static PyObject *
   7762 unicode_startswith(PyUnicodeObject *self,
   7763                    PyObject *args)
   7764 {
   7765     PyObject *subobj;
   7766     PyUnicodeObject *substring;
   7767     Py_ssize_t start = 0;
   7768     Py_ssize_t end = PY_SSIZE_T_MAX;
   7769     int result;
   7770 
   7771     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
   7772         return NULL;
   7773     if (PyTuple_Check(subobj)) {
   7774         Py_ssize_t i;
   7775         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
   7776             substring = (PyUnicodeObject *)PyUnicode_FromObject(
   7777                 PyTuple_GET_ITEM(subobj, i));
   7778             if (substring == NULL)
   7779                 return NULL;
   7780             result = tailmatch(self, substring, start, end, -1);
   7781             Py_DECREF(substring);
   7782             if (result) {
   7783                 Py_RETURN_TRUE;
   7784             }
   7785         }
   7786         /* nothing matched */
   7787         Py_RETURN_FALSE;
   7788     }
   7789     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
   7790     if (substring == NULL) {
   7791         if (PyErr_ExceptionMatches(PyExc_TypeError))
   7792             PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
   7793                          "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
   7794         return NULL;
   7795     }
   7796     result = tailmatch(self, substring, start, end, -1);
   7797     Py_DECREF(substring);
   7798     return PyBool_FromLong(result);
   7799 }
   7800 
   7801 
   7802 PyDoc_STRVAR(endswith__doc__,
   7803              "S.endswith(suffix[, start[, end]]) -> bool\n\
   7804 \n\
   7805 Return True if S ends with the specified suffix, False otherwise.\n\
   7806 With optional start, test S beginning at that position.\n\
   7807 With optional end, stop comparing S at that position.\n\
   7808 suffix can also be a tuple of strings to try.");
   7809 
   7810 static PyObject *
   7811 unicode_endswith(PyUnicodeObject *self,
   7812                  PyObject *args)
   7813 {
   7814     PyObject *subobj;
   7815     PyUnicodeObject *substring;
   7816     Py_ssize_t start = 0;
   7817     Py_ssize_t end = PY_SSIZE_T_MAX;
   7818     int result;
   7819 
   7820     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
   7821         return NULL;
   7822     if (PyTuple_Check(subobj)) {
   7823         Py_ssize_t i;
   7824         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
   7825             substring = (PyUnicodeObject *)PyUnicode_FromObject(
   7826                 PyTuple_GET_ITEM(subobj, i));
   7827             if (substring == NULL)
   7828                 return NULL;
   7829             result = tailmatch(self, substring, start, end, +1);
   7830             Py_DECREF(substring);
   7831             if (result) {
   7832                 Py_RETURN_TRUE;
   7833             }
   7834         }
   7835         Py_RETURN_FALSE;
   7836     }
   7837     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
   7838     if (substring == NULL) {
   7839         if (PyErr_ExceptionMatches(PyExc_TypeError))
   7840             PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
   7841                          "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
   7842         return NULL;
   7843     }
   7844     result = tailmatch(self, substring, start, end, +1);
   7845     Py_DECREF(substring);
   7846     return PyBool_FromLong(result);
   7847 }
   7848 
   7849 
   7850 /* Implements do_string_format, which is unicode because of stringlib */
   7851 #include "stringlib/string_format.h"
   7852 
   7853 PyDoc_STRVAR(format__doc__,
   7854              "S.format(*args, **kwargs) -> unicode\n\
   7855 \n\
   7856 Return a formatted version of S, using substitutions from args and kwargs.\n\
   7857 The substitutions are identified by braces ('{' and '}').");
   7858 
   7859 static PyObject *
   7860 unicode__format__(PyObject *self, PyObject *args)
   7861 {
   7862     PyObject *format_spec;
   7863     PyObject *result = NULL;
   7864     PyObject *tmp = NULL;
   7865 
   7866     /* If 2.x, convert format_spec to the same type as value */
   7867     /* This is to allow things like u''.format('') */
   7868     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
   7869         goto done;
   7870     if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
   7871         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
   7872                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
   7873         goto done;
   7874     }
   7875     tmp = PyObject_Unicode(format_spec);
   7876     if (tmp == NULL)
   7877         goto done;
   7878     format_spec = tmp;
   7879 
   7880     result = _PyUnicode_FormatAdvanced(self,
   7881                                        PyUnicode_AS_UNICODE(format_spec),
   7882                                        PyUnicode_GET_SIZE(format_spec));
   7883   done:
   7884     Py_XDECREF(tmp);
   7885     return result;
   7886 }
   7887 
   7888 PyDoc_STRVAR(p_format__doc__,
   7889              "S.__format__(format_spec) -> unicode\n\
   7890 \n\
   7891 Return a formatted version of S as described by format_spec.");
   7892 
   7893 static PyObject *
   7894 unicode__sizeof__(PyUnicodeObject *v)
   7895 {
   7896     return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
   7897                              sizeof(Py_UNICODE) * (v->length + 1));
   7898 }
   7899 
   7900 PyDoc_STRVAR(sizeof__doc__,
   7901              "S.__sizeof__() -> size of S in memory, in bytes\n\
   7902 \n\
   7903 ");
   7904 
   7905 static PyObject *
   7906 unicode_getnewargs(PyUnicodeObject *v)
   7907 {
   7908     return Py_BuildValue("(u#)", v->str, v->length);
   7909 }
   7910 
   7911 
/* Method table for the unicode type.  Each entry lists the Python-level
   method name, the C function implementing it, its calling-convention
   flags and its docstring; a few private entries omit the docstring.
   The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* Private helpers, registered without a docstring. */
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
#endif

    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
   7970 
   7971 static PyObject *
   7972 unicode_mod(PyObject *v, PyObject *w)
   7973 {
   7974     if (!PyUnicode_Check(v)) {
   7975         Py_INCREF(Py_NotImplemented);
   7976         return Py_NotImplemented;
   7977     }
   7978     return PyUnicode_Format(v, w);
   7979 }
   7980 
/* Number protocol table for unicode.  Only nb_remainder is filled in
   (with unicode_mod); the slots before it are explicitly 0 and the
   trailing slots are left to default initialization. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    0,              /*nb_divide*/
    unicode_mod,            /*nb_remainder*/
};
   7988 
/* Sequence protocol table for unicode.  The two assignment slots are 0,
   so no item or slice assignment handler is registered. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
   7999 
   8000 static PyObject*
   8001 unicode_subscript(PyUnicodeObject* self, PyObject* item)
   8002 {
   8003     if (PyIndex_Check(item)) {
   8004         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
   8005         if (i == -1 && PyErr_Occurred())
   8006             return NULL;
   8007         if (i < 0)
   8008             i += PyUnicode_GET_SIZE(self);
   8009         return unicode_getitem(self, i);
   8010     } else if (PySlice_Check(item)) {
   8011         Py_ssize_t start, stop, step, slicelength, cur, i;
   8012         Py_UNICODE* source_buf;
   8013         Py_UNICODE* result_buf;
   8014         PyObject* result;
   8015 
   8016         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
   8017                                  &start, &stop, &step, &slicelength) < 0) {
   8018             return NULL;
   8019         }
   8020 
   8021         if (slicelength <= 0) {
   8022             return PyUnicode_FromUnicode(NULL, 0);
   8023         } else if (start == 0 && step == 1 && slicelength == self->length &&
   8024                    PyUnicode_CheckExact(self)) {
   8025             Py_INCREF(self);
   8026             return (PyObject *)self;
   8027         } else if (step == 1) {
   8028             return PyUnicode_FromUnicode(self->str + start, slicelength);
   8029         } else {
   8030             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
   8031             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
   8032                                                        sizeof(Py_UNICODE));
   8033 
   8034             if (result_buf == NULL)
   8035                 return PyErr_NoMemory();
   8036 
   8037             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
   8038                 result_buf[i] = source_buf[cur];
   8039             }
   8040 
   8041             result = PyUnicode_FromUnicode(result_buf, slicelength);
   8042             PyObject_FREE(result_buf);
   8043             return result;
   8044         }
   8045     } else {
   8046         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
   8047         return NULL;
   8048     }
   8049 }
   8050 
/* Mapping protocol table for unicode: length and subscript lookup only;
   mp_ass_subscript is 0, so no subscript-assignment handler is
   registered. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
   8056 
   8057 static Py_ssize_t
   8058 unicode_buffer_getreadbuf(PyUnicodeObject *self,
   8059                           Py_ssize_t index,
   8060                           const void **ptr)
   8061 {
   8062     if (index != 0) {
   8063         PyErr_SetString(PyExc_SystemError,
   8064                         "accessing non-existent unicode segment");
   8065         return -1;
   8066     }
   8067     *ptr = (void *) self->str;
   8068     return PyUnicode_GET_DATA_SIZE(self);
   8069 }
   8070 
   8071 static Py_ssize_t
   8072 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
   8073                            const void **ptr)
   8074 {
   8075     PyErr_SetString(PyExc_TypeError,
   8076                     "cannot use unicode as modifiable buffer");
   8077     return -1;
   8078 }
   8079 
   8080 static int
   8081 unicode_buffer_getsegcount(PyUnicodeObject *self,
   8082                            Py_ssize_t *lenp)
   8083 {
   8084     if (lenp)
   8085         *lenp = PyUnicode_GET_DATA_SIZE(self);
   8086     return 1;
   8087 }
   8088 
   8089 static Py_ssize_t
   8090 unicode_buffer_getcharbuf(PyUnicodeObject *self,
   8091                           Py_ssize_t index,
   8092                           const void **ptr)
   8093 {
   8094     PyObject *str;
   8095 
   8096     if (index != 0) {
   8097         PyErr_SetString(PyExc_SystemError,
   8098                         "accessing non-existent unicode segment");
   8099         return -1;
   8100     }
   8101     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
   8102     if (str == NULL)
   8103         return -1;
   8104     *ptr = (void *) PyString_AS_STRING(str);
   8105     return PyString_GET_SIZE(str);
   8106 }
   8107 
   8108 /* Helpers for PyUnicode_Format() */
   8109 
   8110 static PyObject *
   8111 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
   8112 {
   8113     Py_ssize_t argidx = *p_argidx;
   8114     if (argidx < arglen) {
   8115         (*p_argidx)++;
   8116         if (arglen < 0)
   8117             return args;
   8118         else
   8119             return PyTuple_GetItem(args, argidx);
   8120     }
   8121     PyErr_SetString(PyExc_TypeError,
   8122                     "not enough arguments for format string");
   8123     return NULL;
   8124 }
   8125 
/* Bit flags carried in the `flags` argument of the format helpers;
   F_ALT, for example, is tested by formatfloat() and formatint(). */
#define F_LJUST (1<<0)
#define F_SIGN  (1<<1)
#define F_BLANK (1<<2)
#define F_ALT   (1<<3)
#define F_ZERO  (1<<4)
   8131 
   8132 static Py_ssize_t
   8133 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
   8134 {
   8135     register Py_ssize_t i;
   8136     Py_ssize_t len = strlen(charbuffer);
   8137     for (i = len - 1; i >= 0; i--)
   8138         buffer[i] = (Py_UNICODE) charbuffer[i];
   8139 
   8140     return len;
   8141 }
   8142 
   8143 static int
   8144 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
   8145 {
   8146     Py_ssize_t result;
   8147 
   8148     PyOS_snprintf((char *)buffer, len, format, x);
   8149     result = strtounicode(buffer, (char *)buffer);
   8150     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
   8151 }
   8152 
   8153 /* XXX To save some code duplication, formatfloat/long/int could have been
   8154    shared with stringobject.c, converting from 8-bit to Unicode after the
   8155    formatting is done. */
   8156 
   8157 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
   8158 
   8159 static PyObject *
   8160 formatfloat(PyObject *v, int flags, int prec, int type)
   8161 {
   8162     char *p;
   8163     PyObject *result;
   8164     double x;
   8165 
   8166     x = PyFloat_AsDouble(v);
   8167     if (x == -1.0 && PyErr_Occurred())
   8168         return NULL;
   8169 
   8170     if (prec < 0)
   8171         prec = 6;
   8172 
   8173     p = PyOS_double_to_string(x, type, prec,
   8174                               (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
   8175     if (p == NULL)
   8176         return NULL;
   8177     result = PyUnicode_FromStringAndSize(p, strlen(p));
   8178     PyMem_Free(p);
   8179     return result;
   8180 }
   8181 
   8182 static PyObject*
   8183 formatlong(PyObject *val, int flags, int prec, int type)
   8184 {
   8185     char *buf;
   8186     int i, len;
   8187     PyObject *str; /* temporary string object. */
   8188     PyUnicodeObject *result;
   8189 
   8190     str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
   8191     if (!str)
   8192         return NULL;
   8193     result = _PyUnicode_New(len);
   8194     if (!result) {
   8195         Py_DECREF(str);
   8196         return NULL;
   8197     }
   8198     for (i = 0; i < len; i++)
   8199         result->str[i] = buf[i];
   8200     result->str[len] = 0;
   8201     Py_DECREF(str);
   8202     return (PyObject*)result;
   8203 }
   8204 
   8205 static int
   8206 formatint(Py_UNICODE *buf,
   8207           size_t buflen,
   8208           int flags,
   8209           int prec,
   8210           int type,
   8211           PyObject *v)
   8212 {
   8213     /* fmt = '%#.' + `prec` + 'l' + `type`
   8214      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
   8215      *                     + 1 + 1
   8216      *                   = 24
   8217      */
   8218     char fmt[64]; /* plenty big enough! */
   8219     char *sign;
   8220     long x;
   8221 
   8222     x = PyInt_AsLong(v);
   8223     if (x == -1 && PyErr_Occurred())
   8224         return -1;
   8225     if (x < 0 && type == 'u') {
   8226         type = 'd';
   8227     }
   8228     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
   8229         sign = "-";
   8230     else
   8231         sign = "";
   8232     if (prec < 0)
   8233         prec = 1;
   8234 
   8235     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
   8236      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
   8237      */
   8238     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
   8239         PyErr_SetString(PyExc_OverflowError,
   8240                         "formatted integer is too long (precision too large?)");
   8241         return -1;
   8242     }
   8243 
   8244     if ((flags & F_ALT) &&
   8245         (type == 'x' || type == 'X')) {
   8246         /* When converting under %#x or %#X, there are a number
   8247          * of issues that cause pain:
   8248          * - when 0 is being converted, the C standard leaves off
   8249          *   the '0x' or '0X', which is inconsistent with other
   8250          *   %#x/%#X conversions and inconsistent with Python's
   8251          *   hex() function
   8252          * - there are platforms that violate the standard and
   8253          *   convert 0 with the '0x' or '0X'
   8254          *   (Metrowerks, Compaq Tru64)
   8255          * - there are platforms that give '0x' when converting
   8256          *   under %#X, but convert 0 in accordance with the
   8257          *   standard (OS/2 EMX)
   8258          *
   8259          * We can achieve the desired consistency by inserting our
   8260          * own '0x' or '0X' prefix, and substituting %x/%X in place
   8261          * of %#x/%#X.
   8262          *
   8263          * Note that this is the same approach as used in
   8264          * formatint() in stringobject.c
   8265          */
   8266         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
   8267                       sign, type, prec, type);
   8268     }
   8269     else {
   8270         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
   8271                       sign, (flags&F_ALT) ? "#" : "",
   8272                       prec, type);
   8273     }
   8274     if (sign[0])
   8275         return longtounicode(buf, buflen, fmt, -x);
   8276     else
   8277         return longtounicode(buf, buflen, fmt, x);
   8278 }
   8279 
   8280 static int
   8281 formatchar(Py_UNICODE *buf,
   8282            size_t buflen,
   8283            PyObject *v)
   8284 {
   8285     PyObject *unistr;
   8286     char *str;
   8287     /* presume that the buffer is at least 2 characters long */
   8288     if (PyUnicode_Check(v)) {
   8289         if (PyUnicode_GET_SIZE(v) != 1)
   8290             goto onError;
   8291         buf[0] = PyUnicode_AS_UNICODE(v)[0];
   8292     }
   8293 
   8294     else if (PyString_Check(v)) {
   8295         if (PyString_GET_SIZE(v) != 1)
   8296             goto onError;
   8297         /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
   8298            with a UnicodeDecodeError if 'char' is not decodable with the
   8299            default encoding (usually ASCII, but it might be something else) */
   8300         str = PyString_AS_STRING(v);
   8301         if ((unsigned char)str[0] > 0x7F) {
   8302             /* the char is not ASCII; try to decode the string using the
   8303                default encoding and return -1 to let the UnicodeDecodeError
   8304                be raised if the string can't be decoded */
   8305             unistr = PyUnicode_Decode(str, 1, NULL, "strict");
   8306             if (unistr == NULL)
   8307                 return -1;
   8308             buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
   8309             Py_DECREF(unistr);
   8310         }
   8311         else
   8312             buf[0] = (Py_UNICODE)str[0];
   8313     }
   8314 
   8315     else {
   8316         /* Integer input truncated to a character */
   8317         long x;
   8318         x = PyInt_AsLong(v);
   8319         if (x == -1 && PyErr_Occurred())
   8320             goto onError;
   8321 #ifdef Py_UNICODE_WIDE
   8322         if (x < 0 || x > 0x10ffff) {
   8323             PyErr_SetString(PyExc_OverflowError,
   8324                             "%c arg not in range(0x110000) "
   8325                             "(wide Python build)");
   8326             return -1;
   8327         }
   8328 #else
   8329         if (x < 0 || x > 0xffff) {
   8330             PyErr_SetString(PyExc_OverflowError,
   8331                             "%c arg not in range(0x10000) "
   8332                             "(narrow Python build)");
   8333             return -1;
   8334         }
   8335 #endif
   8336         buf[0] = (Py_UNICODE) x;
   8337     }
   8338     buf[1] = '\0';
   8339     return 1;
   8340 
   8341   onError:
   8342     PyErr_SetString(PyExc_TypeError,
   8343                     "%c requires int or char");
   8344     return -1;
   8345 }
   8346 
   8347 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   8348 
   8349    FORMATBUFLEN is the length of the buffer in which the ints &
   8350    chars are formatted. XXX This is a magic number. Each formatting
   8351    routine does bounds checking to ensure no overflow, but a better
   8352    solution may be to malloc a buffer of appropriate size for each
   8353    format. For now, the current solution is sufficient.
   8354 */
   8355 #define FORMATBUFLEN (size_t)120
   8356 
/* Implement printf-style formatting ("format % args") producing unicode.

   format is passed through PyUnicode_FromObject() to obtain the template.
   args is a tuple of values, a single non-tuple value, or an object with a
   mapping subscript slot that serves "%(key)..." lookups.

   Returns the formatted unicode object, or NULL with an exception set.
   Neither argument reference is consumed. */
PyObject *PyUnicode_Format(PyObject *format,
                           PyObject *args)
{
    Py_UNICODE *fmt, *res;
    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
    int args_owned = 0;
    PyUnicodeObject *result = NULL;
    PyObject *dict = NULL;
    PyObject *uformat;

    if (format == NULL || args == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }
    uformat = PyUnicode_FromObject(format);
    if (uformat == NULL)
        return NULL;
    fmt = PyUnicode_AS_UNICODE(uformat);
    fmtcnt = PyUnicode_GET_SIZE(uformat);

    /* reslen is the allocated length of result; rescnt counts the slots of
       it that are still unused.  Start with the template length plus some
       slack. */
    reslen = rescnt = fmtcnt + 100;
    result = _PyUnicode_New(reslen);
    if (result == NULL)
        goto onError;
    res = PyUnicode_AS_UNICODE(result);

    if (PyTuple_Check(args)) {
        arglen = PyTuple_Size(args);
        argidx = 0;
    }
    else {
        /* Non-tuple args: -1 / -2 are sentinel values handed to getnextarg()
           (defined outside this function) -- presumably marking args as a
           single value; confirm against getnextarg() */
        arglen = -1;
        argidx = -2;
    }
    /* Anything subscriptable through the mapping protocol, other than
       tuples and basestring instances, may serve %(key) lookups */
    if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
        !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
        dict = args;

    while (--fmtcnt >= 0) {
        if (*fmt != '%') {
            /* Literal character: copy it, growing the result if it is full */
            if (--rescnt < 0) {
                rescnt = fmtcnt + 100;
                reslen += rescnt;
                if (_PyUnicode_Resize(&result, reslen) < 0)
                    goto onError;
                /* the resize may have moved the buffer: recompute the
                   write position from the counters */
                res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
                --rescnt;
            }
            *res++ = *fmt++;
        }
        else {
            /* Got a format specifier */
            int flags = 0;
            Py_ssize_t width = -1;
            int prec = -1;
            Py_UNICODE c = '\0';
            Py_UNICODE fill;
            int isnumok;
            PyObject *v = NULL;
            PyObject *temp = NULL;
            Py_UNICODE *pbuf;
            Py_UNICODE sign;
            Py_ssize_t len;
            Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */

            fmt++;
            if (*fmt == '(') {
                /* "%(key)...": look the key up in the mapping and use the
                   value found as the argument of this conversion */
                Py_UNICODE *keystart;
                Py_ssize_t keylen;
                PyObject *key;
                int pcount = 1;

                if (dict == NULL) {
                    PyErr_SetString(PyExc_TypeError,
                                    "format requires a mapping");
                    goto onError;
                }
                ++fmt;
                --fmtcnt;
                keystart = fmt;
                /* Skip over balanced parentheses */
                while (pcount > 0 && --fmtcnt >= 0) {
                    if (*fmt == ')')
                        --pcount;
                    else if (*fmt == '(')
                        ++pcount;
                    fmt++;
                }
                /* fmt is now one past the closing ')'; exclude it */
                keylen = fmt - keystart - 1;
                if (fmtcnt < 0 || pcount > 0) {
                    PyErr_SetString(PyExc_ValueError,
                                    "incomplete format key");
                    goto onError;
                }
#if 0
                /* keys are converted to strings using UTF-8 and
                   then looked up since Python uses strings to hold
                   variables names etc. in its namespaces and we
                   wouldn't want to break common idioms. */
                key = PyUnicode_EncodeUTF8(keystart,
                                           keylen,
                                           NULL);
#else
                key = PyUnicode_FromUnicode(keystart, keylen);
#endif
                if (key == NULL)
                    goto onError;
                /* drop the value fetched for a previous %(key) specifier */
                if (args_owned) {
                    Py_DECREF(args);
                    args_owned = 0;
                }
                args = PyObject_GetItem(dict, key);
                Py_DECREF(key);
                if (args == NULL) {
                    goto onError;
                }
                /* args now holds a reference owned by this function and is
                   given the same sentinels as a non-tuple argument */
                args_owned = 1;
                arglen = -1;
                argidx = -2;
            }
            /* Conversion flags; the loop exits with c holding the first
               character that is not a flag */
            while (--fmtcnt >= 0) {
                switch (c = *fmt++) {
                case '-': flags |= F_LJUST; continue;
                case '+': flags |= F_SIGN; continue;
                case ' ': flags |= F_BLANK; continue;
                case '#': flags |= F_ALT; continue;
                case '0': flags |= F_ZERO; continue;
                }
                break;
            }
            /* Field width: '*' takes it from the next argument, otherwise
               it is parsed from decimal digits */
            if (c == '*') {
                v = getnextarg(args, arglen, &argidx);
                if (v == NULL)
                    goto onError;
                if (!PyInt_Check(v)) {
                    PyErr_SetString(PyExc_TypeError,
                                    "* wants int");
                    goto onError;
                }
                width = PyInt_AsSsize_t(v);
                if (width == -1 && PyErr_Occurred())
                    goto onError;
                /* a negative width requests left justification */
                if (width < 0) {
                    flags |= F_LJUST;
                    width = -width;
                }
                if (--fmtcnt >= 0)
                    c = *fmt++;
            }
            else if (c >= '0' && c <= '9') {
                width = c - '0';
                while (--fmtcnt >= 0) {
                    c = *fmt++;
                    if (c < '0' || c > '9')
                        break;
                    /* reject the digit before width*10 + digit can exceed
                       PY_SSIZE_T_MAX */
                    if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
                        PyErr_SetString(PyExc_ValueError,
                                        "width too big");
                        goto onError;
                    }
                    width = width*10 + (c - '0');
                }
            }
            /* Precision: ".", followed by '*' or decimal digits; a bare "."
               leaves the precision at 0 */
            if (c == '.') {
                prec = 0;
                if (--fmtcnt >= 0)
                    c = *fmt++;
                if (c == '*') {
                    v = getnextarg(args, arglen, &argidx);
                    if (v == NULL)
                        goto onError;
                    if (!PyInt_Check(v)) {
                        PyErr_SetString(PyExc_TypeError,
                                        "* wants int");
                        goto onError;
                    }
                    prec = _PyInt_AsInt(v);
                    if (prec == -1 && PyErr_Occurred())
                        goto onError;
                    /* a negative precision is clamped to 0 */
                    if (prec < 0)
                        prec = 0;
                    if (--fmtcnt >= 0)
                        c = *fmt++;
                }
                else if (c >= '0' && c <= '9') {
                    prec = c - '0';
                    while (--fmtcnt >= 0) {
                        c = *fmt++;
                        if (c < '0' || c > '9')
                            break;
                        /* same overflow guard as for the width, against
                           INT_MAX */
                        if (prec > (INT_MAX - ((int)c - '0')) / 10) {
                            PyErr_SetString(PyExc_ValueError,
                                            "prec too big");
                            goto onError;
                        }
                        prec = prec*10 + (c - '0');
                    }
                }
            } /* prec */
            /* A length modifier (h, l or L) is accepted and skipped */
            if (fmtcnt >= 0) {
                if (c == 'h' || c == 'l' || c == 'L') {
                    if (--fmtcnt >= 0)
                        c = *fmt++;
                }
            }
            /* The template ended before a conversion character was seen */
            if (fmtcnt < 0) {
                PyErr_SetString(PyExc_ValueError,
                                "incomplete format");
                goto onError;
            }
            /* Every conversion except "%%" consumes one argument */
            if (c != '%') {
                v = getnextarg(args, arglen, &argidx);
                if (v == NULL)
                    goto onError;
            }
            sign = 0;
            fill = ' ';
            /* Each case leaves the text to emit in pbuf/len.  temp, when
               set, owns the object that backs pbuf. */
            switch (c) {

            case '%':
                pbuf = formatbuf;
                /* presume that buffer length is at least 1 */
                pbuf[0] = '%';
                len = 1;
                break;

            case 's':
            case 'r':
                /* %s on an exact unicode object uses it directly; everything
                   else goes through PyObject_Unicode() or PyObject_Repr() */
                if (PyUnicode_CheckExact(v) && c == 's') {
                    temp = v;
                    Py_INCREF(temp);
                }
                else {
                    PyObject *unicode;
                    if (c == 's')
                        temp = PyObject_Unicode(v);
                    else
                        temp = PyObject_Repr(v);
                    if (temp == NULL)
                        goto onError;
                    if (PyUnicode_Check(temp))
                        /* nothing to do */;
                    else if (PyString_Check(temp)) {
                        /* convert to string to Unicode */
                        unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
                                                   PyString_GET_SIZE(temp),
                                                   NULL,
                                                   "strict");
                        Py_DECREF(temp);
                        temp = unicode;
                        if (temp == NULL)
                            goto onError;
                    }
                    else {
                        Py_DECREF(temp);
                        PyErr_SetString(PyExc_TypeError,
                                        "%s argument has non-string str()");
                        goto onError;
                    }
                }
                pbuf = PyUnicode_AS_UNICODE(temp);
                len = PyUnicode_GET_SIZE(temp);
                /* a precision truncates the text */
                if (prec >= 0 && len > prec)
                    len = prec;
                break;

            case 'i':
            case 'd':
            case 'u':
            case 'o':
            case 'x':
            case 'X':
                /* %i is handled exactly like %d */
                if (c == 'i')
                    c = 'd';
                isnumok = 0;
                if (PyNumber_Check(v)) {
                    PyObject *iobj=NULL;

                    if (PyInt_Check(v) || (PyLong_Check(v))) {
                        iobj = v;
                        Py_INCREF(iobj);
                    }
                    else {
                        /* try int() first and fall back to long() */
                        iobj = PyNumber_Int(v);
                        if (iobj==NULL) {
                            PyErr_Clear();
                            iobj = PyNumber_Long(v);
                        }
                    }
                    if (iobj!=NULL) {
                        if (PyInt_Check(iobj)) {
                            /* plain ints are formatted into formatbuf */
                            isnumok = 1;
                            pbuf = formatbuf;
                            len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
                                            flags, prec, c, iobj);
                            Py_DECREF(iobj);
                            if (len < 0)
                                goto onError;
                            /* nonzero marks "may carry a sign"; the actual
                               sign character is worked out after the
                               switch */
                            sign = 1;
                        }
                        else if (PyLong_Check(iobj)) {
                            /* longs are formatted into a new object kept
                               in temp */
                            isnumok = 1;
                            temp = formatlong(iobj, flags, prec, c);
                            Py_DECREF(iobj);
                            if (!temp)
                                goto onError;
                            pbuf = PyUnicode_AS_UNICODE(temp);
                            len = PyUnicode_GET_SIZE(temp);
                            sign = 1;
                        }
                        else {
                            Py_DECREF(iobj);
                        }
                    }
                }
                /* Any error left pending by the conversions above is
                   replaced by this TypeError */
                if (!isnumok) {
                    PyErr_Format(PyExc_TypeError,
                                 "%%%c format: a number is required, "
                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
                    goto onError;
                }
                if (flags & F_ZERO)
                    fill = '0';
                break;

            case 'e':
            case 'E':
            case 'f':
            case 'F':
            case 'g':
            case 'G':
                temp = formatfloat(v, flags, prec, c);
                if (temp == NULL)
                    goto onError;
                pbuf = PyUnicode_AS_UNICODE(temp);
                len = PyUnicode_GET_SIZE(temp);
                sign = 1;
                if (flags & F_ZERO)
                    fill = '0';
                break;

            case 'c':
                pbuf = formatbuf;
                len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
                if (len < 0)
                    goto onError;
                break;

            default:
                /* fmt has already moved past the offending character,
                   hence the "fmt - 1" when reporting its index */
                PyErr_Format(PyExc_ValueError,
                             "unsupported format character '%c' (0x%x) "
                             "at index %zd",
                             (31<=c && c<=126) ? (char)c : '?',
                             (int)c,
                             (Py_ssize_t)(fmt - 1 -
                                          PyUnicode_AS_UNICODE(uformat)));
                goto onError;
            }
            /* Numeric conversions: strip a leading sign from the text and
               remember it, or take one from the '+' / ' ' flags */
            if (sign) {
                if (*pbuf == '-' || *pbuf == '+') {
                    sign = *pbuf++;
                    len--;
                }
                else if (flags & F_SIGN)
                    sign = '+';
                else if (flags & F_BLANK)
                    sign = ' ';
                else
                    sign = 0;
            }
            if (width < len)
                width = len;
            /* Grow the result when the field (plus a sign, if any) does not
               fit into the unused space */
            if (rescnt - (sign != 0) < width) {
                reslen -= rescnt;
                rescnt = width + fmtcnt + 100;
                reslen += rescnt;
                /* a negative length means the additions above overflowed */
                if (reslen < 0) {
                    Py_XDECREF(temp);
                    PyErr_NoMemory();
                    goto onError;
                }
                if (_PyUnicode_Resize(&result, reslen) < 0) {
                    Py_XDECREF(temp);
                    goto onError;
                }
                res = PyUnicode_AS_UNICODE(result)
                    + reslen - rescnt;
            }
            /* With a non-space fill the sign is written before the padding;
               with space fill it is written after it (further below) */
            if (sign) {
                if (fill != ' ')
                    *res++ = sign;
                rescnt--;
                if (width > len)
                    width--;
            }
            /* The "0x"/"0X" prefix of an alternate-form hex number is
               placed relative to the padding the same way as the sign */
            if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
                assert(pbuf[0] == '0');
                assert(pbuf[1] == c);
                if (fill != ' ') {
                    *res++ = *pbuf++;
                    *res++ = *pbuf++;
                }
                rescnt -= 2;
                width -= 2;
                if (width < 0)
                    width = 0;
                len -= 2;
            }
            /* Left padding, unless the field is left-justified */
            if (width > len && !(flags & F_LJUST)) {
                do {
                    --rescnt;
                    *res++ = fill;
                } while (--width > len);
            }
            if (fill == ' ') {
                if (sign)
                    *res++ = sign;
                if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
                    assert(pbuf[0] == '0');
                    assert(pbuf[1] == c);
                    *res++ = *pbuf++;
                    *res++ = *pbuf++;
                }
            }
            /* The converted text itself */
            Py_UNICODE_COPY(res, pbuf, len);
            res += len;
            rescnt -= len;
            /* Right padding (always spaces) for left-justified fields */
            while (--width >= len) {
                --rescnt;
                *res++ = ' ';
            }
            if (dict && (argidx < arglen) && c != '%') {
                PyErr_SetString(PyExc_TypeError,
                                "not all arguments converted during string formatting");
                Py_XDECREF(temp);
                goto onError;
            }
            Py_XDECREF(temp);
        } /* '%' */
    } /* until end */
    /* Positional arguments left over after the whole template was consumed */
    if (argidx < arglen && !dict) {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        goto onError;
    }

    /* Trim the result to the characters actually written */
    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
        goto onError;
    if (args_owned) {
        Py_DECREF(args);
    }
    Py_DECREF(uformat);
    return (PyObject *)result;

  onError:
    Py_XDECREF(result);
    Py_DECREF(uformat);
    if (args_owned) {
        Py_DECREF(args);
    }
    return NULL;
}
   8819 
/* Buffer-procedure table for unicode objects; PyUnicode_Type refers to it
   through its tp_as_buffer slot. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,    /* read-buffer handler */
    (writebufferproc) unicode_buffer_getwritebuf,  /* write-buffer handler */
    (segcountproc) unicode_buffer_getsegcount,     /* segment-count handler */
    (charbufferproc) unicode_buffer_getcharbuf,    /* char-buffer handler */
};
   8826 
   8827 static PyObject *
   8828 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
   8829 
   8830 static PyObject *
   8831 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
   8832 {
   8833     PyObject *x = NULL;
   8834     static char *kwlist[] = {"string", "encoding", "errors", 0};
   8835     char *encoding = NULL;
   8836     char *errors = NULL;
   8837 
   8838     if (type != &PyUnicode_Type)
   8839         return unicode_subtype_new(type, args, kwds);
   8840     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
   8841                                      kwlist, &x, &encoding, &errors))
   8842         return NULL;
   8843     if (x == NULL)
   8844         return (PyObject *)_PyUnicode_New(0);
   8845     if (encoding == NULL && errors == NULL)
   8846         return PyObject_Unicode(x);
   8847     else
   8848         return PyUnicode_FromEncodedObject(x, encoding, errors);
   8849 }
   8850 
   8851 static PyObject *
   8852 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
   8853 {
   8854     PyUnicodeObject *tmp, *pnew;
   8855     Py_ssize_t n;
   8856 
   8857     assert(PyType_IsSubtype(type, &PyUnicode_Type));
   8858     tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
   8859     if (tmp == NULL)
   8860         return NULL;
   8861     assert(PyUnicode_Check(tmp));
   8862     pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
   8863     if (pnew == NULL) {
   8864         Py_DECREF(tmp);
   8865         return NULL;
   8866     }
   8867     pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
   8868     if (pnew->str == NULL) {
   8869         _Py_ForgetReference((PyObject *)pnew);
   8870         PyObject_Del(pnew);
   8871         Py_DECREF(tmp);
   8872         return PyErr_NoMemory();
   8873     }
   8874     Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
   8875     pnew->length = n;
   8876     pnew->hash = tmp->hash;
   8877     Py_DECREF(tmp);
   8878     return (PyObject *)pnew;
   8879 }
   8880 
/* Docstring of the unicode type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "unicode(object='') -> unicode object\n\
unicode(string[, encoding[, errors]]) -> unicode object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
   8888 
/* Type object of the "unicode" type.  It derives from PyBaseString_Type,
   may itself be subclassed (Py_TPFLAGS_BASETYPE) and creates its instances
   through unicode_new. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_compare */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    &unicode_as_buffer,         /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    0,                  /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseString_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
   8932 
   8933 /* Initialize the Unicode implementation */
   8934 
   8935 void _PyUnicode_Init(void)
   8936 {
   8937     /* XXX - move this array to unicodectype.c ? */
   8938     Py_UNICODE linebreak[] = {
   8939         0x000A, /* LINE FEED */
   8940         0x000D, /* CARRIAGE RETURN */
   8941         0x001C, /* FILE SEPARATOR */
   8942         0x001D, /* GROUP SEPARATOR */
   8943         0x001E, /* RECORD SEPARATOR */
   8944         0x0085, /* NEXT LINE */
   8945         0x2028, /* LINE SEPARATOR */
   8946         0x2029, /* PARAGRAPH SEPARATOR */
   8947     };
   8948 
   8949     /* Init the implementation */
   8950     if (!unicode_empty) {
   8951         unicode_empty = _PyUnicode_New(0);
   8952         if (!unicode_empty)
   8953             return;
   8954     }
   8955 
   8956     if (PyType_Ready(&PyUnicode_Type) < 0)
   8957         Py_FatalError("Can't initialize 'unicode'");
   8958 
   8959     /* initialize the linebreak bloom filter */
   8960     bloom_linebreak = make_bloom_mask(
   8961         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
   8962         );
   8963 
   8964     PyType_Ready(&EncodingMapType);
   8965 
   8966     if (PyType_Ready(&PyFieldNameIter_Type) < 0)
   8967         Py_FatalError("Can't initialize field name iterator type");
   8968 
   8969     if (PyType_Ready(&PyFormatterIter_Type) < 0)
   8970         Py_FatalError("Can't initialize formatter iter type");
   8971 }
   8972 
   8973 /* Finalize the Unicode implementation */
   8974 
   8975 int
   8976 PyUnicode_ClearFreeList(void)
   8977 {
   8978     int freelist_size = numfree;
   8979     PyUnicodeObject *u;
   8980 
   8981     for (u = free_list; u != NULL;) {
   8982         PyUnicodeObject *v = u;
   8983         u = *(PyUnicodeObject **)u;
   8984         if (v->str)
   8985             PyObject_DEL(v->str);
   8986         Py_XDECREF(v->defenc);
   8987         PyObject_Del(v);
   8988         numfree--;
   8989     }
   8990     free_list = NULL;
   8991     assert(numfree == 0);
   8992     return freelist_size;
   8993 }
   8994 
   8995 void
   8996 _PyUnicode_Fini(void)
   8997 {
   8998     int i;
   8999 
   9000     Py_CLEAR(unicode_empty);
   9001 
   9002     for (i = 0; i < 256; i++)
   9003         Py_CLEAR(unicode_latin1[i]);
   9004 
   9005     (void)PyUnicode_ClearFreeList();
   9006 }
   9007 
   9008 #ifdef __cplusplus
   9009 }
   9010 #endif
   9011