Home | History | Annotate | Download | only in Objects
      1 /*
      2 
      3 Unicode implementation based on original code by Fredrik Lundh,
      4 modified by Marc-Andre Lemburg <mal (at) lemburg.com> according to the
      5 Unicode Integration Proposal (see file Misc/unicode.txt).
      6 
      7 Major speed upgrades to the method implementations at the Reykjavik
      8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
      9 
     10 Copyright (c) Corporation for National Research Initiatives.
     11 
     12 --------------------------------------------------------------------
     13 The original string type implementation is:
     14 
     15   Copyright (c) 1999 by Secret Labs AB
     16   Copyright (c) 1999 by Fredrik Lundh
     17 
     18 By obtaining, using, and/or copying this software and/or its
     19 associated documentation, you agree that you have read, understood,
     20 and will comply with the following terms and conditions:
     21 
     22 Permission to use, copy, modify, and distribute this software and its
     23 associated documentation for any purpose and without fee is hereby
     24 granted, provided that the above copyright notice appears in all
     25 copies, and that both that copyright notice and this permission notice
     26 appear in supporting documentation, and that the name of Secret Labs
     27 AB or the author not be used in advertising or publicity pertaining to
     28 distribution of the software without specific, written prior
     29 permission.
     30 
     31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
     32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
     33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
     34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
     37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     38 --------------------------------------------------------------------
     39 
     40 */
     41 
     42 #define PY_SSIZE_T_CLEAN
     43 #include "Python.h"
     44 
     45 #include "unicodeobject.h"
     46 #include "ucnhash.h"
     47 
     48 #ifdef MS_WINDOWS
     49 #include <windows.h>
     50 #endif
     51 
     52 /* Limit for the Unicode object free list */
     53 
     54 #define PyUnicode_MAXFREELIST       1024
     55 
     56 /* Limit for the Unicode object free list stay alive optimization.
     57 
     58    The implementation will keep allocated Unicode memory intact for
     59    all objects on the free list having a size less than this
     60    limit. This reduces malloc() overhead for small Unicode objects.
     61 
     62    At worst this will result in PyUnicode_MAXFREELIST *
     63    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
     64    malloc()-overhead) bytes of unused garbage.
     65 
     66    Setting the limit to 0 effectively turns the feature off.
     67 
     68    Note: This is an experimental feature ! If you get core dumps when
     69    using Unicode objects, turn this feature off.
     70 
     71 */
     72 
     73 #define KEEPALIVE_SIZE_LIMIT       9
     74 
     75 /* Endianness switches; defaults to little endian */
     76 
     77 #ifdef WORDS_BIGENDIAN
     78 # define BYTEORDER_IS_BIG_ENDIAN
     79 #else
     80 # define BYTEORDER_IS_LITTLE_ENDIAN
     81 #endif
     82 
     83 /* --- Globals ------------------------------------------------------------
     84 
     85 NOTE: In the interpreter's initialization phase, some globals are currently
     86       initialized dynamically as needed. In the process Unicode objects may
     87       be created before the Unicode type is ready.
     88 
     89 */
     90 
     91 
     92 #ifdef __cplusplus
     93 extern "C" {
     94 #endif
     95 
     96 /* Free list for Unicode objects */
     97 static PyUnicodeObject *free_list = NULL;
     98 static int numfree = 0;
     99 
    100 /* The empty Unicode object is shared to improve performance. */
    101 static PyUnicodeObject *unicode_empty = NULL;
    102 
    103 #define _Py_RETURN_UNICODE_EMPTY()                      \
    104     do {                                                \
    105         if (unicode_empty != NULL)                      \
    106             Py_INCREF(unicode_empty);                   \
    107         else {                                          \
    108             unicode_empty = _PyUnicode_New(0);          \
    109             if (unicode_empty != NULL)                  \
    110                 Py_INCREF(unicode_empty);               \
    111         }                                               \
    112         return (PyObject *)unicode_empty;               \
    113     } while (0)
    114 
    115 /* Single character Unicode strings in the Latin-1 range are being
    116    shared as well. */
    117 static PyUnicodeObject *unicode_latin1[256] = {NULL};
    118 
    119 /* Default encoding to use and assume when NULL is passed as encoding
    120    parameter; it is initialized by _PyUnicode_Init().
    121 
    122    Always use the PyUnicode_SetDefaultEncoding() and
    123    PyUnicode_GetDefaultEncoding() APIs to access this global.
    124 
    125 */
    126 static char unicode_default_encoding[100 + 1] = "ascii";
    127 
    128 /* Fast detection of the most frequent whitespace characters */
    129 const unsigned char _Py_ascii_whitespace[] = {
    130     0, 0, 0, 0, 0, 0, 0, 0,
    131 /*     case 0x0009: * CHARACTER TABULATION */
    132 /*     case 0x000A: * LINE FEED */
    133 /*     case 0x000B: * LINE TABULATION */
    134 /*     case 0x000C: * FORM FEED */
    135 /*     case 0x000D: * CARRIAGE RETURN */
    136     0, 1, 1, 1, 1, 1, 0, 0,
    137     0, 0, 0, 0, 0, 0, 0, 0,
    138 /*     case 0x001C: * FILE SEPARATOR */
    139 /*     case 0x001D: * GROUP SEPARATOR */
    140 /*     case 0x001E: * RECORD SEPARATOR */
    141 /*     case 0x001F: * UNIT SEPARATOR */
    142     0, 0, 0, 0, 1, 1, 1, 1,
    143 /*     case 0x0020: * SPACE */
    144     1, 0, 0, 0, 0, 0, 0, 0,
    145     0, 0, 0, 0, 0, 0, 0, 0,
    146     0, 0, 0, 0, 0, 0, 0, 0,
    147     0, 0, 0, 0, 0, 0, 0, 0,
    148 
    149     0, 0, 0, 0, 0, 0, 0, 0,
    150     0, 0, 0, 0, 0, 0, 0, 0,
    151     0, 0, 0, 0, 0, 0, 0, 0,
    152     0, 0, 0, 0, 0, 0, 0, 0,
    153     0, 0, 0, 0, 0, 0, 0, 0,
    154     0, 0, 0, 0, 0, 0, 0, 0,
    155     0, 0, 0, 0, 0, 0, 0, 0,
    156     0, 0, 0, 0, 0, 0, 0, 0
    157 };
    158 
    159 /* Same for linebreaks */
    160 static unsigned char ascii_linebreak[] = {
    161     0, 0, 0, 0, 0, 0, 0, 0,
    162 /*         0x000A, * LINE FEED */
    163 /*         0x000B, * LINE TABULATION */
    164 /*         0x000C, * FORM FEED */
    165 /*         0x000D, * CARRIAGE RETURN */
    166     0, 0, 1, 1, 1, 1, 0, 0,
    167     0, 0, 0, 0, 0, 0, 0, 0,
    168 /*         0x001C, * FILE SEPARATOR */
    169 /*         0x001D, * GROUP SEPARATOR */
    170 /*         0x001E, * RECORD SEPARATOR */
    171     0, 0, 0, 0, 1, 1, 1, 0,
    172     0, 0, 0, 0, 0, 0, 0, 0,
    173     0, 0, 0, 0, 0, 0, 0, 0,
    174     0, 0, 0, 0, 0, 0, 0, 0,
    175     0, 0, 0, 0, 0, 0, 0, 0,
    176 
    177     0, 0, 0, 0, 0, 0, 0, 0,
    178     0, 0, 0, 0, 0, 0, 0, 0,
    179     0, 0, 0, 0, 0, 0, 0, 0,
    180     0, 0, 0, 0, 0, 0, 0, 0,
    181     0, 0, 0, 0, 0, 0, 0, 0,
    182     0, 0, 0, 0, 0, 0, 0, 0,
    183     0, 0, 0, 0, 0, 0, 0, 0,
    184     0, 0, 0, 0, 0, 0, 0, 0
    185 };
    186 
    187 
    188 Py_UNICODE
    189 PyUnicode_GetMax(void)
    190 {
    191 #ifdef Py_UNICODE_WIDE
    192     return 0x10FFFF;
    193 #else
    194     /* This is actually an illegal character, so it should
    195        not be passed to unichr. */
    196     return 0xFFFF;
    197 #endif
    198 }
    199 
    200 /* --- Bloom Filters ----------------------------------------------------- */
    201 
    202 /* stuff to implement simple "bloom filters" for Unicode characters.
    203    to keep things simple, we use a single bitmask, using the least 5
    204    bits from each unicode characters as the bit index. */
    205 
    206 /* the linebreak mask is set up by Unicode_Init below */
    207 
    208 #if LONG_BIT >= 128
    209 #define BLOOM_WIDTH 128
    210 #elif LONG_BIT >= 64
    211 #define BLOOM_WIDTH 64
    212 #elif LONG_BIT >= 32
    213 #define BLOOM_WIDTH 32
    214 #else
    215 #error "LONG_BIT is smaller than 32"
    216 #endif
    217 
    218 #define BLOOM_MASK unsigned long
    219 
    220 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
    221 
    222 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
    223 #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
    224 
    225 #define BLOOM_LINEBREAK(ch)                                             \
    226     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
    227      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
    228 
    229 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
    230 {
    231     /* calculate simple bloom-style bitmask for a given unicode string */
    232 
    233     BLOOM_MASK mask;
    234     Py_ssize_t i;
    235 
    236     mask = 0;
    237     for (i = 0; i < len; i++)
    238         BLOOM_ADD(mask, ptr[i]);
    239 
    240     return mask;
    241 }
    242 
    243 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
    244 {
    245     Py_ssize_t i;
    246 
    247     for (i = 0; i < setlen; i++)
    248         if (set[i] == chr)
    249             return 1;
    250 
    251     return 0;
    252 }
    253 
    254 #define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    255     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
    256 
    257 /* --- Unicode Object ----------------------------------------------------- */
    258 
    259 static
    260 int unicode_resize(register PyUnicodeObject *unicode,
    261                    Py_ssize_t length)
    262 {
    263     void *oldstr;
    264 
    265     /* Shortcut if there's nothing much to do. */
    266     if (unicode->length == length)
    267         goto reset;
    268 
    269     /* Resizing shared object (unicode_empty or single character
    270        objects) in-place is not allowed. Use PyUnicode_Resize()
    271        instead ! */
    272 
    273     if (unicode == unicode_empty ||
    274         (unicode->length == 1 &&
    275          unicode->str[0] < 256U &&
    276          unicode_latin1[unicode->str[0]] == unicode)) {
    277         PyErr_SetString(PyExc_SystemError,
    278                         "can't resize shared unicode objects");
    279         return -1;
    280     }
    281 
    282     /* We allocate one more byte to make sure the string is Ux0000 terminated.
    283        The overallocation is also used by fastsearch, which assumes that it's
    284        safe to look at str[length] (without making any assumptions about what
    285        it contains). */
    286 
    287     oldstr = unicode->str;
    288     unicode->str = PyObject_REALLOC(unicode->str,
    289                                     sizeof(Py_UNICODE) * (length + 1));
    290     if (!unicode->str) {
    291         unicode->str = (Py_UNICODE *)oldstr;
    292         PyErr_NoMemory();
    293         return -1;
    294     }
    295     unicode->str[length] = 0;
    296     unicode->length = length;
    297 
    298   reset:
    299     /* Reset the object caches */
    300     if (unicode->defenc) {
    301         Py_CLEAR(unicode->defenc);
    302     }
    303     unicode->hash = -1;
    304 
    305     return 0;
    306 }
    307 
    308 /* We allocate one more byte to make sure the string is
    309    Ux0000 terminated; some code relies on that.
    310 
    311    XXX This allocator could further be enhanced by assuring that the
    312    free list never reduces its size below 1.
    313 
    314 */
    315 
    316 static
    317 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
    318 {
    319     register PyUnicodeObject *unicode;
    320 
    321     /* Optimization for empty strings */
    322     if (length == 0 && unicode_empty != NULL) {
    323         Py_INCREF(unicode_empty);
    324         return unicode_empty;
    325     }
    326 
    327     /* Ensure we won't overflow the size. */
    328     if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
    329         return (PyUnicodeObject *)PyErr_NoMemory();
    330     }
    331 
    332     /* Unicode freelist & memory allocation */
    333     if (free_list) {
    334         unicode = free_list;
    335         free_list = *(PyUnicodeObject **)unicode;
    336         numfree--;
    337         if (unicode->str) {
    338             /* Keep-Alive optimization: we only upsize the buffer,
    339                never downsize it. */
    340             if ((unicode->length < length) &&
    341                 unicode_resize(unicode, length) < 0) {
    342                 PyObject_DEL(unicode->str);
    343                 unicode->str = NULL;
    344             }
    345         }
    346         else {
    347             size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
    348             unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
    349         }
    350         PyObject_INIT(unicode, &PyUnicode_Type);
    351     }
    352     else {
    353         size_t new_size;
    354         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
    355         if (unicode == NULL)
    356             return NULL;
    357         new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
    358         unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
    359     }
    360 
    361     if (!unicode->str) {
    362         PyErr_NoMemory();
    363         goto onError;
    364     }
    365     /* Initialize the first element to guard against cases where
    366      * the caller fails before initializing str -- unicode_resize()
    367      * reads str[0], and the Keep-Alive optimization can keep memory
    368      * allocated for str alive across a call to unicode_dealloc(unicode).
    369      * We don't want unicode_resize to read uninitialized memory in
    370      * that case.
    371      */
    372     unicode->str[0] = 0;
    373     unicode->str[length] = 0;
    374     unicode->length = length;
    375     unicode->hash = -1;
    376     unicode->defenc = NULL;
    377     return unicode;
    378 
    379   onError:
    380     /* XXX UNREF/NEWREF interface should be more symmetrical */
    381     _Py_DEC_REFTOTAL;
    382     _Py_ForgetReference((PyObject *)unicode);
    383     PyObject_Del(unicode);
    384     return NULL;
    385 }
    386 
    387 static
    388 void unicode_dealloc(register PyUnicodeObject *unicode)
    389 {
    390     if (PyUnicode_CheckExact(unicode) &&
    391         numfree < PyUnicode_MAXFREELIST) {
    392         /* Keep-Alive optimization */
    393         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
    394             PyObject_DEL(unicode->str);
    395             unicode->str = NULL;
    396             unicode->length = 0;
    397         }
    398         if (unicode->defenc) {
    399             Py_CLEAR(unicode->defenc);
    400         }
    401         /* Add to free list */
    402         *(PyUnicodeObject **)unicode = free_list;
    403         free_list = unicode;
    404         numfree++;
    405     }
    406     else {
    407         PyObject_DEL(unicode->str);
    408         Py_XDECREF(unicode->defenc);
    409         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
    410     }
    411 }
    412 
    413 static
    414 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
    415 {
    416     register PyUnicodeObject *v;
    417 
    418     /* Argument checks */
    419     if (unicode == NULL) {
    420         PyErr_BadInternalCall();
    421         return -1;
    422     }
    423     v = *unicode;
    424     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
    425         PyErr_BadInternalCall();
    426         return -1;
    427     }
    428 
    429     /* Resizing unicode_empty and single character objects is not
    430        possible since these are being shared. We simply return a fresh
    431        copy with the same Unicode content. */
    432     if (v->length != length &&
    433         (v == unicode_empty || v->length == 1)) {
    434         PyUnicodeObject *w = _PyUnicode_New(length);
    435         if (w == NULL)
    436             return -1;
    437         Py_UNICODE_COPY(w->str, v->str,
    438                         length < v->length ? length : v->length);
    439         Py_DECREF(*unicode);
    440         *unicode = w;
    441         return 0;
    442     }
    443 
    444     /* Note that we don't have to modify *unicode for unshared Unicode
    445        objects, since we can modify them in-place. */
    446     return unicode_resize(v, length);
    447 }
    448 
    449 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
    450 {
    451     return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
    452 }
    453 
    454 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
    455                                 Py_ssize_t size)
    456 {
    457     PyUnicodeObject *unicode;
    458 
    459     /* If the Unicode data is known at construction time, we can apply
    460        some optimizations which share commonly used objects. */
    461     if (u != NULL) {
    462 
    463         /* Optimization for empty strings */
    464         if (size == 0)
    465             _Py_RETURN_UNICODE_EMPTY();
    466 
    467         /* Single character Unicode objects in the Latin-1 range are
    468            shared when using this constructor */
    469         if (size == 1 && *u < 256) {
    470             unicode = unicode_latin1[*u];
    471             if (!unicode) {
    472                 unicode = _PyUnicode_New(1);
    473                 if (!unicode)
    474                     return NULL;
    475                 unicode->str[0] = *u;
    476                 unicode_latin1[*u] = unicode;
    477             }
    478             Py_INCREF(unicode);
    479             return (PyObject *)unicode;
    480         }
    481     }
    482 
    483     unicode = _PyUnicode_New(size);
    484     if (!unicode)
    485         return NULL;
    486 
    487     /* Copy the Unicode data into the new object */
    488     if (u != NULL)
    489         Py_UNICODE_COPY(unicode->str, u, size);
    490 
    491     return (PyObject *)unicode;
    492 }
    493 
    494 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
    495 {
    496     PyUnicodeObject *unicode;
    497 
    498     if (size < 0) {
    499         PyErr_SetString(PyExc_SystemError,
    500                         "Negative size passed to PyUnicode_FromStringAndSize");
    501         return NULL;
    502     }
    503 
    504     /* If the Unicode data is known at construction time, we can apply
    505        some optimizations which share commonly used objects.
    506        Also, this means the input must be UTF-8, so fall back to the
    507        UTF-8 decoder at the end. */
    508     if (u != NULL) {
    509 
    510         /* Optimization for empty strings */
    511         if (size == 0)
    512             _Py_RETURN_UNICODE_EMPTY();
    513 
    514         /* Single characters are shared when using this constructor.
    515            Restrict to ASCII, since the input must be UTF-8. */
    516         if (size == 1 && Py_CHARMASK(*u) < 128) {
    517             unicode = unicode_latin1[Py_CHARMASK(*u)];
    518             if (!unicode) {
    519                 unicode = _PyUnicode_New(1);
    520                 if (!unicode)
    521                     return NULL;
    522                 unicode->str[0] = Py_CHARMASK(*u);
    523                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
    524             }
    525             Py_INCREF(unicode);
    526             return (PyObject *)unicode;
    527         }
    528 
    529         return PyUnicode_DecodeUTF8(u, size, NULL);
    530     }
    531 
    532     unicode = _PyUnicode_New(size);
    533     if (!unicode)
    534         return NULL;
    535 
    536     return (PyObject *)unicode;
    537 }
    538 
    539 PyObject *PyUnicode_FromString(const char *u)
    540 {
    541     size_t size = strlen(u);
    542     if (size > PY_SSIZE_T_MAX) {
    543         PyErr_SetString(PyExc_OverflowError, "input too long");
    544         return NULL;
    545     }
    546 
    547     return PyUnicode_FromStringAndSize(u, size);
    548 }
    549 
    550 /* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
    551  * by 'ptr', possibly combining surrogate pairs on narrow builds.
    552  * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
    553  * that should be returned and 'end' pointing to the end of the buffer.
    554  * ('end' is used on narrow builds to detect a lone surrogate at the
    555  * end of the buffer that should be returned unchanged.)
    556  * The ptr and end arguments should be side-effect free and ptr must an lvalue.
    557  * The type of the returned char is always Py_UCS4.
    558  *
    559  * Note: the macro advances ptr to next char, so it might have side-effects
    560  *       (especially if used with other macros).
    561  */
    562 
    563 /* helper macros used by _Py_UNICODE_NEXT */
    564 #define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF)
    565 #define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF)
    566 /* Join two surrogate characters and return a single Py_UCS4 value. */
    567 #define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    568     (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
    569       ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
    570 
    571 #ifdef Py_UNICODE_WIDE
    572 #define _Py_UNICODE_NEXT(ptr, end) *(ptr)++
    573 #else
    574 #define _Py_UNICODE_NEXT(ptr, end)                                      \
    575      (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) &&      \
    576         _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                       \
    577        ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
    578        (Py_UCS4)*(ptr)++)
    579 #endif
    580 
    581 #ifdef HAVE_WCHAR_H
    582 
    583 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
    584 # define CONVERT_WCHAR_TO_SURROGATES
    585 #endif
    586 
    587 #ifdef CONVERT_WCHAR_TO_SURROGATES
    588 
    589 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
    590    to convert from UTF32 to UTF16. */
    591 
    592 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
    593                                  Py_ssize_t size)
    594 {
    595     PyUnicodeObject *unicode;
    596     register Py_ssize_t i;
    597     Py_ssize_t alloc;
    598     const wchar_t *orig_w;
    599 
    600     if (w == NULL) {
    601         PyErr_BadInternalCall();
    602         return NULL;
    603     }
    604 
    605     alloc = size;
    606     orig_w = w;
    607     for (i = size; i > 0; i--) {
    608         if (*w > 0xFFFF)
    609             alloc++;
    610         w++;
    611     }
    612     w = orig_w;
    613     unicode = _PyUnicode_New(alloc);
    614     if (!unicode)
    615         return NULL;
    616 
    617     /* Copy the wchar_t data into the new object */
    618     {
    619         register Py_UNICODE *u;
    620         u = PyUnicode_AS_UNICODE(unicode);
    621         for (i = size; i > 0; i--) {
    622             if (*w > 0xFFFF) {
    623                 wchar_t ordinal = *w++;
    624                 ordinal -= 0x10000;
    625                 *u++ = 0xD800 | (ordinal >> 10);
    626                 *u++ = 0xDC00 | (ordinal & 0x3FF);
    627             }
    628             else
    629                 *u++ = *w++;
    630         }
    631     }
    632     return (PyObject *)unicode;
    633 }
    634 
    635 #else
    636 
    637 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
    638                                  Py_ssize_t size)
    639 {
    640     PyUnicodeObject *unicode;
    641 
    642     if (w == NULL) {
    643         PyErr_BadInternalCall();
    644         return NULL;
    645     }
    646 
    647     unicode = _PyUnicode_New(size);
    648     if (!unicode)
    649         return NULL;
    650 
    651     /* Copy the wchar_t data into the new object */
    652 #ifdef HAVE_USABLE_WCHAR_T
    653     memcpy(unicode->str, w, size * sizeof(wchar_t));
    654 #else
    655     {
    656         register Py_UNICODE *u;
    657         register Py_ssize_t i;
    658         u = PyUnicode_AS_UNICODE(unicode);
    659         for (i = size; i > 0; i--)
    660             *u++ = *w++;
    661     }
    662 #endif
    663 
    664     return (PyObject *)unicode;
    665 }
    666 
    667 #endif /* CONVERT_WCHAR_TO_SURROGATES */
    668 
    669 #undef CONVERT_WCHAR_TO_SURROGATES
    670 
    671 static void
    672 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
    673 {
    674     *fmt++ = '%';
    675     if (width) {
    676         if (zeropad)
    677             *fmt++ = '0';
    678         fmt += sprintf(fmt, "%d", width);
    679     }
    680     if (precision)
    681         fmt += sprintf(fmt, ".%d", precision);
    682     if (longflag)
    683         *fmt++ = 'l';
    684     else if (size_tflag) {
    685         char *f = PY_FORMAT_SIZE_T;
    686         while (*f)
    687             *fmt++ = *f++;
    688     }
    689     *fmt++ = c;
    690     *fmt = '\0';
    691 }
    692 
    693 #define appendstring(string) \
    694     do { \
    695         for (copy = string;*copy; copy++) { \
    696             *s++ = (unsigned char)*copy; \
    697         } \
    698     } while (0)
    699 
    700 PyObject *
    701 PyUnicode_FromFormatV(const char *format, va_list vargs)
    702 {
    703     va_list count;
    704     Py_ssize_t callcount = 0;
    705     PyObject **callresults = NULL;
    706     PyObject **callresult = NULL;
    707     Py_ssize_t n = 0;
    708     int width = 0;
    709     int precision = 0;
    710     int zeropad;
    711     const char* f;
    712     Py_UNICODE *s;
    713     PyObject *string;
    714     /* used by sprintf */
    715     char buffer[21];
    716     /* use abuffer instead of buffer, if we need more space
    717      * (which can happen if there's a format specifier with width). */
    718     char *abuffer = NULL;
    719     char *realbuffer;
    720     Py_ssize_t abuffersize = 0;
    721     char fmt[60]; /* should be enough for %0width.precisionld */
    722     const char *copy;
    723 
    724 #ifdef VA_LIST_IS_ARRAY
    725     Py_MEMCPY(count, vargs, sizeof(va_list));
    726 #else
    727 #ifdef  __va_copy
    728     __va_copy(count, vargs);
    729 #else
    730     count = vargs;
    731 #endif
    732 #endif
    733      /* step 1: count the number of %S/%R/%s format specifications
    734       * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
    735       * objects once during step 3 and put the result in an array) */
    736     for (f = format; *f; f++) {
    737          if (*f == '%') {
    738              f++;
    739              while (*f && *f != '%' && !isalpha((unsigned)*f))
    740                  f++;
    741              if (!*f)
    742                  break;
    743              if (*f == 's' || *f=='S' || *f=='R')
    744                  ++callcount;
    745          }
    746     }
    747     /* step 2: allocate memory for the results of
    748      * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
    749     if (callcount) {
    750         callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
    751         if (!callresults) {
    752             PyErr_NoMemory();
    753             return NULL;
    754         }
    755         callresult = callresults;
    756     }
    757     /* step 3: figure out how large a buffer we need */
    758     for (f = format; *f; f++) {
    759         if (*f == '%') {
    760             const char* p = f++;
    761             width = 0;
    762             while (isdigit((unsigned)*f))
    763                 width = (width*10) + *f++ - '0';
    764             precision = 0;
    765             if (*f == '.') {
    766                 f++;
    767                 while (isdigit((unsigned)*f))
    768                     precision = (precision*10) + *f++ - '0';
    769             }
    770 
    771             /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
    772              * they don't affect the amount of space we reserve.
    773              */
    774             if ((*f == 'l' || *f == 'z') &&
    775                 (f[1] == 'd' || f[1] == 'u'))
    776                 ++f;
    777 
    778             switch (*f) {
    779             case 'c':
    780             {
    781                 int ordinal = va_arg(count, int);
    782 #ifdef Py_UNICODE_WIDE
    783                 if (ordinal < 0 || ordinal > 0x10ffff) {
    784                     PyErr_SetString(PyExc_OverflowError,
    785                                     "%c arg not in range(0x110000) "
    786                                     "(wide Python build)");
    787                     goto fail;
    788                 }
    789 #else
    790                 if (ordinal < 0 || ordinal > 0xffff) {
    791                     PyErr_SetString(PyExc_OverflowError,
    792                                     "%c arg not in range(0x10000) "
    793                                     "(narrow Python build)");
    794                     goto fail;
    795                 }
    796 #endif
    797                 /* fall through... */
    798             }
    799             case '%':
    800                 n++;
    801                 break;
    802             case 'd': case 'u': case 'i': case 'x':
    803                 (void) va_arg(count, int);
    804                 if (width < precision)
    805                     width = precision;
    806                 /* 20 bytes is enough to hold a 64-bit
    807                    integer.  Decimal takes the most space.
    808                    This isn't enough for octal.
    809                    If a width is specified we need more
    810                    (which we allocate later). */
    811                 if (width < 20)
    812                     width = 20;
    813                 n += width;
    814                 if (abuffersize < width)
    815                     abuffersize = width;
    816                 break;
    817             case 's':
    818             {
    819                 /* UTF-8 */
    820                 const char *s = va_arg(count, const char*);
    821                 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
    822                 if (!str)
    823                     goto fail;
    824                 n += PyUnicode_GET_SIZE(str);
    825                 /* Remember the str and switch to the next slot */
    826                 *callresult++ = str;
    827                 break;
    828             }
    829             case 'U':
    830             {
    831                 PyObject *obj = va_arg(count, PyObject *);
    832                 assert(obj && PyUnicode_Check(obj));
    833                 n += PyUnicode_GET_SIZE(obj);
    834                 break;
    835             }
    836             case 'V':
    837             {
    838                 PyObject *obj = va_arg(count, PyObject *);
    839                 const char *str = va_arg(count, const char *);
    840                 assert(obj || str);
    841                 assert(!obj || PyUnicode_Check(obj));
    842                 if (obj)
    843                     n += PyUnicode_GET_SIZE(obj);
    844                 else
    845                     n += strlen(str);
    846                 break;
    847             }
    848             case 'S':
    849             {
    850                 PyObject *obj = va_arg(count, PyObject *);
    851                 PyObject *str;
    852                 assert(obj);
    853                 str = PyObject_Str(obj);
    854                 if (!str)
    855                     goto fail;
    856                 n += PyString_GET_SIZE(str);
    857                 /* Remember the str and switch to the next slot */
    858                 *callresult++ = str;
    859                 break;
    860             }
    861             case 'R':
    862             {
    863                 PyObject *obj = va_arg(count, PyObject *);
    864                 PyObject *repr;
    865                 assert(obj);
    866                 repr = PyObject_Repr(obj);
    867                 if (!repr)
    868                     goto fail;
    869                 n += PyUnicode_GET_SIZE(repr);
    870                 /* Remember the repr and switch to the next slot */
    871                 *callresult++ = repr;
    872                 break;
    873             }
    874             case 'p':
    875                 (void) va_arg(count, int);
    876                 /* maximum 64-bit pointer representation:
    877                  * 0xffffffffffffffff
    878                  * so 19 characters is enough.
    879                  * XXX I count 18 -- what's the extra for?
    880                  */
    881                 n += 19;
    882                 break;
    883             default:
    884                 /* if we stumble upon an unknown
    885                    formatting code, copy the rest of
    886                    the format string to the output
    887                    string. (we cannot just skip the
    888                    code, since there's no way to know
    889                    what's in the argument list) */
    890                 n += strlen(p);
    891                 goto expand;
    892             }
    893         } else
    894             n++;
    895     }
    896   expand:
    897     if (abuffersize > 20) {
    898         /* add 1 for sprintf's trailing null byte */
    899         abuffer = PyObject_Malloc(abuffersize + 1);
    900         if (!abuffer) {
    901             PyErr_NoMemory();
    902             goto fail;
    903         }
    904         realbuffer = abuffer;
    905     }
    906     else
    907         realbuffer = buffer;
    908     /* step 4: fill the buffer */
    909     /* Since we've analyzed how much space we need for the worst case,
    910        we don't have to resize the string.
    911        There can be no errors beyond this point. */
    912     string = PyUnicode_FromUnicode(NULL, n);
    913     if (!string)
    914         goto fail;
    915 
    916     s = PyUnicode_AS_UNICODE(string);
    917     callresult = callresults;
    918 
    919     for (f = format; *f; f++) {
    920         if (*f == '%') {
    921             const char* p = f++;
    922             int longflag = 0;
    923             int size_tflag = 0;
    924             zeropad = (*f == '0');
    925             /* parse the width.precision part */
    926             width = 0;
    927             while (isdigit((unsigned)*f))
    928                 width = (width*10) + *f++ - '0';
    929             precision = 0;
    930             if (*f == '.') {
    931                 f++;
    932                 while (isdigit((unsigned)*f))
    933                     precision = (precision*10) + *f++ - '0';
    934             }
    935             /* handle the long flag, but only for %ld and %lu.
    936                others can be added when necessary. */
    937             if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
    938                 longflag = 1;
    939                 ++f;
    940             }
    941             /* handle the size_t flag. */
    942             if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
    943                 size_tflag = 1;
    944                 ++f;
    945             }
    946 
    947             switch (*f) {
    948             case 'c':
    949                 *s++ = va_arg(vargs, int);
    950                 break;
    951             case 'd':
    952                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
    953                 if (longflag)
    954                     sprintf(realbuffer, fmt, va_arg(vargs, long));
    955                 else if (size_tflag)
    956                     sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
    957                 else
    958                     sprintf(realbuffer, fmt, va_arg(vargs, int));
    959                 appendstring(realbuffer);
    960                 break;
    961             case 'u':
    962                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
    963                 if (longflag)
    964                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
    965                 else if (size_tflag)
    966                     sprintf(realbuffer, fmt, va_arg(vargs, size_t));
    967                 else
    968                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
    969                 appendstring(realbuffer);
    970                 break;
    971             case 'i':
    972                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
    973                 sprintf(realbuffer, fmt, va_arg(vargs, int));
    974                 appendstring(realbuffer);
    975                 break;
    976             case 'x':
    977                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
    978                 sprintf(realbuffer, fmt, va_arg(vargs, int));
    979                 appendstring(realbuffer);
    980                 break;
    981             case 's':
    982             {
    983                 /* unused, since we already have the result */
    984                 (void) va_arg(vargs, char *);
    985                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
    986                                 PyUnicode_GET_SIZE(*callresult));
    987                 s += PyUnicode_GET_SIZE(*callresult);
    988                 /* We're done with the unicode()/repr() => forget it */
    989                 Py_DECREF(*callresult);
    990                 /* switch to next unicode()/repr() result */
    991                 ++callresult;
    992                 break;
    993             }
    994             case 'U':
    995             {
    996                 PyObject *obj = va_arg(vargs, PyObject *);
    997                 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
    998                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
    999                 s += size;
   1000                 break;
   1001             }
   1002             case 'V':
   1003             {
   1004                 PyObject *obj = va_arg(vargs, PyObject *);
   1005                 const char *str = va_arg(vargs, const char *);
   1006                 if (obj) {
   1007                     Py_ssize_t size = PyUnicode_GET_SIZE(obj);
   1008                     Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
   1009                     s += size;
   1010                 } else {
   1011                     appendstring(str);
   1012                 }
   1013                 break;
   1014             }
   1015             case 'S':
   1016             case 'R':
   1017             {
   1018                 const char *str = PyString_AS_STRING(*callresult);
   1019                 /* unused, since we already have the result */
   1020                 (void) va_arg(vargs, PyObject *);
   1021                 appendstring(str);
   1022                 /* We're done with the unicode()/repr() => forget it */
   1023                 Py_DECREF(*callresult);
   1024                 /* switch to next unicode()/repr() result */
   1025                 ++callresult;
   1026                 break;
   1027             }
   1028             case 'p':
   1029                 sprintf(buffer, "%p", va_arg(vargs, void*));
   1030                 /* %p is ill-defined:  ensure leading 0x. */
   1031                 if (buffer[1] == 'X')
   1032                     buffer[1] = 'x';
   1033                 else if (buffer[1] != 'x') {
   1034                     memmove(buffer+2, buffer, strlen(buffer)+1);
   1035                     buffer[0] = '0';
   1036                     buffer[1] = 'x';
   1037                 }
   1038                 appendstring(buffer);
   1039                 break;
   1040             case '%':
   1041                 *s++ = '%';
   1042                 break;
   1043             default:
   1044                 appendstring(p);
   1045                 goto end;
   1046             }
   1047         } else
   1048             *s++ = *f;
   1049     }
   1050 
   1051   end:
   1052     if (callresults)
   1053         PyObject_Free(callresults);
   1054     if (abuffer)
   1055         PyObject_Free(abuffer);
   1056     PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
   1057     return string;
   1058   fail:
   1059     if (callresults) {
   1060         PyObject **callresult2 = callresults;
   1061         while (callresult2 < callresult) {
   1062             Py_DECREF(*callresult2);
   1063             ++callresult2;
   1064         }
   1065         PyObject_Free(callresults);
   1066     }
   1067     if (abuffer)
   1068         PyObject_Free(abuffer);
   1069     return NULL;
   1070 }
   1071 
   1072 #undef appendstring
   1073 
   1074 PyObject *
   1075 PyUnicode_FromFormat(const char *format, ...)
   1076 {
   1077     PyObject* ret;
   1078     va_list vargs;
   1079 
   1080 #ifdef HAVE_STDARG_PROTOTYPES
   1081     va_start(vargs, format);
   1082 #else
   1083     va_start(vargs);
   1084 #endif
   1085     ret = PyUnicode_FromFormatV(format, vargs);
   1086     va_end(vargs);
   1087     return ret;
   1088 }
   1089 
   1090 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
   1091                                 wchar_t *w,
   1092                                 Py_ssize_t size)
   1093 {
   1094     if (unicode == NULL) {
   1095         PyErr_BadInternalCall();
   1096         return -1;
   1097     }
   1098 
   1099     /* If possible, try to copy the 0-termination as well */
   1100     if (size > PyUnicode_GET_SIZE(unicode))
   1101         size = PyUnicode_GET_SIZE(unicode) + 1;
   1102 
   1103 #ifdef HAVE_USABLE_WCHAR_T
   1104     memcpy(w, unicode->str, size * sizeof(wchar_t));
   1105 #else
   1106     {
   1107         register Py_UNICODE *u;
   1108         register Py_ssize_t i;
   1109         u = PyUnicode_AS_UNICODE(unicode);
   1110         for (i = size; i > 0; i--)
   1111             *w++ = *u++;
   1112     }
   1113 #endif
   1114 
   1115     if (size > PyUnicode_GET_SIZE(unicode))
   1116         return PyUnicode_GET_SIZE(unicode);
   1117     else
   1118         return size;
   1119 }
   1120 
   1121 #endif
   1122 
   1123 PyObject *PyUnicode_FromOrdinal(int ordinal)
   1124 {
   1125     Py_UNICODE s[1];
   1126 
   1127 #ifdef Py_UNICODE_WIDE
   1128     if (ordinal < 0 || ordinal > 0x10ffff) {
   1129         PyErr_SetString(PyExc_ValueError,
   1130                         "unichr() arg not in range(0x110000) "
   1131                         "(wide Python build)");
   1132         return NULL;
   1133     }
   1134 #else
   1135     if (ordinal < 0 || ordinal > 0xffff) {
   1136         PyErr_SetString(PyExc_ValueError,
   1137                         "unichr() arg not in range(0x10000) "
   1138                         "(narrow Python build)");
   1139         return NULL;
   1140     }
   1141 #endif
   1142 
   1143     s[0] = (Py_UNICODE)ordinal;
   1144     return PyUnicode_FromUnicode(s, 1);
   1145 }
   1146 
   1147 PyObject *PyUnicode_FromObject(register PyObject *obj)
   1148 {
   1149     /* XXX Perhaps we should make this API an alias of
   1150        PyObject_Unicode() instead ?! */
   1151     if (PyUnicode_CheckExact(obj)) {
   1152         Py_INCREF(obj);
   1153         return obj;
   1154     }
   1155     if (PyUnicode_Check(obj)) {
   1156         /* For a Unicode subtype that's not a Unicode object,
   1157            return a true Unicode object with the same data. */
   1158         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
   1159                                      PyUnicode_GET_SIZE(obj));
   1160     }
   1161     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
   1162 }
   1163 
   1164 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
   1165                                       const char *encoding,
   1166                                       const char *errors)
   1167 {
   1168     const char *s = NULL;
   1169     Py_ssize_t len;
   1170     PyObject *v;
   1171 
   1172     if (obj == NULL) {
   1173         PyErr_BadInternalCall();
   1174         return NULL;
   1175     }
   1176 
   1177 #if 0
   1178     /* For b/w compatibility we also accept Unicode objects provided
   1179        that no encodings is given and then redirect to
   1180        PyObject_Unicode() which then applies the additional logic for
   1181        Unicode subclasses.
   1182 
   1183        NOTE: This API should really only be used for object which
   1184        represent *encoded* Unicode !
   1185 
   1186     */
   1187     if (PyUnicode_Check(obj)) {
   1188         if (encoding) {
   1189             PyErr_SetString(PyExc_TypeError,
   1190                             "decoding Unicode is not supported");
   1191             return NULL;
   1192         }
   1193         return PyObject_Unicode(obj);
   1194     }
   1195 #else
   1196     if (PyUnicode_Check(obj)) {
   1197         PyErr_SetString(PyExc_TypeError,
   1198                         "decoding Unicode is not supported");
   1199         return NULL;
   1200     }
   1201 #endif
   1202 
   1203     /* Coerce object */
   1204     if (PyString_Check(obj)) {
   1205         s = PyString_AS_STRING(obj);
   1206         len = PyString_GET_SIZE(obj);
   1207     }
   1208     else if (PyByteArray_Check(obj)) {
   1209         /* Python 2.x specific */
   1210         PyErr_Format(PyExc_TypeError,
   1211                      "decoding bytearray is not supported");
   1212         return NULL;
   1213     }
   1214     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
   1215         /* Overwrite the error message with something more useful in
   1216            case of a TypeError. */
   1217         if (PyErr_ExceptionMatches(PyExc_TypeError))
   1218             PyErr_Format(PyExc_TypeError,
   1219                          "coercing to Unicode: need string or buffer, "
   1220                          "%.80s found",
   1221                          Py_TYPE(obj)->tp_name);
   1222         goto onError;
   1223     }
   1224 
   1225     /* Convert to Unicode */
   1226     if (len == 0)
   1227         _Py_RETURN_UNICODE_EMPTY();
   1228 
   1229     v = PyUnicode_Decode(s, len, encoding, errors);
   1230     return v;
   1231 
   1232   onError:
   1233     return NULL;
   1234 }
   1235 
   1236 PyObject *PyUnicode_Decode(const char *s,
   1237                            Py_ssize_t size,
   1238                            const char *encoding,
   1239                            const char *errors)
   1240 {
   1241     PyObject *buffer = NULL, *unicode;
   1242 
   1243     if (encoding == NULL)
   1244         encoding = PyUnicode_GetDefaultEncoding();
   1245 
   1246     /* Shortcuts for common default encodings */
   1247     if (strcmp(encoding, "utf-8") == 0)
   1248         return PyUnicode_DecodeUTF8(s, size, errors);
   1249     else if (strcmp(encoding, "latin-1") == 0)
   1250         return PyUnicode_DecodeLatin1(s, size, errors);
   1251 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
   1252     else if (strcmp(encoding, "mbcs") == 0)
   1253         return PyUnicode_DecodeMBCS(s, size, errors);
   1254 #endif
   1255     else if (strcmp(encoding, "ascii") == 0)
   1256         return PyUnicode_DecodeASCII(s, size, errors);
   1257 
   1258     /* Decode via the codec registry */
   1259     buffer = PyBuffer_FromMemory((void *)s, size);
   1260     if (buffer == NULL)
   1261         goto onError;
   1262     unicode = PyCodec_Decode(buffer, encoding, errors);
   1263     if (unicode == NULL)
   1264         goto onError;
   1265     if (!PyUnicode_Check(unicode)) {
   1266         PyErr_Format(PyExc_TypeError,
   1267                      "decoder did not return an unicode object (type=%.400s)",
   1268                      Py_TYPE(unicode)->tp_name);
   1269         Py_DECREF(unicode);
   1270         goto onError;
   1271     }
   1272     Py_DECREF(buffer);
   1273     return unicode;
   1274 
   1275   onError:
   1276     Py_XDECREF(buffer);
   1277     return NULL;
   1278 }
   1279 
   1280 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
   1281                                     const char *encoding,
   1282                                     const char *errors)
   1283 {
   1284     PyObject *v;
   1285 
   1286     if (!PyUnicode_Check(unicode)) {
   1287         PyErr_BadArgument();
   1288         goto onError;
   1289     }
   1290 
   1291     if (encoding == NULL)
   1292         encoding = PyUnicode_GetDefaultEncoding();
   1293 
   1294     /* Decode via the codec registry */
   1295     v = PyCodec_Decode(unicode, encoding, errors);
   1296     if (v == NULL)
   1297         goto onError;
   1298     return v;
   1299 
   1300   onError:
   1301     return NULL;
   1302 }
   1303 
   1304 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
   1305                            Py_ssize_t size,
   1306                            const char *encoding,
   1307                            const char *errors)
   1308 {
   1309     PyObject *v, *unicode;
   1310 
   1311     unicode = PyUnicode_FromUnicode(s, size);
   1312     if (unicode == NULL)
   1313         return NULL;
   1314     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
   1315     Py_DECREF(unicode);
   1316     return v;
   1317 }
   1318 
   1319 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
   1320                                     const char *encoding,
   1321                                     const char *errors)
   1322 {
   1323     PyObject *v;
   1324 
   1325     if (!PyUnicode_Check(unicode)) {
   1326         PyErr_BadArgument();
   1327         goto onError;
   1328     }
   1329 
   1330     if (encoding == NULL)
   1331         encoding = PyUnicode_GetDefaultEncoding();
   1332 
   1333     /* Encode via the codec registry */
   1334     v = PyCodec_Encode(unicode, encoding, errors);
   1335     if (v == NULL)
   1336         goto onError;
   1337     return v;
   1338 
   1339   onError:
   1340     return NULL;
   1341 }
   1342 
   1343 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
   1344                                     const char *encoding,
   1345                                     const char *errors)
   1346 {
   1347     PyObject *v;
   1348 
   1349     if (!PyUnicode_Check(unicode)) {
   1350         PyErr_BadArgument();
   1351         goto onError;
   1352     }
   1353 
   1354     if (encoding == NULL)
   1355         encoding = PyUnicode_GetDefaultEncoding();
   1356 
   1357     /* Shortcuts for common default encodings */
   1358     if (errors == NULL) {
   1359         if (strcmp(encoding, "utf-8") == 0)
   1360             return PyUnicode_AsUTF8String(unicode);
   1361         else if (strcmp(encoding, "latin-1") == 0)
   1362             return PyUnicode_AsLatin1String(unicode);
   1363 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
   1364         else if (strcmp(encoding, "mbcs") == 0)
   1365             return PyUnicode_AsMBCSString(unicode);
   1366 #endif
   1367         else if (strcmp(encoding, "ascii") == 0)
   1368             return PyUnicode_AsASCIIString(unicode);
   1369     }
   1370 
   1371     /* Encode via the codec registry */
   1372     v = PyCodec_Encode(unicode, encoding, errors);
   1373     if (v == NULL)
   1374         goto onError;
   1375     if (!PyString_Check(v)) {
   1376         PyErr_Format(PyExc_TypeError,
   1377                      "encoder did not return a string object (type=%.400s)",
   1378                      Py_TYPE(v)->tp_name);
   1379         Py_DECREF(v);
   1380         goto onError;
   1381     }
   1382     return v;
   1383 
   1384   onError:
   1385     return NULL;
   1386 }
   1387 
   1388 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
   1389                                             const char *errors)
   1390 {
   1391     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
   1392 
   1393     if (v)
   1394         return v;
   1395     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
   1396     if (v && errors == NULL)
   1397         ((PyUnicodeObject *)unicode)->defenc = v;
   1398     return v;
   1399 }
   1400 
   1401 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
   1402 {
   1403     if (!PyUnicode_Check(unicode)) {
   1404         PyErr_BadArgument();
   1405         goto onError;
   1406     }
   1407     return PyUnicode_AS_UNICODE(unicode);
   1408 
   1409   onError:
   1410     return NULL;
   1411 }
   1412 
   1413 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
   1414 {
   1415     if (!PyUnicode_Check(unicode)) {
   1416         PyErr_BadArgument();
   1417         goto onError;
   1418     }
   1419     return PyUnicode_GET_SIZE(unicode);
   1420 
   1421   onError:
   1422     return -1;
   1423 }
   1424 
   1425 const char *PyUnicode_GetDefaultEncoding(void)
   1426 {
   1427     return unicode_default_encoding;
   1428 }
   1429 
   1430 int PyUnicode_SetDefaultEncoding(const char *encoding)
   1431 {
   1432     PyObject *v;
   1433 
   1434     /* Make sure the encoding is valid. As side effect, this also
   1435        loads the encoding into the codec registry cache. */
   1436     v = _PyCodec_Lookup(encoding);
   1437     if (v == NULL)
   1438         goto onError;
   1439     Py_DECREF(v);
   1440     strncpy(unicode_default_encoding,
   1441             encoding,
   1442             sizeof(unicode_default_encoding) - 1);
   1443     return 0;
   1444 
   1445   onError:
   1446     return -1;
   1447 }
   1448 
   1449 /* error handling callback helper:
   1450    build arguments, call the callback and check the arguments,
   1451    if no exception occurred, copy the replacement to the output
   1452    and adjust various state variables.
   1453    return 0 on success, -1 on error
   1454 */
   1455 
   1456 static
   1457 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
   1458                                      const char *encoding, const char *reason,
   1459                                      const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
   1460                                      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
   1461                                      PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
   1462 {
   1463     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
   1464 
   1465     PyObject *restuple = NULL;
   1466     PyObject *repunicode = NULL;
   1467     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
   1468     Py_ssize_t requiredsize;
   1469     Py_ssize_t newpos;
   1470     Py_UNICODE *repptr;
   1471     Py_ssize_t repsize;
   1472     int res = -1;
   1473 
   1474     if (*errorHandler == NULL) {
   1475         *errorHandler = PyCodec_LookupError(errors);
   1476         if (*errorHandler == NULL)
   1477             goto onError;
   1478     }
   1479 
   1480     if (*exceptionObject == NULL) {
   1481         *exceptionObject = PyUnicodeDecodeError_Create(
   1482             encoding, input, insize, *startinpos, *endinpos, reason);
   1483         if (*exceptionObject == NULL)
   1484             goto onError;
   1485     }
   1486     else {
   1487         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
   1488             goto onError;
   1489         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
   1490             goto onError;
   1491         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
   1492             goto onError;
   1493     }
   1494 
   1495     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
   1496     if (restuple == NULL)
   1497         goto onError;
   1498     if (!PyTuple_Check(restuple)) {
   1499         PyErr_SetString(PyExc_TypeError, &argparse[4]);
   1500         goto onError;
   1501     }
   1502     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
   1503         goto onError;
   1504     if (newpos<0)
   1505         newpos = insize+newpos;
   1506     if (newpos<0 || newpos>insize) {
   1507         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
   1508         goto onError;
   1509     }
   1510 
   1511     /* need more space? (at least enough for what we
   1512        have+the replacement+the rest of the string (starting
   1513        at the new input position), so we won't have to check space
   1514        when there are no errors in the rest of the string) */
   1515     repptr = PyUnicode_AS_UNICODE(repunicode);
   1516     repsize = PyUnicode_GET_SIZE(repunicode);
   1517     requiredsize = *outpos;
   1518     if (requiredsize > PY_SSIZE_T_MAX - repsize)
   1519         goto overflow;
   1520     requiredsize += repsize;
   1521     if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
   1522         goto overflow;
   1523     requiredsize += insize - newpos;
   1524     if (requiredsize > outsize) {
   1525         if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
   1526             requiredsize = 2*outsize;
   1527         if (_PyUnicode_Resize(output, requiredsize) < 0)
   1528             goto onError;
   1529         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
   1530     }
   1531     *endinpos = newpos;
   1532     *inptr = input + newpos;
   1533     Py_UNICODE_COPY(*outptr, repptr, repsize);
   1534     *outptr += repsize;
   1535     *outpos += repsize;
   1536     /* we made it! */
   1537     res = 0;
   1538 
   1539   onError:
   1540     Py_XDECREF(restuple);
   1541     return res;
   1542 
   1543   overflow:
   1544     PyErr_SetString(PyExc_OverflowError,
   1545                     "decoded result is too long for a Python string");
   1546     goto onError;
   1547 }
   1548 
   1549 /* --- UTF-7 Codec -------------------------------------------------------- */
   1550 
   1551 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
   1552 
   1553 /* Three simple macros defining base-64. */
   1554 
   1555 /* Is c a base-64 character? */
   1556 
   1557 #define IS_BASE64(c) \
   1558     (isalnum(c) || (c) == '+' || (c) == '/')
   1559 
   1560 /* given that c is a base-64 character, what is its base-64 value? */
   1561 
   1562 #define FROM_BASE64(c)                                                  \
   1563     (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
   1564      ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
   1565      ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
   1566      (c) == '+' ? 62 : 63)
   1567 
   1568 /* What is the base-64 character of the bottom 6 bits of n? */
   1569 
   1570 #define TO_BASE64(n)  \
   1571     ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
   1572 
   1573 /* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
   1574  * decoded as itself.  We are permissive on decoding; the only ASCII
   1575  * byte not decoding to itself is the + which begins a base64
   1576  * string. */
   1577 
   1578 #define DECODE_DIRECT(c)                                \
   1579     ((c) <= 127 && (c) != '+')
   1580 
   1581 /* The UTF-7 encoder treats ASCII characters differently according to
   1582  * whether they are Set D, Set O, Whitespace, or special (i.e. none of
   1583  * the above).  See RFC2152.  This array identifies these different
   1584  * sets:
   1585  * 0 : "Set D"
   1586  *     alphanumeric and '(),-./:?
   1587  * 1 : "Set O"
   1588  *     !"#$%&*;<=>@[]^_`{|}
   1589  * 2 : "whitespace"
   1590  *     ht nl cr sp
   1591  * 3 : special (must be base64 encoded)
   1592  *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
   1593  */
   1594 
   1595 static
   1596 char utf7_category[128] = {
   1597 /* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
   1598     3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
   1599 /* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
   1600     3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
   1601 /* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
   1602     2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
   1603 /*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
   1604     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
   1605 /*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
   1606     1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   1607 /*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
   1608     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
   1609 /*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
   1610     1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   1611 /*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
   1612     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
   1613 };
   1614 
   1615 /* ENCODE_DIRECT: this character should be encoded as itself.  The
   1616  * answer depends on whether we are encoding set O as itself, and also
   1617  * on whether we are encoding whitespace as itself.  RFC2152 makes it
   1618  * clear that the answers to these questions vary between
   1619  * applications, so this code needs to be flexible.  */
   1620 
   1621 #define ENCODE_DIRECT(c, directO, directWS)             \
   1622     ((c) < 128 && (c) > 0 &&                            \
   1623      ((utf7_category[(c)] == 0) ||                      \
   1624       (directWS && (utf7_category[(c)] == 2)) ||        \
   1625       (directO && (utf7_category[(c)] == 1))))
   1626 
   1627 PyObject *PyUnicode_DecodeUTF7(const char *s,
   1628                                Py_ssize_t size,
   1629                                const char *errors)
   1630 {
   1631     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
   1632 }
   1633 
   1634 /* The decoder.  The only state we preserve is our read position,
   1635  * i.e. how many characters we have consumed.  So if we end in the
   1636  * middle of a shift sequence we have to back off the read position
   1637  * and the output to the beginning of the sequence, otherwise we lose
   1638  * all the shift state (seen bits, number of bits seen, high
   1639  * surrogate). */
   1640 
   1641 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
   1642                                        Py_ssize_t size,
   1643                                        const char *errors,
   1644                                        Py_ssize_t *consumed)
   1645 {
   1646     const char *starts = s;
   1647     Py_ssize_t startinpos;
   1648     Py_ssize_t endinpos;
   1649     Py_ssize_t outpos;
   1650     const char *e;
   1651     PyUnicodeObject *unicode;
   1652     Py_UNICODE *p;
   1653     const char *errmsg = "";
   1654     int inShift = 0;
   1655     Py_UNICODE *shiftOutStart;
   1656     unsigned int base64bits = 0;
   1657     unsigned long base64buffer = 0;
   1658     Py_UNICODE surrogate = 0;
   1659     PyObject *errorHandler = NULL;
   1660     PyObject *exc = NULL;
   1661 
   1662     unicode = _PyUnicode_New(size);
   1663     if (!unicode)
   1664         return NULL;
   1665     if (size == 0) {
   1666         if (consumed)
   1667             *consumed = 0;
   1668         return (PyObject *)unicode;
   1669     }
   1670 
   1671     p = unicode->str;
   1672     shiftOutStart = p;
   1673     e = s + size;
   1674 
   1675     while (s < e) {
   1676         Py_UNICODE ch = (unsigned char) *s;
   1677 
   1678         if (inShift) { /* in a base-64 section */
   1679             if (IS_BASE64(ch)) { /* consume a base-64 character */
   1680                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
   1681                 base64bits += 6;
   1682                 s++;
   1683                 if (base64bits >= 16) {
   1684                     /* we have enough bits for a UTF-16 value */
   1685                     Py_UNICODE outCh = (Py_UNICODE)
   1686                                        (base64buffer >> (base64bits-16));
   1687                     base64bits -= 16;
   1688                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
   1689                     assert(outCh <= 0xffff);
   1690                     if (surrogate) {
   1691                         /* expecting a second surrogate */
   1692                         if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
   1693 #ifdef Py_UNICODE_WIDE
   1694                             *p++ = (((surrogate & 0x3FF)<<10)
   1695                                     | (outCh & 0x3FF)) + 0x10000;
   1696 #else
   1697                             *p++ = surrogate;
   1698                             *p++ = outCh;
   1699 #endif
   1700                             surrogate = 0;
   1701                             continue;
   1702                         }
   1703                         else {
   1704                             *p++ = surrogate;
   1705                             surrogate = 0;
   1706                         }
   1707                     }
   1708                     if (outCh >= 0xD800 && outCh <= 0xDBFF) {
   1709                         /* first surrogate */
   1710                         surrogate = outCh;
   1711                     }
   1712                     else {
   1713                         *p++ = outCh;
   1714                     }
   1715                 }
   1716             }
   1717             else { /* now leaving a base-64 section */
   1718                 inShift = 0;
   1719                 s++;
   1720                 if (surrogate) {
   1721                     *p++ = surrogate;
   1722                     surrogate = 0;
   1723                 }
   1724                 if (base64bits > 0) { /* left-over bits */
   1725                     if (base64bits >= 6) {
   1726                         /* We've seen at least one base-64 character */
   1727                         errmsg = "partial character in shift sequence";
   1728                         goto utf7Error;
   1729                     }
   1730                     else {
   1731                         /* Some bits remain; they should be zero */
   1732                         if (base64buffer != 0) {
   1733                             errmsg = "non-zero padding bits in shift sequence";
   1734                             goto utf7Error;
   1735                         }
   1736                     }
   1737                 }
   1738                 if (ch != '-') {
   1739                     /* '-' is absorbed; other terminating
   1740                        characters are preserved */
   1741                     *p++ = ch;
   1742                 }
   1743             }
   1744         }
   1745         else if ( ch == '+' ) {
   1746             startinpos = s-starts;
   1747             s++; /* consume '+' */
   1748             if (s < e && *s == '-') { /* '+-' encodes '+' */
   1749                 s++;
   1750                 *p++ = '+';
   1751             }
   1752             else { /* begin base64-encoded section */
   1753                 inShift = 1;
   1754                 shiftOutStart = p;
   1755                 base64bits = 0;
   1756                 base64buffer = 0;
   1757             }
   1758         }
   1759         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
   1760             *p++ = ch;
   1761             s++;
   1762         }
   1763         else {
   1764             startinpos = s-starts;
   1765             s++;
   1766             errmsg = "unexpected special character";
   1767             goto utf7Error;
   1768         }
   1769         continue;
   1770 utf7Error:
   1771         outpos = p-PyUnicode_AS_UNICODE(unicode);
   1772         endinpos = s-starts;
   1773         if (unicode_decode_call_errorhandler(
   1774                 errors, &errorHandler,
   1775                 "utf7", errmsg,
   1776                 starts, size, &startinpos, &endinpos, &exc, &s,
   1777                 &unicode, &outpos, &p))
   1778             goto onError;
   1779     }
   1780 
   1781     /* end of string */
   1782 
   1783     if (inShift && !consumed) { /* in shift sequence, no more to follow */
   1784         /* if we're in an inconsistent state, that's an error */
   1785         if (surrogate ||
   1786                 (base64bits >= 6) ||
   1787                 (base64bits > 0 && base64buffer != 0)) {
   1788             outpos = p-PyUnicode_AS_UNICODE(unicode);
   1789             endinpos = size;
   1790             if (unicode_decode_call_errorhandler(
   1791                     errors, &errorHandler,
   1792                     "utf7", "unterminated shift sequence",
   1793                     starts, size, &startinpos, &endinpos, &exc, &s,
   1794                     &unicode, &outpos, &p))
   1795                 goto onError;
   1796         }
   1797     }
   1798 
   1799     /* return state */
   1800     if (consumed) {
   1801         if (inShift) {
   1802             p = shiftOutStart; /* back off output */
   1803             *consumed = startinpos;
   1804         }
   1805         else {
   1806             *consumed = s-starts;
   1807         }
   1808     }
   1809 
   1810     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
   1811         goto onError;
   1812 
   1813     Py_XDECREF(errorHandler);
   1814     Py_XDECREF(exc);
   1815     return (PyObject *)unicode;
   1816 
   1817   onError:
   1818     Py_XDECREF(errorHandler);
   1819     Py_XDECREF(exc);
   1820     Py_DECREF(unicode);
   1821     return NULL;
   1822 }
   1823 
   1824 
   1825 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
   1826                                Py_ssize_t size,
   1827                                int base64SetO,
   1828                                int base64WhiteSpace,
   1829                                const char *errors)
   1830 {
   1831     PyObject *v;
   1832     /* It might be possible to tighten this worst case */
   1833     Py_ssize_t allocated = 8 * size;
   1834     int inShift = 0;
   1835     Py_ssize_t i = 0;
   1836     unsigned int base64bits = 0;
   1837     unsigned long base64buffer = 0;
   1838     char * out;
   1839     char * start;
   1840 
   1841     if (allocated / 8 != size)
   1842         return PyErr_NoMemory();
   1843 
   1844     if (size == 0)
   1845         return PyString_FromStringAndSize(NULL, 0);
   1846 
   1847     v = PyString_FromStringAndSize(NULL, allocated);
   1848     if (v == NULL)
   1849         return NULL;
   1850 
   1851     start = out = PyString_AS_STRING(v);
   1852     for (;i < size; ++i) {
   1853         Py_UNICODE ch = s[i];
   1854 
   1855         if (inShift) {
   1856             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
   1857                 /* shifting out */
   1858                 if (base64bits) { /* output remaining bits */
   1859                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
   1860                     base64buffer = 0;
   1861                     base64bits = 0;
   1862                 }
   1863                 inShift = 0;
   1864                 /* Characters not in the BASE64 set implicitly unshift the sequence
   1865                    so no '-' is required, except if the character is itself a '-' */
   1866                 if (IS_BASE64(ch) || ch == '-') {
   1867                     *out++ = '-';
   1868                 }
   1869                 *out++ = (char) ch;
   1870             }
   1871             else {
   1872                 goto encode_char;
   1873             }
   1874         }
   1875         else { /* not in a shift sequence */
   1876             if (ch == '+') {
   1877                 *out++ = '+';
   1878                         *out++ = '-';
   1879             }
   1880             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
   1881                 *out++ = (char) ch;
   1882             }
   1883             else {
   1884                 *out++ = '+';
   1885                 inShift = 1;
   1886                 goto encode_char;
   1887             }
   1888         }
   1889         continue;
   1890 encode_char:
   1891 #ifdef Py_UNICODE_WIDE
   1892         if (ch >= 0x10000) {
   1893             /* code first surrogate */
   1894             base64bits += 16;
   1895             base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
   1896             while (base64bits >= 6) {
   1897                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
   1898                 base64bits -= 6;
   1899             }
   1900             /* prepare second surrogate */
   1901             ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
   1902         }
   1903 #endif
   1904         base64bits += 16;
   1905         base64buffer = (base64buffer << 16) | ch;
   1906         while (base64bits >= 6) {
   1907             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
   1908             base64bits -= 6;
   1909         }
   1910     }
   1911     if (base64bits)
   1912         *out++= TO_BASE64(base64buffer << (6-base64bits) );
   1913     if (inShift)
   1914         *out++ = '-';
   1915 
   1916     if (_PyString_Resize(&v, out - start))
   1917         return NULL;
   1918     return v;
   1919 }
   1920 
   1921 #undef IS_BASE64
   1922 #undef FROM_BASE64
   1923 #undef TO_BASE64
   1924 #undef DECODE_DIRECT
   1925 #undef ENCODE_DIRECT
   1926 
   1927 /* --- UTF-8 Codec -------------------------------------------------------- */
   1928 
   1929 static
   1930 char utf8_code_length[256] = {
   1931     /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   1932        illegal prefix.  See RFC 3629 for details */
   1933     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
   1934     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1935     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1936     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1937     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1938     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1939     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1940     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
   1941     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
   1942     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1943     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1944     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
   1945     0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
   1946     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
   1947     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
   1948     4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
   1949 };
   1950 
   1951 PyObject *PyUnicode_DecodeUTF8(const char *s,
   1952                                Py_ssize_t size,
   1953                                const char *errors)
   1954 {
   1955     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
   1956 }
   1957 
   1958 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
   1959                                        Py_ssize_t size,
   1960                                        const char *errors,
   1961                                        Py_ssize_t *consumed)
   1962 {
   1963     const char *starts = s;
   1964     int n;
   1965     int k;
   1966     Py_ssize_t startinpos;
   1967     Py_ssize_t endinpos;
   1968     Py_ssize_t outpos;
   1969     const char *e;
   1970     PyUnicodeObject *unicode;
   1971     Py_UNICODE *p;
   1972     const char *errmsg = "";
   1973     PyObject *errorHandler = NULL;
   1974     PyObject *exc = NULL;
   1975 
   1976     /* Note: size will always be longer than the resulting Unicode
   1977        character count */
   1978     unicode = _PyUnicode_New(size);
   1979     if (!unicode)
   1980         return NULL;
   1981     if (size == 0) {
   1982         if (consumed)
   1983             *consumed = 0;
   1984         return (PyObject *)unicode;
   1985     }
   1986 
   1987     /* Unpack UTF-8 encoded data */
   1988     p = unicode->str;
   1989     e = s + size;
   1990 
   1991     while (s < e) {
   1992         Py_UCS4 ch = (unsigned char)*s;
   1993 
   1994         if (ch < 0x80) {
   1995             *p++ = (Py_UNICODE)ch;
   1996             s++;
   1997             continue;
   1998         }
   1999 
   2000         n = utf8_code_length[ch];
   2001 
   2002         if (s + n > e) {
   2003             if (consumed)
   2004                 break;
   2005             else {
   2006                 errmsg = "unexpected end of data";
   2007                 startinpos = s-starts;
   2008                 endinpos = startinpos+1;
   2009                 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
   2010                     endinpos++;
   2011                 goto utf8Error;
   2012             }
   2013         }
   2014 
   2015         switch (n) {
   2016 
   2017         case 0:
   2018             errmsg = "invalid start byte";
   2019             startinpos = s-starts;
   2020             endinpos = startinpos+1;
   2021             goto utf8Error;
   2022 
   2023         case 1:
   2024             errmsg = "internal error";
   2025             startinpos = s-starts;
   2026             endinpos = startinpos+1;
   2027             goto utf8Error;
   2028 
   2029         case 2:
   2030             if ((s[1] & 0xc0) != 0x80) {
   2031                 errmsg = "invalid continuation byte";
   2032                 startinpos = s-starts;
   2033                 endinpos = startinpos + 1;
   2034                 goto utf8Error;
   2035             }
   2036             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
   2037             assert ((ch > 0x007F) && (ch <= 0x07FF));
   2038             *p++ = (Py_UNICODE)ch;
   2039             break;
   2040 
   2041         case 3:
   2042             /* XXX: surrogates shouldn't be valid UTF-8!
   2043                see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
   2044                (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
   2045                Uncomment the 2 lines below to make them invalid,
   2046                code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
   2047             if ((s[1] & 0xc0) != 0x80 ||
   2048                 (s[2] & 0xc0) != 0x80 ||
   2049                 ((unsigned char)s[0] == 0xE0 &&
   2050                  (unsigned char)s[1] < 0xA0)/* ||
   2051                 ((unsigned char)s[0] == 0xED &&
   2052                  (unsigned char)s[1] > 0x9F)*/) {
   2053                 errmsg = "invalid continuation byte";
   2054                 startinpos = s-starts;
   2055                 endinpos = startinpos + 1;
   2056 
   2057                 /* if s[1] first two bits are 1 and 0, then the invalid
   2058                    continuation byte is s[2], so increment endinpos by 1,
   2059                    if not, s[1] is invalid and endinpos doesn't need to
   2060                    be incremented. */
   2061                 if ((s[1] & 0xC0) == 0x80)
   2062                     endinpos++;
   2063                 goto utf8Error;
   2064             }
   2065             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
   2066             assert ((ch > 0x07FF) && (ch <= 0xFFFF));
   2067             *p++ = (Py_UNICODE)ch;
   2068             break;
   2069 
   2070         case 4:
   2071             if ((s[1] & 0xc0) != 0x80 ||
   2072                 (s[2] & 0xc0) != 0x80 ||
   2073                 (s[3] & 0xc0) != 0x80 ||
   2074                 ((unsigned char)s[0] == 0xF0 &&
   2075                  (unsigned char)s[1] < 0x90) ||
   2076                 ((unsigned char)s[0] == 0xF4 &&
   2077                  (unsigned char)s[1] > 0x8F)) {
   2078                 errmsg = "invalid continuation byte";
   2079                 startinpos = s-starts;
   2080                 endinpos = startinpos + 1;
   2081                 if ((s[1] & 0xC0) == 0x80) {
   2082                     endinpos++;
   2083                     if ((s[2] & 0xC0) == 0x80)
   2084                         endinpos++;
   2085                 }
   2086                 goto utf8Error;
   2087             }
   2088             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
   2089                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
   2090             assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
   2091 
   2092 #ifdef Py_UNICODE_WIDE
   2093             *p++ = (Py_UNICODE)ch;
   2094 #else
   2095             /*  compute and append the two surrogates: */
   2096 
   2097             /*  translate from 10000..10FFFF to 0..FFFF */
   2098             ch -= 0x10000;
   2099 
   2100             /*  high surrogate = top 10 bits added to D800 */
   2101             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
   2102 
   2103             /*  low surrogate = bottom 10 bits added to DC00 */
   2104             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
   2105 #endif
   2106             break;
   2107         }
   2108         s += n;
   2109         continue;
   2110 
   2111       utf8Error:
   2112         outpos = p-PyUnicode_AS_UNICODE(unicode);
   2113         if (unicode_decode_call_errorhandler(
   2114                 errors, &errorHandler,
   2115                 "utf8", errmsg,
   2116                 starts, size, &startinpos, &endinpos, &exc, &s,
   2117                 &unicode, &outpos, &p))
   2118             goto onError;
   2119     }
   2120     if (consumed)
   2121         *consumed = s-starts;
   2122 
   2123     /* Adjust length */
   2124     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
   2125         goto onError;
   2126 
   2127     Py_XDECREF(errorHandler);
   2128     Py_XDECREF(exc);
   2129     return (PyObject *)unicode;
   2130 
   2131   onError:
   2132     Py_XDECREF(errorHandler);
   2133     Py_XDECREF(exc);
   2134     Py_DECREF(unicode);
   2135     return NULL;
   2136 }
   2137 
   2138 /* Allocation strategy:  if the string is short, convert into a stack buffer
   2139    and allocate exactly as much space needed at the end.  Else allocate the
   2140    maximum possible needed (4 result bytes per Unicode character), and return
   2141    the excess memory at the end.
   2142 */
   2143 PyObject *
   2144 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
   2145                      Py_ssize_t size,
   2146                      const char *errors)
   2147 {
   2148 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
   2149 
   2150     Py_ssize_t i;           /* index into s of next input byte */
   2151     PyObject *v;        /* result string object */
   2152     char *p;            /* next free byte in output buffer */
   2153     Py_ssize_t nallocated;  /* number of result bytes allocated */
   2154     Py_ssize_t nneeded;        /* number of result bytes needed */
   2155     char stackbuf[MAX_SHORT_UNICHARS * 4];
   2156 
   2157     assert(s != NULL);
   2158     assert(size >= 0);
   2159 
   2160     if (size <= MAX_SHORT_UNICHARS) {
   2161         /* Write into the stack buffer; nallocated can't overflow.
   2162          * At the end, we'll allocate exactly as much heap space as it
   2163          * turns out we need.
   2164          */
   2165         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
   2166         v = NULL;   /* will allocate after we're done */
   2167         p = stackbuf;
   2168     }
   2169     else {
   2170         /* Overallocate on the heap, and give the excess back at the end. */
   2171         nallocated = size * 4;
   2172         if (nallocated / 4 != size)  /* overflow! */
   2173             return PyErr_NoMemory();
   2174         v = PyString_FromStringAndSize(NULL, nallocated);
   2175         if (v == NULL)
   2176             return NULL;
   2177         p = PyString_AS_STRING(v);
   2178     }
   2179 
   2180     for (i = 0; i < size;) {
   2181         Py_UCS4 ch = s[i++];
   2182 
   2183         if (ch < 0x80)
   2184             /* Encode ASCII */
   2185             *p++ = (char) ch;
   2186 
   2187         else if (ch < 0x0800) {
   2188             /* Encode Latin-1 */
   2189             *p++ = (char)(0xc0 | (ch >> 6));
   2190             *p++ = (char)(0x80 | (ch & 0x3f));
   2191         }
   2192         else {
   2193             /* Encode UCS2 Unicode ordinals */
   2194             if (ch < 0x10000) {
   2195                 /* Special case: check for high surrogate */
   2196                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
   2197                     Py_UCS4 ch2 = s[i];
   2198                     /* Check for low surrogate and combine the two to
   2199                        form a UCS4 value */
   2200                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
   2201                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
   2202                         i++;
   2203                         goto encodeUCS4;
   2204                     }
   2205                     /* Fall through: handles isolated high surrogates */
   2206                 }
   2207                 *p++ = (char)(0xe0 | (ch >> 12));
   2208                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
   2209                 *p++ = (char)(0x80 | (ch & 0x3f));
   2210                 continue;
   2211             }
   2212           encodeUCS4:
   2213             /* Encode UCS4 Unicode ordinals */
   2214             *p++ = (char)(0xf0 | (ch >> 18));
   2215             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
   2216             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
   2217             *p++ = (char)(0x80 | (ch & 0x3f));
   2218         }
   2219     }
   2220 
   2221     if (v == NULL) {
   2222         /* This was stack allocated. */
   2223         nneeded = p - stackbuf;
   2224         assert(nneeded <= nallocated);
   2225         v = PyString_FromStringAndSize(stackbuf, nneeded);
   2226     }
   2227     else {
   2228         /* Cut back to size actually needed. */
   2229         nneeded = p - PyString_AS_STRING(v);
   2230         assert(nneeded <= nallocated);
   2231         if (_PyString_Resize(&v, nneeded))
   2232             return NULL;
   2233     }
   2234     return v;
   2235 
   2236 #undef MAX_SHORT_UNICHARS
   2237 }
   2238 
   2239 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
   2240 {
   2241     if (!PyUnicode_Check(unicode)) {
   2242         PyErr_BadArgument();
   2243         return NULL;
   2244     }
   2245     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
   2246                                 PyUnicode_GET_SIZE(unicode),
   2247                                 NULL);
   2248 }
   2249 
   2250 /* --- UTF-32 Codec ------------------------------------------------------- */
   2251 
   2252 PyObject *
   2253 PyUnicode_DecodeUTF32(const char *s,
   2254                       Py_ssize_t size,
   2255                       const char *errors,
   2256                       int *byteorder)
   2257 {
   2258     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
   2259 }
   2260 
   2261 PyObject *
   2262 PyUnicode_DecodeUTF32Stateful(const char *s,
   2263                               Py_ssize_t size,
   2264                               const char *errors,
   2265                               int *byteorder,
   2266                               Py_ssize_t *consumed)
   2267 {
   2268     const char *starts = s;
   2269     Py_ssize_t startinpos;
   2270     Py_ssize_t endinpos;
   2271     Py_ssize_t outpos;
   2272     PyUnicodeObject *unicode;
   2273     Py_UNICODE *p;
   2274 #ifndef Py_UNICODE_WIDE
   2275     int pairs = 0;
   2276     const unsigned char *qq;
   2277 #else
   2278     const int pairs = 0;
   2279 #endif
   2280     const unsigned char *q, *e;
   2281     int bo = 0;       /* assume native ordering by default */
   2282     const char *errmsg = "";
   2283     /* Offsets from q for retrieving bytes in the right order. */
   2284 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
   2285     int iorder[] = {0, 1, 2, 3};
   2286 #else
   2287     int iorder[] = {3, 2, 1, 0};
   2288 #endif
   2289     PyObject *errorHandler = NULL;
   2290     PyObject *exc = NULL;
   2291 
   2292     q = (unsigned char *)s;
   2293     e = q + size;
   2294 
   2295     if (byteorder)
   2296         bo = *byteorder;
   2297 
   2298     /* Check for BOM marks (U+FEFF) in the input and adjust current
   2299        byte order setting accordingly. In native mode, the leading BOM
   2300        mark is skipped, in all other modes, it is copied to the output
   2301        stream as-is (giving a ZWNBSP character). */
   2302     if (bo == 0) {
   2303         if (size >= 4) {
   2304             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
   2305                 (q[iorder[1]] << 8) | q[iorder[0]];
   2306 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
   2307             if (bom == 0x0000FEFF) {
   2308                 q += 4;
   2309                 bo = -1;
   2310             }
   2311             else if (bom == 0xFFFE0000) {
   2312                 q += 4;
   2313                 bo = 1;
   2314             }
   2315 #else
   2316             if (bom == 0x0000FEFF) {
   2317                 q += 4;
   2318                 bo = 1;
   2319             }
   2320             else if (bom == 0xFFFE0000) {
   2321                 q += 4;
   2322                 bo = -1;
   2323             }
   2324 #endif
   2325         }
   2326     }
   2327 
   2328     if (bo == -1) {
   2329         /* force LE */
   2330         iorder[0] = 0;
   2331         iorder[1] = 1;
   2332         iorder[2] = 2;
   2333         iorder[3] = 3;
   2334     }
   2335     else if (bo == 1) {
   2336         /* force BE */
   2337         iorder[0] = 3;
   2338         iorder[1] = 2;
   2339         iorder[2] = 1;
   2340         iorder[3] = 0;
   2341     }
   2342 
   2343     /* On narrow builds we split characters outside the BMP into two
   2344        code points => count how much extra space we need. */
   2345 #ifndef Py_UNICODE_WIDE
   2346     for (qq = q; e - qq >= 4; qq += 4)
   2347         if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
   2348             pairs++;
   2349 #endif
   2350 
   2351     /* This might be one to much, because of a BOM */
   2352     unicode = _PyUnicode_New((size+3)/4+pairs);
   2353     if (!unicode)
   2354         return NULL;
   2355     if (size == 0)
   2356         return (PyObject *)unicode;
   2357 
   2358     /* Unpack UTF-32 encoded data */
   2359     p = unicode->str;
   2360 
   2361     while (q < e) {
   2362         Py_UCS4 ch;
   2363         /* remaining bytes at the end? (size should be divisible by 4) */
   2364         if (e-q<4) {
   2365             if (consumed)
   2366                 break;
   2367             errmsg = "truncated data";
   2368             startinpos = ((const char *)q)-starts;
   2369             endinpos = ((const char *)e)-starts;
   2370             goto utf32Error;
   2371             /* The remaining input chars are ignored if the callback
   2372                chooses to skip the input */
   2373         }
   2374         ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
   2375             (q[iorder[1]] << 8) | q[iorder[0]];
   2376 
   2377         if (ch >= 0x110000)
   2378         {
   2379             errmsg = "code point not in range(0x110000)";
   2380             startinpos = ((const char *)q)-starts;
   2381             endinpos = startinpos+4;
   2382             goto utf32Error;
   2383         }
   2384 #ifndef Py_UNICODE_WIDE
   2385         if (ch >= 0x10000)
   2386         {
   2387             *p++ = 0xD800 | ((ch-0x10000) >> 10);
   2388             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
   2389         }
   2390         else
   2391 #endif
   2392             *p++ = ch;
   2393         q += 4;
   2394         continue;
   2395       utf32Error:
   2396         outpos = p-PyUnicode_AS_UNICODE(unicode);
   2397         if (unicode_decode_call_errorhandler(
   2398                 errors, &errorHandler,
   2399                 "utf32", errmsg,
   2400                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
   2401                 &unicode, &outpos, &p))
   2402             goto onError;
   2403     }
   2404 
   2405     if (byteorder)
   2406         *byteorder = bo;
   2407 
   2408     if (consumed)
   2409         *consumed = (const char *)q-starts;
   2410 
   2411     /* Adjust length */
   2412     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
   2413         goto onError;
   2414 
   2415     Py_XDECREF(errorHandler);
   2416     Py_XDECREF(exc);
   2417     return (PyObject *)unicode;
   2418 
   2419   onError:
   2420     Py_DECREF(unicode);
   2421     Py_XDECREF(errorHandler);
   2422     Py_XDECREF(exc);
   2423     return NULL;
   2424 }
   2425 
   2426 PyObject *
   2427 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
   2428                       Py_ssize_t size,
   2429                       const char *errors,
   2430                       int byteorder)
   2431 {
   2432     PyObject *v;
   2433     unsigned char *p;
   2434     Py_ssize_t nsize, bytesize;
   2435 #ifndef Py_UNICODE_WIDE
   2436     Py_ssize_t i, pairs;
   2437 #else
   2438     const int pairs = 0;
   2439 #endif
   2440     /* Offsets from p for storing byte pairs in the right order. */
   2441 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
   2442     int iorder[] = {0, 1, 2, 3};
   2443 #else
   2444     int iorder[] = {3, 2, 1, 0};
   2445 #endif
   2446 
   2447 #define STORECHAR(CH)                           \
   2448     do {                                        \
   2449         p[iorder[3]] = ((CH) >> 24) & 0xff;     \
   2450         p[iorder[2]] = ((CH) >> 16) & 0xff;     \
   2451         p[iorder[1]] = ((CH) >> 8) & 0xff;      \
   2452         p[iorder[0]] = (CH) & 0xff;             \
   2453         p += 4;                                 \
   2454     } while(0)
   2455 
   2456     /* In narrow builds we can output surrogate pairs as one code point,
   2457        so we need less space. */
   2458 #ifndef Py_UNICODE_WIDE
   2459     for (i = pairs = 0; i < size-1; i++)
   2460         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
   2461             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
   2462             pairs++;
   2463 #endif
   2464     nsize = (size - pairs + (byteorder == 0));
   2465     bytesize = nsize * 4;
   2466     if (bytesize / 4 != nsize)
   2467         return PyErr_NoMemory();
   2468     v = PyString_FromStringAndSize(NULL, bytesize);
   2469     if (v == NULL)
   2470         return NULL;
   2471 
   2472     p = (unsigned char *)PyString_AS_STRING(v);
   2473     if (byteorder == 0)
   2474         STORECHAR(0xFEFF);
   2475     if (size == 0)
   2476         return v;
   2477 
   2478     if (byteorder == -1) {
   2479         /* force LE */
   2480         iorder[0] = 0;
   2481         iorder[1] = 1;
   2482         iorder[2] = 2;
   2483         iorder[3] = 3;
   2484     }
   2485     else if (byteorder == 1) {
   2486         /* force BE */
   2487         iorder[0] = 3;
   2488         iorder[1] = 2;
   2489         iorder[2] = 1;
   2490         iorder[3] = 0;
   2491     }
   2492 
   2493     while (size-- > 0) {
   2494         Py_UCS4 ch = *s++;
   2495 #ifndef Py_UNICODE_WIDE
   2496         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
   2497             Py_UCS4 ch2 = *s;
   2498             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
   2499                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
   2500                 s++;
   2501                 size--;
   2502             }
   2503         }
   2504 #endif
   2505         STORECHAR(ch);
   2506     }
   2507     return v;
   2508 #undef STORECHAR
   2509 }
   2510 
   2511 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
   2512 {
   2513     if (!PyUnicode_Check(unicode)) {
   2514         PyErr_BadArgument();
   2515         return NULL;
   2516     }
   2517     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
   2518                                  PyUnicode_GET_SIZE(unicode),
   2519                                  NULL,
   2520                                  0);
   2521 }
   2522 
   2523 /* --- UTF-16 Codec ------------------------------------------------------- */
   2524 
   2525 PyObject *
   2526 PyUnicode_DecodeUTF16(const char *s,
   2527                       Py_ssize_t size,
   2528                       const char *errors,
   2529                       int *byteorder)
   2530 {
   2531     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
   2532 }
   2533 
   2534 PyObject *
   2535 PyUnicode_DecodeUTF16Stateful(const char *s,
   2536                               Py_ssize_t size,
   2537                               const char *errors,
   2538                               int *byteorder,
   2539                               Py_ssize_t *consumed)
   2540 {
   2541     const char *starts = s;
   2542     Py_ssize_t startinpos;
   2543     Py_ssize_t endinpos;
   2544     Py_ssize_t outpos;
   2545     PyUnicodeObject *unicode;
   2546     Py_UNICODE *p;
   2547     const unsigned char *q, *e;
   2548     int bo = 0;       /* assume native ordering by default */
   2549     const char *errmsg = "";
   2550     /* Offsets from q for retrieving byte pairs in the right order. */
   2551 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
   2552     int ihi = 1, ilo = 0;
   2553 #else
   2554     int ihi = 0, ilo = 1;
   2555 #endif
   2556     PyObject *errorHandler = NULL;
   2557     PyObject *exc = NULL;
   2558 
   2559     /* Note: size will always be longer than the resulting Unicode
   2560        character count */
   2561     unicode = _PyUnicode_New(size);
   2562     if (!unicode)
   2563         return NULL;
   2564     if (size == 0)
   2565         return (PyObject *)unicode;
   2566 
   2567     /* Unpack UTF-16 encoded data */
   2568     p = unicode->str;
   2569     q = (unsigned char *)s;
   2570     e = q + size;
   2571 
   2572     if (byteorder)
   2573         bo = *byteorder;
   2574 
   2575     /* Check for BOM marks (U+FEFF) in the input and adjust current
   2576        byte order setting accordingly. In native mode, the leading BOM
   2577        mark is skipped, in all other modes, it is copied to the output
   2578        stream as-is (giving a ZWNBSP character). */
   2579     if (bo == 0) {
   2580         if (size >= 2) {
   2581             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
   2582 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
   2583             if (bom == 0xFEFF) {
   2584                 q += 2;
   2585                 bo = -1;
   2586             }
   2587             else if (bom == 0xFFFE) {
   2588                 q += 2;
   2589                 bo = 1;
   2590             }
   2591 #else
   2592             if (bom == 0xFEFF) {
   2593                 q += 2;
   2594                 bo = 1;
   2595             }
   2596             else if (bom == 0xFFFE) {
   2597                 q += 2;
   2598                 bo = -1;
   2599             }
   2600 #endif
   2601         }
   2602     }
   2603 
   2604     if (bo == -1) {
   2605         /* force LE */
   2606         ihi = 1;
   2607         ilo = 0;
   2608     }
   2609     else if (bo == 1) {
   2610         /* force BE */
   2611         ihi = 0;
   2612         ilo = 1;
   2613     }
   2614 
   2615     while (q < e) {
   2616         Py_UNICODE ch;
   2617         /* remaining bytes at the end? (size should be even) */
   2618         if (e-q<2) {
   2619             if (consumed)
   2620                 break;
   2621             errmsg = "truncated data";
   2622             startinpos = ((const char *)q)-starts;
   2623             endinpos = ((const char *)e)-starts;
   2624             goto utf16Error;
   2625             /* The remaining input chars are ignored if the callback
   2626                chooses to skip the input */
   2627         }
   2628         ch = (q[ihi] << 8) | q[ilo];
   2629 
   2630         q += 2;
   2631 
   2632         if (ch < 0xD800 || ch > 0xDFFF) {
   2633             *p++ = ch;
   2634             continue;
   2635         }
   2636 
   2637         /* UTF-16 code pair: */
   2638         if (e - q < 2) {
   2639             q -= 2;
   2640             if (consumed)
   2641                 break;
   2642             errmsg = "unexpected end of data";
   2643             startinpos = ((const char *)q)-starts;
   2644             endinpos = ((const char *)e)-starts;
   2645             goto utf16Error;
   2646         }
   2647         if (0xD800 <= ch && ch <= 0xDBFF) {
   2648             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
   2649             q += 2;
   2650             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
   2651 #ifndef Py_UNICODE_WIDE
   2652                 *p++ = ch;
   2653                 *p++ = ch2;
   2654 #else
   2655                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
   2656 #endif
   2657                 continue;
   2658             }
   2659             else {
   2660                 errmsg = "illegal UTF-16 surrogate";
   2661                 startinpos = (((const char *)q)-4)-starts;
   2662                 endinpos = startinpos+2;
   2663                 goto utf16Error;
   2664             }
   2665 
   2666         }
   2667         errmsg = "illegal encoding";
   2668         startinpos = (((const char *)q)-2)-starts;
   2669         endinpos = startinpos+2;
   2670         /* Fall through to report the error */
   2671 
   2672       utf16Error:
   2673         outpos = p-PyUnicode_AS_UNICODE(unicode);
   2674         if (unicode_decode_call_errorhandler(
   2675                 errors, &errorHandler,
   2676                 "utf16", errmsg,
   2677                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
   2678                 &unicode, &outpos, &p))
   2679             goto onError;
   2680     }
   2681 
   2682     if (byteorder)
   2683         *byteorder = bo;
   2684 
   2685     if (consumed)
   2686         *consumed = (const char *)q-starts;
   2687 
   2688     /* Adjust length */
   2689     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
   2690         goto onError;
   2691 
   2692     Py_XDECREF(errorHandler);
   2693     Py_XDECREF(exc);
   2694     return (PyObject *)unicode;
   2695 
   2696   onError:
   2697     Py_DECREF(unicode);
   2698     Py_XDECREF(errorHandler);
   2699     Py_XDECREF(exc);
   2700     return NULL;
   2701 }
   2702 
   2703 PyObject *
   2704 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
   2705                       Py_ssize_t size,
   2706                       const char *errors,
   2707                       int byteorder)
   2708 {
   2709     PyObject *v;
   2710     unsigned char *p;
   2711     Py_ssize_t nsize, bytesize;
   2712 #ifdef Py_UNICODE_WIDE
   2713     Py_ssize_t i, pairs;
   2714 #else
   2715     const int pairs = 0;
   2716 #endif
   2717     /* Offsets from p for storing byte pairs in the right order. */
   2718 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
   2719     int ihi = 1, ilo = 0;
   2720 #else
   2721     int ihi = 0, ilo = 1;
   2722 #endif
   2723 
   2724 #define STORECHAR(CH)                           \
   2725     do {                                        \
   2726         p[ihi] = ((CH) >> 8) & 0xff;            \
   2727         p[ilo] = (CH) & 0xff;                   \
   2728         p += 2;                                 \
   2729     } while(0)
   2730 
   2731 #ifdef Py_UNICODE_WIDE
   2732     for (i = pairs = 0; i < size; i++)
   2733         if (s[i] >= 0x10000)
   2734             pairs++;
   2735 #endif
   2736     /* 2 * (size + pairs + (byteorder == 0)) */
   2737     if (size > PY_SSIZE_T_MAX ||
   2738         size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
   2739         return PyErr_NoMemory();
   2740     nsize = size + pairs + (byteorder == 0);
   2741     bytesize = nsize * 2;
   2742     if (bytesize / 2 != nsize)
   2743         return PyErr_NoMemory();
   2744     v = PyString_FromStringAndSize(NULL, bytesize);
   2745     if (v == NULL)
   2746         return NULL;
   2747 
   2748     p = (unsigned char *)PyString_AS_STRING(v);
   2749     if (byteorder == 0)
   2750         STORECHAR(0xFEFF);
   2751     if (size == 0)
   2752         return v;
   2753 
   2754     if (byteorder == -1) {
   2755         /* force LE */
   2756         ihi = 1;
   2757         ilo = 0;
   2758     }
   2759     else if (byteorder == 1) {
   2760         /* force BE */
   2761         ihi = 0;
   2762         ilo = 1;
   2763     }
   2764 
   2765     while (size-- > 0) {
   2766         Py_UNICODE ch = *s++;
   2767         Py_UNICODE ch2 = 0;
   2768 #ifdef Py_UNICODE_WIDE
   2769         if (ch >= 0x10000) {
   2770             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
   2771             ch  = 0xD800 | ((ch-0x10000) >> 10);
   2772         }
   2773 #endif
   2774         STORECHAR(ch);
   2775         if (ch2)
   2776             STORECHAR(ch2);
   2777     }
   2778     return v;
   2779 #undef STORECHAR
   2780 }
   2781 
   2782 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
   2783 {
   2784     if (!PyUnicode_Check(unicode)) {
   2785         PyErr_BadArgument();
   2786         return NULL;
   2787     }
   2788     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
   2789                                  PyUnicode_GET_SIZE(unicode),
   2790                                  NULL,
   2791                                  0);
   2792 }
   2793 
   2794 /* --- Unicode Escape Codec ----------------------------------------------- */
   2795 
   2796 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
   2797 
   2798 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
   2799                                         Py_ssize_t size,
   2800                                         const char *errors)
   2801 {
   2802     const char *starts = s;
   2803     Py_ssize_t startinpos;
   2804     Py_ssize_t endinpos;
   2805     Py_ssize_t outpos;
   2806     PyUnicodeObject *v;
   2807     Py_UNICODE *p;
   2808     const char *end;
   2809     char* message;
   2810     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
   2811     PyObject *errorHandler = NULL;
   2812     PyObject *exc = NULL;
   2813 
   2814     /* Escaped strings will always be longer than the resulting
   2815        Unicode string, so we start with size here and then reduce the
   2816        length after conversion to the true value.
   2817        (but if the error callback returns a long replacement string
   2818        we'll have to allocate more space) */
   2819     v = _PyUnicode_New(size);
   2820     if (v == NULL)
   2821         goto onError;
   2822     if (size == 0)
   2823         return (PyObject *)v;
   2824 
   2825     p = PyUnicode_AS_UNICODE(v);
   2826     end = s + size;
   2827 
   2828     while (s < end) {
   2829         unsigned char c;
   2830         Py_UNICODE x;
   2831         int digits;
   2832 
   2833         /* Non-escape characters are interpreted as Unicode ordinals */
   2834         if (*s != '\\') {
   2835             *p++ = (unsigned char) *s++;
   2836             continue;
   2837         }
   2838 
   2839         startinpos = s-starts;
   2840         /* \ - Escapes */
   2841         s++;
   2842         c = *s++;
   2843         if (s > end)
   2844             c = '\0'; /* Invalid after \ */
   2845         switch (c) {
   2846 
   2847             /* \x escapes */
   2848         case '\n': break;
   2849         case '\\': *p++ = '\\'; break;
   2850         case '\'': *p++ = '\''; break;
   2851         case '\"': *p++ = '\"'; break;
   2852         case 'b': *p++ = '\b'; break;
   2853         case 'f': *p++ = '\014'; break; /* FF */
   2854         case 't': *p++ = '\t'; break;
   2855         case 'n': *p++ = '\n'; break;
   2856         case 'r': *p++ = '\r'; break;
   2857         case 'v': *p++ = '\013'; break; /* VT */
   2858         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
   2859 
   2860             /* \OOO (octal) escapes */
   2861         case '0': case '1': case '2': case '3':
   2862         case '4': case '5': case '6': case '7':
   2863             x = s[-1] - '0';
   2864             if (s < end && '0' <= *s && *s <= '7') {
   2865                 x = (x<<3) + *s++ - '0';
   2866                 if (s < end && '0' <= *s && *s <= '7')
   2867                     x = (x<<3) + *s++ - '0';
   2868             }
   2869             *p++ = x;
   2870             break;
   2871 
   2872             /* hex escapes */
   2873             /* \xXX */
   2874         case 'x':
   2875             digits = 2;
   2876             message = "truncated \\xXX escape";
   2877             goto hexescape;
   2878 
   2879             /* \uXXXX */
   2880         case 'u':
   2881             digits = 4;
   2882             message = "truncated \\uXXXX escape";
   2883             goto hexescape;
   2884 
   2885             /* \UXXXXXXXX */
   2886         case 'U':
   2887             digits = 8;
   2888             message = "truncated \\UXXXXXXXX escape";
   2889         hexescape:
   2890             chr = 0;
   2891             if (end - s < digits) {
   2892                 /* count only hex digits */
   2893                 for (; s < end; ++s) {
   2894                     c = (unsigned char)*s;
   2895                     if (!Py_ISXDIGIT(c))
   2896                         goto error;
   2897                 }
   2898                 goto error;
   2899             }
   2900             for (; digits--; ++s) {
   2901                 c = (unsigned char)*s;
   2902                 if (!Py_ISXDIGIT(c))
   2903                     goto error;
   2904                 chr = (chr<<4) & ~0xF;
   2905                 if (c >= '0' && c <= '9')
   2906                     chr += c - '0';
   2907                 else if (c >= 'a' && c <= 'f')
   2908                     chr += 10 + c - 'a';
   2909                 else
   2910                     chr += 10 + c - 'A';
   2911             }
   2912             if (chr == 0xffffffff && PyErr_Occurred())
   2913                 /* _decoding_error will have already written into the
   2914                    target buffer. */
   2915                 break;
   2916         store:
   2917             /* when we get here, chr is a 32-bit unicode character */
   2918             if (chr <= 0xffff)
   2919                 /* UCS-2 character */
   2920                 *p++ = (Py_UNICODE) chr;
   2921             else if (chr <= 0x10ffff) {
   2922                 /* UCS-4 character. Either store directly, or as
   2923                    surrogate pair. */
   2924 #ifdef Py_UNICODE_WIDE
   2925                 *p++ = chr;
   2926 #else
   2927                 chr -= 0x10000L;
   2928                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
   2929                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
   2930 #endif
   2931             } else {
   2932                 message = "illegal Unicode character";
   2933                 goto error;
   2934             }
   2935             break;
   2936 
   2937             /* \N{name} */
   2938         case 'N':
   2939             message = "malformed \\N character escape";
   2940             if (ucnhash_CAPI == NULL) {
   2941                 /* load the unicode data module */
   2942                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
   2943                 if (ucnhash_CAPI == NULL)
   2944                     goto ucnhashError;
   2945             }
   2946             if (*s == '{') {
   2947                 const char *start = s+1;
   2948                 /* look for the closing brace */
   2949                 while (*s != '}' && s < end)
   2950                     s++;
   2951                 if (s > start && s < end && *s == '}') {
   2952                     /* found a name.  look it up in the unicode database */
   2953                     message = "unknown Unicode character name";
   2954                     s++;
   2955                     if (s - start - 1 <= INT_MAX &&
   2956                         ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
   2957                         goto store;
   2958                 }
   2959             }
   2960             goto error;
   2961 
   2962         default:
   2963             if (s > end) {
   2964                 message = "\\ at end of string";
   2965                 s--;
   2966                 goto error;
   2967             }
   2968             else {
   2969                 *p++ = '\\';
   2970                 *p++ = (unsigned char)s[-1];
   2971             }
   2972             break;
   2973         }
   2974         continue;
   2975 
   2976       error:
   2977         endinpos = s-starts;
   2978         outpos = p-PyUnicode_AS_UNICODE(v);
   2979         if (unicode_decode_call_errorhandler(
   2980                 errors, &errorHandler,
   2981                 "unicodeescape", message,
   2982                 starts, size, &startinpos, &endinpos, &exc, &s,
   2983                 &v, &outpos, &p))
   2984             goto onError;
   2985         continue;
   2986     }
   2987     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
   2988         goto onError;
   2989     Py_XDECREF(errorHandler);
   2990     Py_XDECREF(exc);
   2991     return (PyObject *)v;
   2992 
   2993   ucnhashError:
   2994     PyErr_SetString(
   2995         PyExc_UnicodeError,
   2996         "\\N escapes not supported (can't load unicodedata module)"
   2997         );
   2998     Py_XDECREF(v);
   2999     Py_XDECREF(errorHandler);
   3000     Py_XDECREF(exc);
   3001     return NULL;
   3002 
   3003   onError:
   3004     Py_XDECREF(v);
   3005     Py_XDECREF(errorHandler);
   3006     Py_XDECREF(exc);
   3007     return NULL;
   3008 }
   3009 
   /* unicodeescape_string(), defined after the findchar() helper below,
      returns a Unicode-Escape string version of the Unicode object.

      If quotes is true, the string is enclosed in u"" or u'' quotes as
      appropriate.

   */
   3016 
   3017 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
   3018                                              Py_ssize_t size,
   3019                                              Py_UNICODE ch)
   3020 {
   3021     /* like wcschr, but doesn't stop at NULL characters */
   3022 
   3023     while (size-- > 0) {
   3024         if (*s == ch)
   3025             return s;
   3026         s++;
   3027     }
   3028 
   3029     return NULL;
   3030 }
   3031 
   3032 static
   3033 PyObject *unicodeescape_string(const Py_UNICODE *s,
   3034                                Py_ssize_t size,
   3035                                int quotes)
   3036 {
   3037     PyObject *repr;
   3038     char *p;
   3039 
   3040     static const char *hexdigit = "0123456789abcdef";
   3041 #ifdef Py_UNICODE_WIDE
   3042     const Py_ssize_t expandsize = 10;
   3043 #else
   3044     const Py_ssize_t expandsize = 6;
   3045 #endif
   3046 
   3047     /* XXX(nnorwitz): rather than over-allocating, it would be
   3048        better to choose a different scheme.  Perhaps scan the
   3049        first N-chars of the string and allocate based on that size.
   3050     */
   3051     /* Initial allocation is based on the longest-possible unichr
   3052        escape.
   3053 
   3054        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
   3055        unichr, so in this case it's the longest unichr escape. In
   3056        narrow (UTF-16) builds this is five chars per source unichr
   3057        since there are two unichrs in the surrogate pair, so in narrow
   3058        (UTF-16) builds it's not the longest unichr escape.
   3059 
   3060        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
   3061        so in the narrow (UTF-16) build case it's the longest unichr
   3062        escape.
   3063     */
   3064 
   3065     if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
   3066         return PyErr_NoMemory();
   3067 
   3068     repr = PyString_FromStringAndSize(NULL,
   3069                                       2
   3070                                       + expandsize*size
   3071                                       + 1);
   3072     if (repr == NULL)
   3073         return NULL;
   3074 
   3075     p = PyString_AS_STRING(repr);
   3076 
   3077     if (quotes) {
   3078         *p++ = 'u';
   3079         *p++ = (findchar(s, size, '\'') &&
   3080                 !findchar(s, size, '"')) ? '"' : '\'';
   3081     }
   3082     while (size-- > 0) {
   3083         Py_UNICODE ch = *s++;
   3084 
   3085         /* Escape quotes and backslashes */
   3086         if ((quotes &&
   3087              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
   3088             *p++ = '\\';
   3089             *p++ = (char) ch;
   3090             continue;
   3091         }
   3092 
   3093 #ifdef Py_UNICODE_WIDE
   3094         /* Map 21-bit characters to '\U00xxxxxx' */
   3095         else if (ch >= 0x10000) {
   3096             *p++ = '\\';
   3097             *p++ = 'U';
   3098             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
   3099             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
   3100             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
   3101             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
   3102             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
   3103             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
   3104             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
   3105             *p++ = hexdigit[ch & 0x0000000F];
   3106             continue;
   3107         }
   3108 #else
   3109         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
   3110         else if (ch >= 0xD800 && ch < 0xDC00) {
   3111             Py_UNICODE ch2;
   3112             Py_UCS4 ucs;
   3113 
   3114             ch2 = *s++;
   3115             size--;
   3116             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
   3117                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
   3118                 *p++ = '\\';
   3119                 *p++ = 'U';
   3120                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
   3121                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
   3122                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
   3123                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
   3124                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
   3125                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
   3126                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
   3127                 *p++ = hexdigit[ucs & 0x0000000F];
   3128                 continue;
   3129             }
   3130             /* Fall through: isolated surrogates are copied as-is */
   3131             s--;
   3132             size++;
   3133         }
   3134 #endif
   3135 
   3136         /* Map 16-bit characters to '\uxxxx' */
   3137         if (ch >= 256) {
   3138             *p++ = '\\';
   3139             *p++ = 'u';
   3140             *p++ = hexdigit[(ch >> 12) & 0x000F];
   3141             *p++ = hexdigit[(ch >> 8) & 0x000F];
   3142             *p++ = hexdigit[(ch >> 4) & 0x000F];
   3143             *p++ = hexdigit[ch & 0x000F];
   3144         }
   3145 
   3146         /* Map special whitespace to '\t', \n', '\r' */
   3147         else if (ch == '\t') {
   3148             *p++ = '\\';
   3149             *p++ = 't';
   3150         }
   3151         else if (ch == '\n') {
   3152             *p++ = '\\';
   3153             *p++ = 'n';
   3154         }
   3155         else if (ch == '\r') {
   3156             *p++ = '\\';
   3157             *p++ = 'r';
   3158         }
   3159 
   3160         /* Map non-printable US ASCII to '\xhh' */
   3161         else if (ch < ' ' || ch >= 0x7F) {
   3162             *p++ = '\\';
   3163             *p++ = 'x';
   3164             *p++ = hexdigit[(ch >> 4) & 0x000F];
   3165             *p++ = hexdigit[ch & 0x000F];
   3166         }
   3167 
   3168         /* Copy everything else as-is */
   3169         else
   3170             *p++ = (char) ch;
   3171     }
   3172     if (quotes)
   3173         *p++ = PyString_AS_STRING(repr)[1];
   3174 
   3175     *p = '\0';
   3176     if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
   3177         return NULL;
   3178     return repr;
   3179 }
   3180 
   3181 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
   3182                                         Py_ssize_t size)
   3183 {
   3184     return unicodeescape_string(s, size, 0);
   3185 }
   3186 
   3187 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
   3188 {
   3189     if (!PyUnicode_Check(unicode)) {
   3190         PyErr_BadArgument();
   3191         return NULL;
   3192     }
   3193     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
   3194                                          PyUnicode_GET_SIZE(unicode));
   3195 }
   3196 
   3197 /* --- Raw Unicode Escape Codec ------------------------------------------- */
   3198 
   3199 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
   3200                                            Py_ssize_t size,
   3201                                            const char *errors)
   3202 {
   3203     const char *starts = s;
   3204     Py_ssize_t startinpos;
   3205     Py_ssize_t endinpos;
   3206     Py_ssize_t outpos;
   3207     PyUnicodeObject *v;
   3208     Py_UNICODE *p;
   3209     const char *end;
   3210     const char *bs;
   3211     PyObject *errorHandler = NULL;
   3212     PyObject *exc = NULL;
   3213 
   3214     /* Escaped strings will always be longer than the resulting
   3215        Unicode string, so we start with size here and then reduce the
   3216        length after conversion to the true value. (But decoding error
   3217        handler might have to resize the string) */
   3218     v = _PyUnicode_New(size);
   3219     if (v == NULL)
   3220         goto onError;
   3221     if (size == 0)
   3222         return (PyObject *)v;
   3223     p = PyUnicode_AS_UNICODE(v);
   3224     end = s + size;
   3225     while (s < end) {
   3226         unsigned char c;
   3227         Py_UCS4 x;
   3228         int i;
   3229         int count;
   3230 
   3231         /* Non-escape characters are interpreted as Unicode ordinals */
   3232         if (*s != '\\') {
   3233             *p++ = (unsigned char)*s++;
   3234             continue;
   3235         }
   3236         startinpos = s-starts;
   3237 
   3238         /* \u-escapes are only interpreted iff the number of leading
   3239            backslashes if odd */
   3240         bs = s;
   3241         for (;s < end;) {
   3242             if (*s != '\\')
   3243                 break;
   3244             *p++ = (unsigned char)*s++;
   3245         }
   3246         if (((s - bs) & 1) == 0 ||
   3247             s >= end ||
   3248             (*s != 'u' && *s != 'U')) {
   3249             continue;
   3250         }
   3251         p--;
   3252         count = *s=='u' ? 4 : 8;
   3253         s++;
   3254 
   3255         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
   3256         outpos = p-PyUnicode_AS_UNICODE(v);
   3257         for (x = 0, i = 0; i < count; ++i, ++s) {
   3258             c = (unsigned char)*s;
   3259             if (!isxdigit(c)) {
   3260                 endinpos = s-starts;
   3261                 if (unicode_decode_call_errorhandler(
   3262                         errors, &errorHandler,
   3263                         "rawunicodeescape", "truncated \\uXXXX",
   3264                         starts, size, &startinpos, &endinpos, &exc, &s,
   3265                         &v, &outpos, &p))
   3266                     goto onError;
   3267                 goto nextByte;
   3268             }
   3269             x = (x<<4) & ~0xF;
   3270             if (c >= '0' && c <= '9')
   3271                 x += c - '0';
   3272             else if (c >= 'a' && c <= 'f')
   3273                 x += 10 + c - 'a';
   3274             else
   3275                 x += 10 + c - 'A';
   3276         }
   3277         if (x <= 0xffff)
   3278             /* UCS-2 character */
   3279             *p++ = (Py_UNICODE) x;
   3280         else if (x <= 0x10ffff) {
   3281             /* UCS-4 character. Either store directly, or as
   3282                surrogate pair. */
   3283 #ifdef Py_UNICODE_WIDE
   3284             *p++ = (Py_UNICODE) x;
   3285 #else
   3286             x -= 0x10000L;
   3287             *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
   3288             *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
   3289 #endif
   3290         } else {
   3291             endinpos = s-starts;
   3292             outpos = p-PyUnicode_AS_UNICODE(v);
   3293             if (unicode_decode_call_errorhandler(
   3294                     errors, &errorHandler,
   3295                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
   3296                     starts, size, &startinpos, &endinpos, &exc, &s,
   3297                     &v, &outpos, &p))
   3298                 goto onError;
   3299         }
   3300       nextByte:
   3301         ;
   3302     }
   3303     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
   3304         goto onError;
   3305     Py_XDECREF(errorHandler);
   3306     Py_XDECREF(exc);
   3307     return (PyObject *)v;
   3308 
   3309   onError:
   3310     Py_XDECREF(v);
   3311     Py_XDECREF(errorHandler);
   3312     Py_XDECREF(exc);
   3313     return NULL;
   3314 }
   3315 
   3316 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
   3317                                            Py_ssize_t size)
   3318 {
   3319     PyObject *repr;
   3320     char *p;
   3321     char *q;
   3322 
   3323     static const char *hexdigit = "0123456789abcdef";
   3324 #ifdef Py_UNICODE_WIDE
   3325     const Py_ssize_t expandsize = 10;
   3326 #else
   3327     const Py_ssize_t expandsize = 6;
   3328 #endif
   3329 
   3330     if (size > PY_SSIZE_T_MAX / expandsize)
   3331         return PyErr_NoMemory();
   3332 
   3333     repr = PyString_FromStringAndSize(NULL, expandsize * size);
   3334     if (repr == NULL)
   3335         return NULL;
   3336     if (size == 0)
   3337         return repr;
   3338 
   3339     p = q = PyString_AS_STRING(repr);
   3340     while (size-- > 0) {
   3341         Py_UNICODE ch = *s++;
   3342 #ifdef Py_UNICODE_WIDE
   3343         /* Map 32-bit characters to '\Uxxxxxxxx' */
   3344         if (ch >= 0x10000) {
   3345             *p++ = '\\';
   3346             *p++ = 'U';
   3347             *p++ = hexdigit[(ch >> 28) & 0xf];
   3348             *p++ = hexdigit[(ch >> 24) & 0xf];
   3349             *p++ = hexdigit[(ch >> 20) & 0xf];
   3350             *p++ = hexdigit[(ch >> 16) & 0xf];
   3351             *p++ = hexdigit[(ch >> 12) & 0xf];
   3352             *p++ = hexdigit[(ch >> 8) & 0xf];
   3353             *p++ = hexdigit[(ch >> 4) & 0xf];
   3354             *p++ = hexdigit[ch & 15];
   3355         }
   3356         else
   3357 #else
   3358             /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
   3359             if (ch >= 0xD800 && ch < 0xDC00) {
   3360                 Py_UNICODE ch2;
   3361                 Py_UCS4 ucs;
   3362 
   3363                 ch2 = *s++;
   3364                 size--;
   3365                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
   3366                     ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
   3367                     *p++ = '\\';
   3368                     *p++ = 'U';
   3369                     *p++ = hexdigit[(ucs >> 28) & 0xf];
   3370                     *p++ = hexdigit[(ucs >> 24) & 0xf];
   3371                     *p++ = hexdigit[(ucs >> 20) & 0xf];
   3372                     *p++ = hexdigit[(ucs >> 16) & 0xf];
   3373                     *p++ = hexdigit[(ucs >> 12) & 0xf];
   3374                     *p++ = hexdigit[(ucs >> 8) & 0xf];
   3375                     *p++ = hexdigit[(ucs >> 4) & 0xf];
   3376                     *p++ = hexdigit[ucs & 0xf];
   3377                     continue;
   3378                 }
   3379                 /* Fall through: isolated surrogates are copied as-is */
   3380                 s--;
   3381                 size++;
   3382             }
   3383 #endif
   3384         /* Map 16-bit characters to '\uxxxx' */
   3385         if (ch >= 256) {
   3386             *p++ = '\\';
   3387             *p++ = 'u';
   3388             *p++ = hexdigit[(ch >> 12) & 0xf];
   3389             *p++ = hexdigit[(ch >> 8) & 0xf];
   3390             *p++ = hexdigit[(ch >> 4) & 0xf];
   3391             *p++ = hexdigit[ch & 15];
   3392         }
   3393         /* Copy everything else as-is */
   3394         else
   3395             *p++ = (char) ch;
   3396     }
   3397     *p = '\0';
   3398     if (_PyString_Resize(&repr, p - q))
   3399         return NULL;
   3400     return repr;
   3401 }
   3402 
   3403 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
   3404 {
   3405     if (!PyUnicode_Check(unicode)) {
   3406         PyErr_BadArgument();
   3407         return NULL;
   3408     }
   3409     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
   3410                                             PyUnicode_GET_SIZE(unicode));
   3411 }
   3412 
   3413 /* --- Unicode Internal Codec ------------------------------------------- */
   3414 
   3415 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
   3416                                            Py_ssize_t size,
   3417                                            const char *errors)
   3418 {
   3419     const char *starts = s;
   3420     Py_ssize_t startinpos;
   3421     Py_ssize_t endinpos;
   3422     Py_ssize_t outpos;
   3423     PyUnicodeObject *v;
   3424     Py_UNICODE *p;
   3425     const char *end;
   3426     const char *reason;
   3427     PyObject *errorHandler = NULL;
   3428     PyObject *exc = NULL;
   3429 
   3430 #ifdef Py_UNICODE_WIDE
   3431     Py_UNICODE unimax = PyUnicode_GetMax();
   3432 #endif
   3433 
   3434     /* XXX overflow detection missing */
   3435     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
   3436     if (v == NULL)
   3437         goto onError;
   3438     if (PyUnicode_GetSize((PyObject *)v) == 0)
   3439         return (PyObject *)v;
   3440     p = PyUnicode_AS_UNICODE(v);
   3441     end = s + size;
   3442 
   3443     while (s < end) {
   3444         if (end-s < Py_UNICODE_SIZE) {
   3445             endinpos = end-starts;
   3446             reason = "truncated input";
   3447             goto error;
   3448         }
   3449         memcpy(p, s, sizeof(Py_UNICODE));
   3450 #ifdef Py_UNICODE_WIDE
   3451         /* We have to sanity check the raw data, otherwise doom looms for
   3452            some malformed UCS-4 data. */
   3453         if (*p > unimax || *p < 0) {
   3454             endinpos = s - starts + Py_UNICODE_SIZE;
   3455             reason = "illegal code point (> 0x10FFFF)";
   3456             goto error;
   3457         }
   3458 #endif
   3459         p++;
   3460         s += Py_UNICODE_SIZE;
   3461         continue;
   3462 
   3463   error:
   3464         startinpos = s - starts;
   3465         outpos = p - PyUnicode_AS_UNICODE(v);
   3466         if (unicode_decode_call_errorhandler(
   3467                 errors, &errorHandler,
   3468                 "unicode_internal", reason,
   3469                 starts, size, &startinpos, &endinpos, &exc, &s,
   3470                 &v, &outpos, &p)) {
   3471             goto onError;
   3472         }
   3473     }
   3474 
   3475     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
   3476         goto onError;
   3477     Py_XDECREF(errorHandler);
   3478     Py_XDECREF(exc);
   3479     return (PyObject *)v;
   3480 
   3481   onError:
   3482     Py_XDECREF(v);
   3483     Py_XDECREF(errorHandler);
   3484     Py_XDECREF(exc);
   3485     return NULL;
   3486 }
   3487 
   3488 /* --- Latin-1 Codec ------------------------------------------------------ */
   3489 
   3490 PyObject *PyUnicode_DecodeLatin1(const char *s,
   3491                                  Py_ssize_t size,
   3492                                  const char *errors)
   3493 {
   3494     PyUnicodeObject *v;
   3495     Py_UNICODE *p;
   3496 
   3497     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
   3498     if (size == 1) {
   3499         Py_UNICODE r = *(unsigned char*)s;
   3500         return PyUnicode_FromUnicode(&r, 1);
   3501     }
   3502 
   3503     v = _PyUnicode_New(size);
   3504     if (v == NULL)
   3505         goto onError;
   3506     if (size == 0)
   3507         return (PyObject *)v;
   3508     p = PyUnicode_AS_UNICODE(v);
   3509     while (size-- > 0)
   3510         *p++ = (unsigned char)*s++;
   3511     return (PyObject *)v;
   3512 
   3513   onError:
   3514     Py_XDECREF(v);
   3515     return NULL;
   3516 }
   3517 
   3518 /* create or adjust a UnicodeEncodeError */
   3519 static void make_encode_exception(PyObject **exceptionObject,
   3520                                   const char *encoding,
   3521                                   const Py_UNICODE *unicode, Py_ssize_t size,
   3522                                   Py_ssize_t startpos, Py_ssize_t endpos,
   3523                                   const char *reason)
   3524 {
   3525     if (*exceptionObject == NULL) {
   3526         *exceptionObject = PyUnicodeEncodeError_Create(
   3527             encoding, unicode, size, startpos, endpos, reason);
   3528     }
   3529     else {
   3530         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
   3531             goto onError;
   3532         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
   3533             goto onError;
   3534         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
   3535             goto onError;
   3536         return;
   3537       onError:
   3538         Py_CLEAR(*exceptionObject);
   3539     }
   3540 }
   3541 
   3542 /* raises a UnicodeEncodeError */
   3543 static void raise_encode_exception(PyObject **exceptionObject,
   3544                                    const char *encoding,
   3545                                    const Py_UNICODE *unicode, Py_ssize_t size,
   3546                                    Py_ssize_t startpos, Py_ssize_t endpos,
   3547                                    const char *reason)
   3548 {
   3549     make_encode_exception(exceptionObject,
   3550                           encoding, unicode, size, startpos, endpos, reason);
   3551     if (*exceptionObject != NULL)
   3552         PyCodec_StrictErrors(*exceptionObject);
   3553 }
   3554 
   3555 /* error handling callback helper:
   3556    build arguments, call the callback and check the arguments,
   3557    put the result into newpos and return the replacement string, which
   3558    has to be freed by the caller */
   3559 static PyObject *unicode_encode_call_errorhandler(const char *errors,
   3560                                                   PyObject **errorHandler,
   3561                                                   const char *encoding, const char *reason,
   3562                                                   const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
   3563                                                   Py_ssize_t startpos, Py_ssize_t endpos,
   3564                                                   Py_ssize_t *newpos)
   3565 {
   3566     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
   3567 
   3568     PyObject *restuple;
   3569     PyObject *resunicode;
   3570 
   3571     if (*errorHandler == NULL) {
   3572         *errorHandler = PyCodec_LookupError(errors);
   3573         if (*errorHandler == NULL)
   3574             return NULL;
   3575     }
   3576 
   3577     make_encode_exception(exceptionObject,
   3578                           encoding, unicode, size, startpos, endpos, reason);
   3579     if (*exceptionObject == NULL)
   3580         return NULL;
   3581 
   3582     restuple = PyObject_CallFunctionObjArgs(
   3583         *errorHandler, *exceptionObject, NULL);
   3584     if (restuple == NULL)
   3585         return NULL;
   3586     if (!PyTuple_Check(restuple)) {
   3587         PyErr_SetString(PyExc_TypeError, &argparse[4]);
   3588         Py_DECREF(restuple);
   3589         return NULL;
   3590     }
   3591     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
   3592                           &resunicode, newpos)) {
   3593         Py_DECREF(restuple);
   3594         return NULL;
   3595     }
   3596     if (*newpos<0)
   3597         *newpos = size+*newpos;
   3598     if (*newpos<0 || *newpos>size) {
   3599         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
   3600         Py_DECREF(restuple);
   3601         return NULL;
   3602     }
   3603     Py_INCREF(resunicode);
   3604     Py_DECREF(restuple);
   3605     return resunicode;
   3606 }
   3607 
   3608 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
   3609                                      Py_ssize_t size,
   3610                                      const char *errors,
   3611                                      int limit)
   3612 {
   3613     /* output object */
   3614     PyObject *res;
   3615     /* pointers to the beginning and end+1 of input */
   3616     const Py_UNICODE *startp = p;
   3617     const Py_UNICODE *endp = p + size;
   3618     /* pointer to the beginning of the unencodable characters */
   3619     /* const Py_UNICODE *badp = NULL; */
   3620     /* pointer into the output */
   3621     char *str;
   3622     /* current output position */
   3623     Py_ssize_t respos = 0;
   3624     Py_ssize_t ressize;
   3625     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
   3626     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
   3627     PyObject *errorHandler = NULL;
   3628     PyObject *exc = NULL;
   3629     /* the following variable is used for caching string comparisons
   3630      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
   3631     int known_errorHandler = -1;
   3632 
   3633     /* allocate enough for a simple encoding without
   3634        replacements, if we need more, we'll resize */
   3635     res = PyString_FromStringAndSize(NULL, size);
   3636     if (res == NULL)
   3637         goto onError;
   3638     if (size == 0)
   3639         return res;
   3640     str = PyString_AS_STRING(res);
   3641     ressize = size;
   3642 
   3643     while (p<endp) {
   3644         Py_UNICODE c = *p;
   3645 
   3646         /* can we encode this? */
   3647         if (c<limit) {
   3648             /* no overflow check, because we know that the space is enough */
   3649             *str++ = (char)c;
   3650             ++p;
   3651         }
   3652         else {
   3653             Py_ssize_t unicodepos = p-startp;
   3654             Py_ssize_t requiredsize;
   3655             PyObject *repunicode;
   3656             Py_ssize_t repsize;
   3657             Py_ssize_t newpos;
   3658             Py_ssize_t respos;
   3659             Py_UNICODE *uni2;
   3660             /* startpos for collecting unencodable chars */
   3661             const Py_UNICODE *collstart = p;
   3662             const Py_UNICODE *collend = p;
   3663             /* find all unecodable characters */
   3664             while ((collend < endp) && ((*collend) >= limit))
   3665                 ++collend;
   3666             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
   3667             if (known_errorHandler==-1) {
   3668                 if ((errors==NULL) || (!strcmp(errors, "strict")))
   3669                     known_errorHandler = 1;
   3670                 else if (!strcmp(errors, "replace"))
   3671                     known_errorHandler = 2;
   3672                 else if (!strcmp(errors, "ignore"))
   3673                     known_errorHandler = 3;
   3674                 else if (!strcmp(errors, "xmlcharrefreplace"))
   3675                     known_errorHandler = 4;
   3676                 else
   3677                     known_errorHandler = 0;
   3678             }
   3679             switch (known_errorHandler) {
   3680             case 1: /* strict */
   3681                 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
   3682                 goto onError;
   3683             case 2: /* replace */
   3684                 while (collstart++ < collend)
   3685                     *str++ = '?'; /* fall through */
   3686             case 3: /* ignore */
   3687                 p = collend;
   3688                 break;
   3689             case 4: /* xmlcharrefreplace */
   3690                 respos = str - PyString_AS_STRING(res);
   3691                 /* determine replacement size (temporarily (mis)uses p) */
   3692                 requiredsize = respos;
   3693                 for (p = collstart; p < collend;) {
   3694                     Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
   3695                     Py_ssize_t incr;
   3696                     if (ch < 10)
   3697                         incr = 2+1+1;
   3698                     else if (ch < 100)
   3699                         incr = 2+2+1;
   3700                     else if (ch < 1000)
   3701                         incr = 2+3+1;
   3702                     else if (ch < 10000)
   3703                         incr = 2+4+1;
   3704                     else if (ch < 100000)
   3705                         incr = 2+5+1;
   3706                     else if (ch < 1000000)
   3707                         incr = 2+6+1;
   3708                     else
   3709                         incr = 2+7+1;
   3710                     if (requiredsize > PY_SSIZE_T_MAX - incr)
   3711                         goto overflow;
   3712                     requiredsize += incr;
   3713                 }
   3714                 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
   3715                     goto overflow;
   3716                 requiredsize += endp - collend;
   3717                 if (requiredsize > ressize) {
   3718                     if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
   3719                         requiredsize = 2*ressize;
   3720                     if (_PyString_Resize(&res, requiredsize))
   3721                         goto onError;
   3722                     str = PyString_AS_STRING(res) + respos;
   3723                     ressize = requiredsize;
   3724                 }
   3725                 /* generate replacement (temporarily (mis)uses p) */
   3726                 for (p = collstart; p < collend;) {
   3727                     Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
   3728                     str += sprintf(str, "&#%d;", (int)ch);
   3729                 }
   3730                 p = collend;
   3731                 break;
   3732             default:
   3733                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
   3734                                                               encoding, reason, startp, size, &exc,
   3735                                                               collstart-startp, collend-startp, &newpos);
   3736                 if (repunicode == NULL)
   3737                     goto onError;
   3738                 /* need more space? (at least enough for what we have+the
   3739                    replacement+the rest of the string, so we won't have to
   3740                    check space for encodable characters) */
   3741                 respos = str - PyString_AS_STRING(res);
   3742                 repsize = PyUnicode_GET_SIZE(repunicode);
   3743                 if (respos > PY_SSIZE_T_MAX - repsize)
   3744                     goto overflow;
   3745                 requiredsize = respos + repsize;
   3746                 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
   3747                     goto overflow;
   3748                 requiredsize += endp - collend;
   3749                 if (requiredsize > ressize) {
   3750                     if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
   3751                         requiredsize = 2*ressize;
   3752                     if (_PyString_Resize(&res, requiredsize)) {
   3753                         Py_DECREF(repunicode);
   3754                         goto onError;
   3755                     }
   3756                     str = PyString_AS_STRING(res) + respos;
   3757                     ressize = requiredsize;
   3758                 }
   3759                 /* check if there is anything unencodable in the replacement
   3760                    and copy it to the output */
   3761                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) {
   3762                     c = *uni2;
   3763                     if (c >= limit) {
   3764                         raise_encode_exception(&exc, encoding, startp, size,
   3765                                                unicodepos, unicodepos+1, reason);
   3766                         Py_DECREF(repunicode);
   3767                         goto onError;
   3768                     }
   3769                     *str = (char)c;
   3770                 }
   3771                 p = startp + newpos;
   3772                 Py_DECREF(repunicode);
   3773             }
   3774         }
   3775     }
   3776     /* Resize if we allocated to much */
   3777     respos = str - PyString_AS_STRING(res);
   3778     if (respos < ressize)
   3779         /* If this falls res will be NULL */
   3780         _PyString_Resize(&res, respos);
   3781     Py_XDECREF(errorHandler);
   3782     Py_XDECREF(exc);
   3783     return res;
   3784 
   3785   overflow:
   3786     PyErr_SetString(PyExc_OverflowError,
   3787                     "encoded result is too long for a Python string");
   3788 
   3789   onError:
   3790     Py_XDECREF(res);
   3791     Py_XDECREF(errorHandler);
   3792     Py_XDECREF(exc);
   3793     return NULL;
   3794 }
   3795 
   3796 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
   3797                                  Py_ssize_t size,
   3798                                  const char *errors)
   3799 {
   3800     return unicode_encode_ucs1(p, size, errors, 256);
   3801 }
   3802 
   3803 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
   3804 {
   3805     if (!PyUnicode_Check(unicode)) {
   3806         PyErr_BadArgument();
   3807         return NULL;
   3808     }
   3809     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
   3810                                   PyUnicode_GET_SIZE(unicode),
   3811                                   NULL);
   3812 }
   3813 
   3814 /* --- 7-bit ASCII Codec -------------------------------------------------- */
   3815 
   3816 PyObject *PyUnicode_DecodeASCII(const char *s,
   3817                                 Py_ssize_t size,
   3818                                 const char *errors)
   3819 {
   3820     const char *starts = s;
   3821     PyUnicodeObject *v;
   3822     Py_UNICODE *p;
   3823     Py_ssize_t startinpos;
   3824     Py_ssize_t endinpos;
   3825     Py_ssize_t outpos;
   3826     const char *e;
   3827     PyObject *errorHandler = NULL;
   3828     PyObject *exc = NULL;
   3829 
   3830     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
   3831     if (size == 1 && *(unsigned char*)s < 128) {
   3832         Py_UNICODE r = *(unsigned char*)s;
   3833         return PyUnicode_FromUnicode(&r, 1);
   3834     }
   3835 
   3836     v = _PyUnicode_New(size);
   3837     if (v == NULL)
   3838         goto onError;
   3839     if (size == 0)
   3840         return (PyObject *)v;
   3841     p = PyUnicode_AS_UNICODE(v);
   3842     e = s + size;
   3843     while (s < e) {
   3844         register unsigned char c = (unsigned char)*s;
   3845         if (c < 128) {
   3846             *p++ = c;
   3847             ++s;
   3848         }
   3849         else {
   3850             startinpos = s-starts;
   3851             endinpos = startinpos + 1;
   3852             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
   3853             if (unicode_decode_call_errorhandler(
   3854                     errors, &errorHandler,
   3855                     "ascii", "ordinal not in range(128)",
   3856                     starts, size, &startinpos, &endinpos, &exc, &s,
   3857                     &v, &outpos, &p))
   3858                 goto onError;
   3859         }
   3860     }
   3861     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
   3862         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
   3863             goto onError;
   3864     Py_XDECREF(errorHandler);
   3865     Py_XDECREF(exc);
   3866     return (PyObject *)v;
   3867 
   3868   onError:
   3869     Py_XDECREF(v);
   3870     Py_XDECREF(errorHandler);
   3871     Py_XDECREF(exc);
   3872     return NULL;
   3873 }
   3874 
   3875 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
   3876                                 Py_ssize_t size,
   3877                                 const char *errors)
   3878 {
   3879     return unicode_encode_ucs1(p, size, errors, 128);
   3880 }
   3881 
   3882 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
   3883 {
   3884     if (!PyUnicode_Check(unicode)) {
   3885         PyErr_BadArgument();
   3886         return NULL;
   3887     }
   3888     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
   3889                                  PyUnicode_GET_SIZE(unicode),
   3890                                  NULL);
   3891 }
   3892 
   3893 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
   3894 
   3895 /* --- MBCS codecs for Windows -------------------------------------------- */
   3896 
   3897 #if SIZEOF_INT < SIZEOF_SIZE_T
   3898 #define NEED_RETRY
   3899 #endif
   3900 
   3901 /* XXX This code is limited to "true" double-byte encodings, as
   3902    a) it assumes an incomplete character consists of a single byte, and
   3903    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
   3904    encodings, see IsDBCSLeadByteEx documentation. */
   3905 
   3906 static int is_dbcs_lead_byte(const char *s, int offset)
   3907 {
   3908     const char *curr = s + offset;
   3909 
   3910     if (IsDBCSLeadByte(*curr)) {
   3911         const char *prev = CharPrev(s, curr);
   3912         return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
   3913     }
   3914     return 0;
   3915 }
   3916 
   3917 /*
   3918  * Decode MBCS string into unicode object. If 'final' is set, converts
   3919  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
   3920  */
   3921 static int decode_mbcs(PyUnicodeObject **v,
   3922                        const char *s, /* MBCS string */
   3923                        int size, /* sizeof MBCS string */
   3924                        int final)
   3925 {
   3926     Py_UNICODE *p;
   3927     Py_ssize_t n = 0;
   3928     int usize = 0;
   3929 
   3930     assert(size >= 0);
   3931 
   3932     /* Skip trailing lead-byte unless 'final' is set */
   3933     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
   3934         --size;
   3935 
   3936     /* First get the size of the result */
   3937     if (size > 0) {
   3938         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
   3939         if (usize == 0) {
   3940             PyErr_SetFromWindowsErrWithFilename(0, NULL);
   3941             return -1;
   3942         }
   3943     }
   3944 
   3945     if (*v == NULL) {
   3946         /* Create unicode object */
   3947         *v = _PyUnicode_New(usize);
   3948         if (*v == NULL)
   3949             return -1;
   3950     }
   3951     else {
   3952         /* Extend unicode object */
   3953         n = PyUnicode_GET_SIZE(*v);
   3954         if (_PyUnicode_Resize(v, n + usize) < 0)
   3955             return -1;
   3956     }
   3957 
   3958     /* Do the conversion */
   3959     if (size > 0) {
   3960         p = PyUnicode_AS_UNICODE(*v) + n;
   3961         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
   3962             PyErr_SetFromWindowsErrWithFilename(0, NULL);
   3963             return -1;
   3964         }
   3965     }
   3966 
   3967     return size;
   3968 }
   3969 
   3970 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
   3971                                        Py_ssize_t size,
   3972                                        const char *errors,
   3973                                        Py_ssize_t *consumed)
   3974 {
   3975     PyUnicodeObject *v = NULL;
   3976     int done;
   3977 
   3978     if (consumed)
   3979         *consumed = 0;
   3980 
   3981 #ifdef NEED_RETRY
   3982   retry:
   3983     if (size > INT_MAX)
   3984         done = decode_mbcs(&v, s, INT_MAX, 0);
   3985     else
   3986 #endif
   3987         done = decode_mbcs(&v, s, (int)size, !consumed);
   3988 
   3989     if (done < 0) {
   3990         Py_XDECREF(v);
   3991         return NULL;
   3992     }
   3993 
   3994     if (consumed)
   3995         *consumed += done;
   3996 
   3997 #ifdef NEED_RETRY
   3998     if (size > INT_MAX) {
   3999         s += done;
   4000         size -= done;
   4001         goto retry;
   4002     }
   4003 #endif
   4004 
   4005     return (PyObject *)v;
   4006 }
   4007 
   4008 PyObject *PyUnicode_DecodeMBCS(const char *s,
   4009                                Py_ssize_t size,
   4010                                const char *errors)
   4011 {
   4012     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
   4013 }
   4014 
   4015 /*
   4016  * Convert unicode into string object (MBCS).
   4017  * Returns 0 if succeed, -1 otherwise.
   4018  */
   4019 static int encode_mbcs(PyObject **repr,
   4020                        const Py_UNICODE *p, /* unicode */
   4021                        int size) /* size of unicode */
   4022 {
   4023     int mbcssize = 0;
   4024     Py_ssize_t n = 0;
   4025 
   4026     assert(size >= 0);
   4027 
   4028     /* First get the size of the result */
   4029     if (size > 0) {
   4030         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
   4031         if (mbcssize == 0) {
   4032             PyErr_SetFromWindowsErrWithFilename(0, NULL);
   4033             return -1;
   4034         }
   4035     }
   4036 
   4037     if (*repr == NULL) {
   4038         /* Create string object */
   4039         *repr = PyString_FromStringAndSize(NULL, mbcssize);
   4040         if (*repr == NULL)
   4041             return -1;
   4042     }
   4043     else {
   4044         /* Extend string object */
   4045         n = PyString_Size(*repr);
   4046         if (_PyString_Resize(repr, n + mbcssize) < 0)
   4047             return -1;
   4048     }
   4049 
   4050     /* Do the conversion */
   4051     if (size > 0) {
   4052         char *s = PyString_AS_STRING(*repr) + n;
   4053         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
   4054             PyErr_SetFromWindowsErrWithFilename(0, NULL);
   4055             return -1;
   4056         }
   4057     }
   4058 
   4059     return 0;
   4060 }
   4061 
   4062 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
   4063                                Py_ssize_t size,
   4064                                const char *errors)
   4065 {
   4066     PyObject *repr = NULL;
   4067     int ret;
   4068 
   4069 #ifdef NEED_RETRY
   4070   retry:
   4071     if (size > INT_MAX)
   4072         ret = encode_mbcs(&repr, p, INT_MAX);
   4073     else
   4074 #endif
   4075         ret = encode_mbcs(&repr, p, (int)size);
   4076 
   4077     if (ret < 0) {
   4078         Py_XDECREF(repr);
   4079         return NULL;
   4080     }
   4081 
   4082 #ifdef NEED_RETRY
   4083     if (size > INT_MAX) {
   4084         p += INT_MAX;
   4085         size -= INT_MAX;
   4086         goto retry;
   4087     }
   4088 #endif
   4089 
   4090     return repr;
   4091 }
   4092 
   4093 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
   4094 {
   4095     if (!PyUnicode_Check(unicode)) {
   4096         PyErr_BadArgument();
   4097         return NULL;
   4098     }
   4099     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
   4100                                 PyUnicode_GET_SIZE(unicode),
   4101                                 NULL);
   4102 }
   4103 
   4104 #undef NEED_RETRY
   4105 
   4106 #endif /* MS_WINDOWS */
   4107 
   4108 /* --- Character Mapping Codec -------------------------------------------- */
   4109 
   4110 PyObject *PyUnicode_DecodeCharmap(const char *s,
   4111                                   Py_ssize_t size,
   4112                                   PyObject *mapping,
   4113                                   const char *errors)
   4114 {
   4115     const char *starts = s;
   4116     Py_ssize_t startinpos;
   4117     Py_ssize_t endinpos;
   4118     Py_ssize_t outpos;
   4119     const char *e;
   4120     PyUnicodeObject *v;
   4121     Py_UNICODE *p;
   4122     Py_ssize_t extrachars = 0;
   4123     PyObject *errorHandler = NULL;
   4124     PyObject *exc = NULL;
   4125     Py_UNICODE *mapstring = NULL;
   4126     Py_ssize_t maplen = 0;
   4127 
   4128     /* Default to Latin-1 */
   4129     if (mapping == NULL)
   4130         return PyUnicode_DecodeLatin1(s, size, errors);
   4131 
   4132     v = _PyUnicode_New(size);
   4133     if (v == NULL)
   4134         goto onError;
   4135     if (size == 0)
   4136         return (PyObject *)v;
   4137     p = PyUnicode_AS_UNICODE(v);
   4138     e = s + size;
   4139     if (PyUnicode_CheckExact(mapping)) {
   4140         mapstring = PyUnicode_AS_UNICODE(mapping);
   4141         maplen = PyUnicode_GET_SIZE(mapping);
   4142         while (s < e) {
   4143             unsigned char ch = *s;
   4144             Py_UNICODE x = 0xfffe; /* illegal value */
   4145 
   4146             if (ch < maplen)
   4147                 x = mapstring[ch];
   4148 
   4149             if (x == 0xfffe) {
   4150                 /* undefined mapping */
   4151                 outpos = p-PyUnicode_AS_UNICODE(v);
   4152                 startinpos = s-starts;
   4153                 endinpos = startinpos+1;
   4154                 if (unicode_decode_call_errorhandler(
   4155                         errors, &errorHandler,
   4156                         "charmap", "character maps to <undefined>",
   4157                         starts, size, &startinpos, &endinpos, &exc, &s,
   4158                         &v, &outpos, &p)) {
   4159                     goto onError;
   4160                 }
   4161                 continue;
   4162             }
   4163             *p++ = x;
   4164             ++s;
   4165         }
   4166     }
   4167     else {
   4168         while (s < e) {
   4169             unsigned char ch = *s;
   4170             PyObject *w, *x;
   4171 
   4172             /* Get mapping (char ordinal -> integer, Unicode char or None) */
   4173             w = PyInt_FromLong((long)ch);
   4174             if (w == NULL)
   4175                 goto onError;
   4176             x = PyObject_GetItem(mapping, w);
   4177             Py_DECREF(w);
   4178             if (x == NULL) {
   4179                 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
   4180                     /* No mapping found means: mapping is undefined. */
   4181                     PyErr_Clear();
   4182                     goto Undefined;
   4183                 } else
   4184                     goto onError;
   4185             }
   4186 
   4187             /* Apply mapping */
   4188             if (x == Py_None)
   4189                 goto Undefined;
   4190             if (PyInt_Check(x)) {
   4191                 long value = PyInt_AS_LONG(x);
   4192                 if (value == 0xFFFE)
   4193                     goto Undefined;
   4194                 if (value < 0 || value > 0x10FFFF) {
   4195                     PyErr_SetString(PyExc_TypeError,
   4196                                     "character mapping must be in range(0x110000)");
   4197                     Py_DECREF(x);
   4198                     goto onError;
   4199                 }
   4200 
   4201 #ifndef Py_UNICODE_WIDE
   4202                 if (value > 0xFFFF) {
   4203                     /* see the code for 1-n mapping below */
   4204                     if (extrachars < 2) {
   4205                         /* resize first */
   4206                         Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
   4207                         Py_ssize_t needed = 10 - extrachars;
   4208                         extrachars += needed;
   4209                         /* XXX overflow detection missing */
   4210                         if (_PyUnicode_Resize(&v,
   4211                                               PyUnicode_GET_SIZE(v) + needed) < 0) {
   4212                             Py_DECREF(x);
   4213                             goto onError;
   4214                         }
   4215                         p = PyUnicode_AS_UNICODE(v) + oldpos;
   4216                     }
   4217                     value -= 0x10000;
   4218                     *p++ = 0xD800 | (value >> 10);
   4219                     *p++ = 0xDC00 | (value & 0x3FF);
   4220                     extrachars -= 2;
   4221                 }
   4222                 else
   4223 #endif
   4224                 *p++ = (Py_UNICODE)value;
   4225             }
   4226             else if (PyUnicode_Check(x)) {
   4227                 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
   4228 
   4229                 if (targetsize == 1) {
   4230                     /* 1-1 mapping */
   4231                     Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
   4232                     if (value == 0xFFFE)
   4233                         goto Undefined;
   4234                     *p++ = value;
   4235                 }
   4236                 else if (targetsize > 1) {
   4237                     /* 1-n mapping */
   4238                     if (targetsize > extrachars) {
   4239                         /* resize first */
   4240                         Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
   4241                         Py_ssize_t needed = (targetsize - extrachars) + \
   4242                             (targetsize << 2);
   4243                         extrachars += needed;
   4244                         /* XXX overflow detection missing */
   4245                         if (_PyUnicode_Resize(&v,
   4246                                               PyUnicode_GET_SIZE(v) + needed) < 0) {
   4247                             Py_DECREF(x);
   4248                             goto onError;
   4249                         }
   4250                         p = PyUnicode_AS_UNICODE(v) + oldpos;
   4251                     }
   4252                     Py_UNICODE_COPY(p,
   4253                                     PyUnicode_AS_UNICODE(x),
   4254                                     targetsize);
   4255                     p += targetsize;
   4256                     extrachars -= targetsize;
   4257                 }
   4258                 /* 1-0 mapping: skip the character */
   4259             }
   4260             else {
   4261                 /* wrong return value */
   4262                 PyErr_SetString(PyExc_TypeError,
   4263                                 "character mapping must return integer, None or unicode");
   4264                 Py_DECREF(x);
   4265                 goto onError;
   4266             }
   4267             Py_DECREF(x);
   4268             ++s;
   4269             continue;
   4270 Undefined:
   4271             /* undefined mapping */
   4272             Py_XDECREF(x);
   4273             outpos = p-PyUnicode_AS_UNICODE(v);
   4274             startinpos = s-starts;
   4275             endinpos = startinpos+1;
   4276             if (unicode_decode_call_errorhandler(
   4277                     errors, &errorHandler,
   4278                     "charmap", "character maps to <undefined>",
   4279                     starts, size, &startinpos, &endinpos, &exc, &s,
   4280                     &v, &outpos, &p)) {
   4281                 goto onError;
   4282             }
   4283         }
   4284     }
   4285     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
   4286         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
   4287             goto onError;
   4288     Py_XDECREF(errorHandler);
   4289     Py_XDECREF(exc);
   4290     return (PyObject *)v;
   4291 
   4292   onError:
   4293     Py_XDECREF(errorHandler);
   4294     Py_XDECREF(exc);
   4295     Py_XDECREF(v);
   4296     return NULL;
   4297 }
   4298 
   4299 /* Charmap encoding: the lookup table */
   4300 
   4301 struct encoding_map{
   4302     PyObject_HEAD
   4303     unsigned char level1[32];
   4304     int count2, count3;
   4305     unsigned char level23[1];
   4306 };
   4307 
   4308 static PyObject*
   4309 encoding_map_size(PyObject *obj, PyObject* args)
   4310 {
   4311     struct encoding_map *map = (struct encoding_map*)obj;
   4312     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
   4313                           128*map->count3);
   4314 }
   4315 
   4316 static PyMethodDef encoding_map_methods[] = {
   4317     {"size", encoding_map_size, METH_NOARGS,
   4318      PyDoc_STR("Return the size (in bytes) of this object") },
   4319     { 0 }
   4320 };
   4321 
   4322 static void
   4323 encoding_map_dealloc(PyObject* o)
   4324 {
   4325     PyObject_FREE(o);
   4326 }
   4327 
   4328 static PyTypeObject EncodingMapType = {
   4329     PyVarObject_HEAD_INIT(NULL, 0)
   4330     "EncodingMap",          /*tp_name*/
   4331     sizeof(struct encoding_map),   /*tp_basicsize*/
   4332     0,                      /*tp_itemsize*/
   4333     /* methods */
   4334     encoding_map_dealloc,   /*tp_dealloc*/
   4335     0,                      /*tp_print*/
   4336     0,                      /*tp_getattr*/
   4337     0,                      /*tp_setattr*/
   4338     0,                      /*tp_compare*/
   4339     0,                      /*tp_repr*/
   4340     0,                      /*tp_as_number*/
   4341     0,                      /*tp_as_sequence*/
   4342     0,                      /*tp_as_mapping*/
   4343     0,                      /*tp_hash*/
   4344     0,                      /*tp_call*/
   4345     0,                      /*tp_str*/
   4346     0,                      /*tp_getattro*/
   4347     0,                      /*tp_setattro*/
   4348     0,                      /*tp_as_buffer*/
   4349     Py_TPFLAGS_DEFAULT,     /*tp_flags*/
   4350     0,                      /*tp_doc*/
   4351     0,                      /*tp_traverse*/
   4352     0,                      /*tp_clear*/
   4353     0,                      /*tp_richcompare*/
   4354     0,                      /*tp_weaklistoffset*/
   4355     0,                      /*tp_iter*/
   4356     0,                      /*tp_iternext*/
   4357     encoding_map_methods,   /*tp_methods*/
   4358     0,                      /*tp_members*/
   4359     0,                      /*tp_getset*/
   4360     0,                      /*tp_base*/
   4361     0,                      /*tp_dict*/
   4362     0,                      /*tp_descr_get*/
   4363     0,                      /*tp_descr_set*/
   4364     0,                      /*tp_dictoffset*/
   4365     0,                      /*tp_init*/
   4366     0,                      /*tp_alloc*/
   4367     0,                      /*tp_new*/
   4368     0,                      /*tp_free*/
   4369     0,                      /*tp_is_gc*/
   4370 };
   4371 
   4372 PyObject*
   4373 PyUnicode_BuildEncodingMap(PyObject* string)
   4374 {
   4375     Py_UNICODE *decode;
   4376     PyObject *result;
   4377     struct encoding_map *mresult;
   4378     int i;
   4379     int need_dict = 0;
   4380     unsigned char level1[32];
   4381     unsigned char level2[512];
   4382     unsigned char *mlevel1, *mlevel2, *mlevel3;
   4383     int count2 = 0, count3 = 0;
   4384 
   4385     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
   4386         PyErr_BadArgument();
   4387         return NULL;
   4388     }
   4389     decode = PyUnicode_AS_UNICODE(string);
   4390     memset(level1, 0xFF, sizeof level1);
   4391     memset(level2, 0xFF, sizeof level2);
   4392 
   4393     /* If there isn't a one-to-one mapping of NULL to \0,
   4394        or if there are non-BMP characters, we need to use
   4395        a mapping dictionary. */
   4396     if (decode[0] != 0)
   4397         need_dict = 1;
   4398     for (i = 1; i < 256; i++) {
   4399         int l1, l2;
   4400         if (decode[i] == 0
   4401 #ifdef Py_UNICODE_WIDE
   4402             || decode[i] > 0xFFFF
   4403 #endif
   4404             ) {
   4405             need_dict = 1;
   4406             break;
   4407         }
   4408         if (decode[i] == 0xFFFE)
   4409             /* unmapped character */
   4410             continue;
   4411         l1 = decode[i] >> 11;
   4412         l2 = decode[i] >> 7;
   4413         if (level1[l1] == 0xFF)
   4414             level1[l1] = count2++;
   4415         if (level2[l2] == 0xFF)
   4416             level2[l2] = count3++;
   4417     }
   4418 
   4419     if (count2 >= 0xFF || count3 >= 0xFF)
   4420         need_dict = 1;
   4421 
   4422     if (need_dict) {
   4423         PyObject *result = PyDict_New();
   4424         PyObject *key, *value;
   4425         if (!result)
   4426             return NULL;
   4427         for (i = 0; i < 256; i++) {
   4428             value = NULL;
   4429             key = PyInt_FromLong(decode[i]);
   4430             value = PyInt_FromLong(i);
   4431             if (!key || !value)
   4432                 goto failed1;
   4433             if (PyDict_SetItem(result, key, value) == -1)
   4434                 goto failed1;
   4435             Py_DECREF(key);
   4436             Py_DECREF(value);
   4437         }
   4438         return result;
   4439       failed1:
   4440         Py_XDECREF(key);
   4441         Py_XDECREF(value);
   4442         Py_DECREF(result);
   4443         return NULL;
   4444     }
   4445 
   4446     /* Create a three-level trie */
   4447     result = PyObject_MALLOC(sizeof(struct encoding_map) +
   4448                              16*count2 + 128*count3 - 1);
   4449     if (!result)
   4450         return PyErr_NoMemory();
   4451     PyObject_Init(result, &EncodingMapType);
   4452     mresult = (struct encoding_map*)result;
   4453     mresult->count2 = count2;
   4454     mresult->count3 = count3;
   4455     mlevel1 = mresult->level1;
   4456     mlevel2 = mresult->level23;
   4457     mlevel3 = mresult->level23 + 16*count2;
   4458     memcpy(mlevel1, level1, 32);
   4459     memset(mlevel2, 0xFF, 16*count2);
   4460     memset(mlevel3, 0, 128*count3);
   4461     count3 = 0;
   4462     for (i = 1; i < 256; i++) {
   4463         int o1, o2, o3, i2, i3;
   4464         if (decode[i] == 0xFFFE)
   4465             /* unmapped character */
   4466             continue;
   4467         o1 = decode[i]>>11;
   4468         o2 = (decode[i]>>7) & 0xF;
   4469         i2 = 16*mlevel1[o1] + o2;
   4470         if (mlevel2[i2] == 0xFF)
   4471             mlevel2[i2] = count3++;
   4472         o3 = decode[i] & 0x7F;
   4473         i3 = 128*mlevel2[i2] + o3;
   4474         mlevel3[i3] = i;
   4475     }
   4476     return result;
   4477 }
   4478 
   4479 static int
   4480 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
   4481 {
   4482     struct encoding_map *map = (struct encoding_map*)mapping;
   4483     int l1 = c>>11;
   4484     int l2 = (c>>7) & 0xF;
   4485     int l3 = c & 0x7F;
   4486     int i;
   4487 
   4488 #ifdef Py_UNICODE_WIDE
   4489     if (c > 0xFFFF) {
   4490         return -1;
   4491     }
   4492 #endif
   4493     if (c == 0)
   4494         return 0;
   4495     /* level 1*/
   4496     i = map->level1[l1];
   4497     if (i == 0xFF) {
   4498         return -1;
   4499     }
   4500     /* level 2*/
   4501     i = map->level23[16*i+l2];
   4502     if (i == 0xFF) {
   4503         return -1;
   4504     }
   4505     /* level 3 */
   4506     i = map->level23[16*map->count2 + 128*i + l3];
   4507     if (i == 0) {
   4508         return -1;
   4509     }
   4510     return i;
   4511 }
   4512 
   4513 /* Lookup the character ch in the mapping. If the character
   4514    can't be found, Py_None is returned (or NULL, if another
   4515    error occurred). */
   4516 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
   4517 {
   4518     PyObject *w = PyInt_FromLong((long)c);
   4519     PyObject *x;
   4520 
   4521     if (w == NULL)
   4522         return NULL;
   4523     x = PyObject_GetItem(mapping, w);
   4524     Py_DECREF(w);
   4525     if (x == NULL) {
   4526         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
   4527             /* No mapping found means: mapping is undefined. */
   4528             PyErr_Clear();
   4529             x = Py_None;
   4530             Py_INCREF(x);
   4531             return x;
   4532         } else
   4533             return NULL;
   4534     }
   4535     else if (x == Py_None)
   4536         return x;
   4537     else if (PyInt_Check(x)) {
   4538         long value = PyInt_AS_LONG(x);
   4539         if (value < 0 || value > 255) {
   4540             PyErr_SetString(PyExc_TypeError,
   4541                             "character mapping must be in range(256)");
   4542             Py_DECREF(x);
   4543             return NULL;
   4544         }
   4545         return x;
   4546     }
   4547     else if (PyString_Check(x))
   4548         return x;
   4549     else {
   4550         /* wrong return value */
   4551         PyErr_SetString(PyExc_TypeError,
   4552                         "character mapping must return integer, None or str");
   4553         Py_DECREF(x);
   4554         return NULL;
   4555     }
   4556 }
   4557 
   4558 static int
   4559 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
   4560 {
   4561     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
   4562     /* exponentially overallocate to minimize reallocations */
   4563     if (requiredsize < 2*outsize)
   4564         requiredsize = 2*outsize;
   4565     if (_PyString_Resize(outobj, requiredsize)) {
   4566         return 0;
   4567     }
   4568     return 1;
   4569 }
   4570 
   4571 typedef enum charmapencode_result {
   4572     enc_SUCCESS, enc_FAILED, enc_EXCEPTION
   4573 }charmapencode_result;
   4574 /* lookup the character, put the result in the output string and adjust
   4575    various state variables. Reallocate the output string if not enough
   4576    space is available. Return a new reference to the object that
   4577    was put in the output buffer, or Py_None, if the mapping was undefined
   4578    (in which case no character was written) or NULL, if a
   4579    reallocation error occurred. The caller must decref the result */
   4580 static
   4581 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
   4582                                           PyObject **outobj, Py_ssize_t *outpos)
   4583 {
   4584     PyObject *rep;
   4585     char *outstart;
   4586     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
   4587 
   4588     if (Py_TYPE(mapping) == &EncodingMapType) {
   4589         int res = encoding_map_lookup(c, mapping);
   4590         Py_ssize_t requiredsize = *outpos+1;
   4591         if (res == -1)
   4592             return enc_FAILED;
   4593         if (outsize<requiredsize)
   4594             if (!charmapencode_resize(outobj, outpos, requiredsize))
   4595                 return enc_EXCEPTION;
   4596         outstart = PyString_AS_STRING(*outobj);
   4597         outstart[(*outpos)++] = (char)res;
   4598         return enc_SUCCESS;
   4599     }
   4600 
   4601     rep = charmapencode_lookup(c, mapping);
   4602     if (rep==NULL)
   4603         return enc_EXCEPTION;
   4604     else if (rep==Py_None) {
   4605         Py_DECREF(rep);
   4606         return enc_FAILED;
   4607     } else {
   4608         if (PyInt_Check(rep)) {
   4609             Py_ssize_t requiredsize = *outpos+1;
   4610             if (outsize<requiredsize)
   4611                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
   4612                     Py_DECREF(rep);
   4613                     return enc_EXCEPTION;
   4614                 }
   4615             outstart = PyString_AS_STRING(*outobj);
   4616             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
   4617         }
   4618         else {
   4619             const char *repchars = PyString_AS_STRING(rep);
   4620             Py_ssize_t repsize = PyString_GET_SIZE(rep);
   4621             Py_ssize_t requiredsize = *outpos+repsize;
   4622             if (outsize<requiredsize)
   4623                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
   4624                     Py_DECREF(rep);
   4625                     return enc_EXCEPTION;
   4626                 }
   4627             outstart = PyString_AS_STRING(*outobj);
   4628             memcpy(outstart + *outpos, repchars, repsize);
   4629             *outpos += repsize;
   4630         }
   4631     }
   4632     Py_DECREF(rep);
   4633     return enc_SUCCESS;
   4634 }
   4635 
   4636 /* handle an error in PyUnicode_EncodeCharmap
   4637    Return 0 on success, -1 on error */
   4638 static
   4639 int charmap_encoding_error(
   4640     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
   4641     PyObject **exceptionObject,
   4642     int *known_errorHandler, PyObject **errorHandler, const char *errors,
   4643     PyObject **res, Py_ssize_t *respos)
   4644 {
   4645     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
   4646     Py_ssize_t repsize;
   4647     Py_ssize_t newpos;
   4648     Py_UNICODE *uni2;
   4649     /* startpos for collecting unencodable chars */
   4650     Py_ssize_t collstartpos = *inpos;
   4651     Py_ssize_t collendpos = *inpos+1;
   4652     Py_ssize_t collpos;
   4653     char *encoding = "charmap";
   4654     char *reason = "character maps to <undefined>";
   4655     charmapencode_result x;
   4656 
   4657     /* find all unencodable characters */
   4658     while (collendpos < size) {
   4659         PyObject *rep;
   4660         if (Py_TYPE(mapping) == &EncodingMapType) {
   4661             int res = encoding_map_lookup(p[collendpos], mapping);
   4662             if (res != -1)
   4663                 break;
   4664             ++collendpos;
   4665             continue;
   4666         }
   4667 
   4668         rep = charmapencode_lookup(p[collendpos], mapping);
   4669         if (rep==NULL)
   4670             return -1;
   4671         else if (rep!=Py_None) {
   4672             Py_DECREF(rep);
   4673             break;
   4674         }
   4675         Py_DECREF(rep);
   4676         ++collendpos;
   4677     }
   4678     /* cache callback name lookup
   4679      * (if not done yet, i.e. it's the first error) */
   4680     if (*known_errorHandler==-1) {
   4681         if ((errors==NULL) || (!strcmp(errors, "strict")))
   4682             *known_errorHandler = 1;
   4683         else if (!strcmp(errors, "replace"))
   4684             *known_errorHandler = 2;
   4685         else if (!strcmp(errors, "ignore"))
   4686             *known_errorHandler = 3;
   4687         else if (!strcmp(errors, "xmlcharrefreplace"))
   4688             *known_errorHandler = 4;
   4689         else
   4690             *known_errorHandler = 0;
   4691     }
   4692     switch (*known_errorHandler) {
   4693     case 1: /* strict */
   4694         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
   4695         return -1;
   4696     case 2: /* replace */
   4697         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
   4698             x = charmapencode_output('?', mapping, res, respos);
   4699             if (x==enc_EXCEPTION) {
   4700                 return -1;
   4701             }
   4702             else if (x==enc_FAILED) {
   4703                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
   4704                 return -1;
   4705             }
   4706         }
   4707         /* fall through */
   4708     case 3: /* ignore */
   4709         *inpos = collendpos;
   4710         break;
   4711     case 4: /* xmlcharrefreplace */
   4712         /* generate replacement */
   4713         for (collpos = collstartpos; collpos < collendpos;) {
   4714             char buffer[2+29+1+1];
   4715             char *cp;
   4716             Py_UCS4 ch = p[collpos++];
   4717 #ifndef Py_UNICODE_WIDE
   4718             if ((0xD800 <= ch && ch <= 0xDBFF) &&
   4719                 (collpos < collendpos) &&
   4720                 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
   4721                 ch = ((((ch & 0x03FF) << 10) |
   4722                        ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
   4723             }
   4724 #endif
   4725             sprintf(buffer, "&#%d;", (int)ch);
   4726             for (cp = buffer; *cp; ++cp) {
   4727                 x = charmapencode_output(*cp, mapping, res, respos);
   4728                 if (x==enc_EXCEPTION)
   4729                     return -1;
   4730                 else if (x==enc_FAILED) {
   4731                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
   4732                     return -1;
   4733                 }
   4734             }
   4735         }
   4736         *inpos = collendpos;
   4737         break;
   4738     default:
   4739         repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
   4740                                                       encoding, reason, p, size, exceptionObject,
   4741                                                       collstartpos, collendpos, &newpos);
   4742         if (repunicode == NULL)
   4743             return -1;
   4744         /* generate replacement  */
   4745         repsize = PyUnicode_GET_SIZE(repunicode);
   4746         for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
   4747             x = charmapencode_output(*uni2, mapping, res, respos);
   4748             if (x==enc_EXCEPTION) {
   4749                 return -1;
   4750             }
   4751             else if (x==enc_FAILED) {
   4752                 Py_DECREF(repunicode);
   4753                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
   4754                 return -1;
   4755             }
   4756         }
   4757         *inpos = newpos;
   4758         Py_DECREF(repunicode);
   4759     }
   4760     return 0;
   4761 }
   4762 
   4763 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
   4764                                   Py_ssize_t size,
   4765                                   PyObject *mapping,
   4766                                   const char *errors)
   4767 {
   4768     /* output object */
   4769     PyObject *res = NULL;
   4770     /* current input position */
   4771     Py_ssize_t inpos = 0;
   4772     /* current output position */
   4773     Py_ssize_t respos = 0;
   4774     PyObject *errorHandler = NULL;
   4775     PyObject *exc = NULL;
   4776     /* the following variable is used for caching string comparisons
   4777      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
   4778      * 3=ignore, 4=xmlcharrefreplace */
   4779     int known_errorHandler = -1;
   4780 
   4781     /* Default to Latin-1 */
   4782     if (mapping == NULL)
   4783         return PyUnicode_EncodeLatin1(p, size, errors);
   4784 
   4785     /* allocate enough for a simple encoding without
   4786        replacements, if we need more, we'll resize */
   4787     res = PyString_FromStringAndSize(NULL, size);
   4788     if (res == NULL)
   4789         goto onError;
   4790     if (size == 0)
   4791         return res;
   4792 
   4793     while (inpos<size) {
   4794         /* try to encode it */
   4795         charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
   4796         if (x==enc_EXCEPTION) /* error */
   4797             goto onError;
   4798         if (x==enc_FAILED) { /* unencodable character */
   4799             if (charmap_encoding_error(p, size, &inpos, mapping,
   4800                                        &exc,
   4801                                        &known_errorHandler, &errorHandler, errors,
   4802                                        &res, &respos)) {
   4803                 goto onError;
   4804             }
   4805         }
   4806         else
   4807             /* done with this character => adjust input position */
   4808             ++inpos;
   4809     }
   4810 
   4811     /* Resize if we allocated to much */
   4812     if (respos<PyString_GET_SIZE(res)) {
   4813         if (_PyString_Resize(&res, respos))
   4814             goto onError;
   4815     }
   4816     Py_XDECREF(exc);
   4817     Py_XDECREF(errorHandler);
   4818     return res;
   4819 
   4820   onError:
   4821     Py_XDECREF(res);
   4822     Py_XDECREF(exc);
   4823     Py_XDECREF(errorHandler);
   4824     return NULL;
   4825 }
   4826 
   4827 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
   4828                                     PyObject *mapping)
   4829 {
   4830     if (!PyUnicode_Check(unicode) || mapping == NULL) {
   4831         PyErr_BadArgument();
   4832         return NULL;
   4833     }
   4834     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
   4835                                    PyUnicode_GET_SIZE(unicode),
   4836                                    mapping,
   4837                                    NULL);
   4838 }
   4839 
   4840 /* create or adjust a UnicodeTranslateError */
   4841 static void make_translate_exception(PyObject **exceptionObject,
   4842                                      const Py_UNICODE *unicode, Py_ssize_t size,
   4843                                      Py_ssize_t startpos, Py_ssize_t endpos,
   4844                                      const char *reason)
   4845 {
   4846     if (*exceptionObject == NULL) {
   4847         *exceptionObject = PyUnicodeTranslateError_Create(
   4848             unicode, size, startpos, endpos, reason);
   4849     }
   4850     else {
   4851         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
   4852             goto onError;
   4853         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
   4854             goto onError;
   4855         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
   4856             goto onError;
   4857         return;
   4858       onError:
   4859         Py_CLEAR(*exceptionObject);
   4860     }
   4861 }
   4862 
   4863 /* raises a UnicodeTranslateError */
   4864 static void raise_translate_exception(PyObject **exceptionObject,
   4865                                       const Py_UNICODE *unicode, Py_ssize_t size,
   4866                                       Py_ssize_t startpos, Py_ssize_t endpos,
   4867                                       const char *reason)
   4868 {
   4869     make_translate_exception(exceptionObject,
   4870                              unicode, size, startpos, endpos, reason);
   4871     if (*exceptionObject != NULL)
   4872         PyCodec_StrictErrors(*exceptionObject);
   4873 }
   4874 
   4875 /* error handling callback helper:
   4876    build arguments, call the callback and check the arguments,
   4877    put the result into newpos and return the replacement string, which
   4878    has to be freed by the caller */
   4879 static PyObject *unicode_translate_call_errorhandler(const char *errors,
   4880                                                      PyObject **errorHandler,
   4881                                                      const char *reason,
   4882                                                      const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
   4883                                                      Py_ssize_t startpos, Py_ssize_t endpos,
   4884                                                      Py_ssize_t *newpos)
   4885 {
   4886     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
   4887 
   4888     Py_ssize_t i_newpos;
   4889     PyObject *restuple;
   4890     PyObject *resunicode;
   4891 
   4892     if (*errorHandler == NULL) {
   4893         *errorHandler = PyCodec_LookupError(errors);
   4894         if (*errorHandler == NULL)
   4895             return NULL;
   4896     }
   4897 
   4898     make_translate_exception(exceptionObject,
   4899                              unicode, size, startpos, endpos, reason);
   4900     if (*exceptionObject == NULL)
   4901         return NULL;
   4902 
   4903     restuple = PyObject_CallFunctionObjArgs(
   4904         *errorHandler, *exceptionObject, NULL);
   4905     if (restuple == NULL)
   4906         return NULL;
   4907     if (!PyTuple_Check(restuple)) {
   4908         PyErr_SetString(PyExc_TypeError, &argparse[4]);
   4909         Py_DECREF(restuple);
   4910         return NULL;
   4911     }
   4912     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
   4913                           &resunicode, &i_newpos)) {
   4914         Py_DECREF(restuple);
   4915         return NULL;
   4916     }
   4917     if (i_newpos<0)
   4918         *newpos = size+i_newpos;
   4919     else
   4920         *newpos = i_newpos;
   4921     if (*newpos<0 || *newpos>size) {
   4922         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
   4923         Py_DECREF(restuple);
   4924         return NULL;
   4925     }
   4926     Py_INCREF(resunicode);
   4927     Py_DECREF(restuple);
   4928     return resunicode;
   4929 }
   4930 
   4931 /* Lookup the character ch in the mapping and put the result in result,
   4932    which must be decrefed by the caller.
   4933    Return 0 on success, -1 on error */
   4934 static
   4935 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
   4936 {
   4937     PyObject *w = PyInt_FromLong((long)c);
   4938     PyObject *x;
   4939 
   4940     if (w == NULL)
   4941         return -1;
   4942     x = PyObject_GetItem(mapping, w);
   4943     Py_DECREF(w);
   4944     if (x == NULL) {
   4945         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
   4946             /* No mapping found means: use 1:1 mapping. */
   4947             PyErr_Clear();
   4948             *result = NULL;
   4949             return 0;
   4950         } else
   4951             return -1;
   4952     }
   4953     else if (x == Py_None) {
   4954         *result = x;
   4955         return 0;
   4956     }
   4957     else if (PyInt_Check(x)) {
   4958         long value = PyInt_AS_LONG(x);
   4959         long max = PyUnicode_GetMax();
   4960         if (value < 0 || value > max) {
   4961             PyErr_Format(PyExc_TypeError,
   4962                          "character mapping must be in range(0x%lx)", max+1);
   4963             Py_DECREF(x);
   4964             return -1;
   4965         }
   4966         *result = x;
   4967         return 0;
   4968     }
   4969     else if (PyUnicode_Check(x)) {
   4970         *result = x;
   4971         return 0;
   4972     }
   4973     else {
   4974         /* wrong return value */
   4975         PyErr_SetString(PyExc_TypeError,
   4976                         "character mapping must return integer, None or unicode");
   4977         Py_DECREF(x);
   4978         return -1;
   4979     }
   4980 }
   4981 /* ensure that *outobj is at least requiredsize characters long,
   4982    if not reallocate and adjust various state variables.
   4983    Return 0 on success, -1 on error */
   4984 static
   4985 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
   4986                                Py_ssize_t requiredsize)
   4987 {
   4988     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
   4989     if (requiredsize > oldsize) {
   4990         /* remember old output position */
   4991         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
   4992         /* exponentially overallocate to minimize reallocations */
   4993         if (requiredsize < 2 * oldsize)
   4994             requiredsize = 2 * oldsize;
   4995         if (PyUnicode_Resize(outobj, requiredsize) < 0)
   4996             return -1;
   4997         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
   4998     }
   4999     return 0;
   5000 }
   5001 /* lookup the character, put the result in the output string and adjust
   5002    various state variables. Return a new reference to the object that
   5003    was put in the output buffer in *result, or Py_None, if the mapping was
   5004    undefined (in which case no character was written).
   5005    The called must decref result.
   5006    Return 0 on success, -1 on error. */
   5007 static
   5008 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
   5009                             Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
   5010                             PyObject **res)
   5011 {
   5012     if (charmaptranslate_lookup(*curinp, mapping, res))
   5013         return -1;
   5014     if (*res==NULL) {
   5015         /* not found => default to 1:1 mapping */
   5016         *(*outp)++ = *curinp;
   5017     }
   5018     else if (*res==Py_None)
   5019         ;
   5020     else if (PyInt_Check(*res)) {
   5021         /* no overflow check, because we know that the space is enough */
   5022         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
   5023     }
   5024     else if (PyUnicode_Check(*res)) {
   5025         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
   5026         if (repsize==1) {
   5027             /* no overflow check, because we know that the space is enough */
   5028             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
   5029         }
   5030         else if (repsize!=0) {
   5031             /* more than one character */
   5032             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
   5033                 (insize - (curinp-startinp)) +
   5034                 repsize - 1;
   5035             if (charmaptranslate_makespace(outobj, outp, requiredsize))
   5036                 return -1;
   5037             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
   5038             *outp += repsize;
   5039         }
   5040     }
   5041     else
   5042         return -1;
   5043     return 0;
   5044 }
   5045 
   5046 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
   5047                                      Py_ssize_t size,
   5048                                      PyObject *mapping,
   5049                                      const char *errors)
   5050 {
   5051     /* output object */
   5052     PyObject *res = NULL;
   5053     /* pointers to the beginning and end+1 of input */
   5054     const Py_UNICODE *startp = p;
   5055     const Py_UNICODE *endp = p + size;
   5056     /* pointer into the output */
   5057     Py_UNICODE *str;
   5058     /* current output position */
   5059     Py_ssize_t respos = 0;
   5060     char *reason = "character maps to <undefined>";
   5061     PyObject *errorHandler = NULL;
   5062     PyObject *exc = NULL;
   5063     /* the following variable is used for caching string comparisons
   5064      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
   5065      * 3=ignore, 4=xmlcharrefreplace */
   5066     int known_errorHandler = -1;
   5067 
   5068     if (mapping == NULL) {
   5069         PyErr_BadArgument();
   5070         return NULL;
   5071     }
   5072 
   5073     /* allocate enough for a simple 1:1 translation without
   5074        replacements, if we need more, we'll resize */
   5075     res = PyUnicode_FromUnicode(NULL, size);
   5076     if (res == NULL)
   5077         goto onError;
   5078     if (size == 0)
   5079         return res;
   5080     str = PyUnicode_AS_UNICODE(res);
   5081 
   5082     while (p<endp) {
   5083         /* try to encode it */
   5084         PyObject *x = NULL;
   5085         if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
   5086             Py_XDECREF(x);
   5087             goto onError;
   5088         }
   5089         Py_XDECREF(x);
   5090         if (x!=Py_None) /* it worked => adjust input pointer */
   5091             ++p;
   5092         else { /* untranslatable character */
   5093             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
   5094             Py_ssize_t repsize;
   5095             Py_ssize_t newpos;
   5096             Py_UNICODE *uni2;
   5097             /* startpos for collecting untranslatable chars */
   5098             const Py_UNICODE *collstart = p;
   5099             const Py_UNICODE *collend = p+1;
   5100             const Py_UNICODE *coll;
   5101 
   5102             /* find all untranslatable characters */
   5103             while (collend < endp) {
   5104                 if (charmaptranslate_lookup(*collend, mapping, &x))
   5105                     goto onError;
   5106                 Py_XDECREF(x);
   5107                 if (x!=Py_None)
   5108                     break;
   5109                 ++collend;
   5110             }
   5111             /* cache callback name lookup
   5112              * (if not done yet, i.e. it's the first error) */
   5113             if (known_errorHandler==-1) {
   5114                 if ((errors==NULL) || (!strcmp(errors, "strict")))
   5115                     known_errorHandler = 1;
   5116                 else if (!strcmp(errors, "replace"))
   5117                     known_errorHandler = 2;
   5118                 else if (!strcmp(errors, "ignore"))
   5119                     known_errorHandler = 3;
   5120                 else if (!strcmp(errors, "xmlcharrefreplace"))
   5121                     known_errorHandler = 4;
   5122                 else
   5123                     known_errorHandler = 0;
   5124             }
   5125             switch (known_errorHandler) {
   5126             case 1: /* strict */
   5127                 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
   5128                 goto onError;
   5129             case 2: /* replace */
   5130                 /* No need to check for space, this is a 1:1 replacement */
   5131                 for (coll = collstart; coll<collend; ++coll)
   5132                     *str++ = '?';
   5133                 /* fall through */
   5134             case 3: /* ignore */
   5135                 p = collend;
   5136                 break;
   5137             case 4: /* xmlcharrefreplace */
   5138                 /* generate replacement (temporarily (mis)uses p) */
   5139                 for (p = collstart; p < collend;) {
   5140                     char buffer[2+29+1+1];
   5141                     char *cp;
   5142                     Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
   5143                     sprintf(buffer, "&#%d;", (int)ch);
   5144                     if (charmaptranslate_makespace(&res, &str,
   5145                                                    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
   5146                         goto onError;
   5147                     for (cp = buffer; *cp; ++cp)
   5148                         *str++ = *cp;
   5149                 }
   5150                 p = collend;
   5151                 break;
   5152             default:
   5153                 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
   5154                                                                  reason, startp, size, &exc,
   5155                                                                  collstart-startp, collend-startp, &newpos);
   5156                 if (repunicode == NULL)
   5157                     goto onError;
   5158                 /* generate replacement  */
   5159                 repsize = PyUnicode_GET_SIZE(repunicode);
   5160                 if (charmaptranslate_makespace(&res, &str,
   5161                                                (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
   5162                     Py_DECREF(repunicode);
   5163                     goto onError;
   5164                 }
   5165                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
   5166                     *str++ = *uni2;
   5167                 p = startp + newpos;
   5168                 Py_DECREF(repunicode);
   5169             }
   5170         }
   5171     }
   5172     /* Resize if we allocated to much */
   5173     respos = str-PyUnicode_AS_UNICODE(res);
   5174     if (respos<PyUnicode_GET_SIZE(res)) {
   5175         if (PyUnicode_Resize(&res, respos) < 0)
   5176             goto onError;
   5177     }
   5178     Py_XDECREF(exc);
   5179     Py_XDECREF(errorHandler);
   5180     return res;
   5181 
   5182   onError:
   5183     Py_XDECREF(res);
   5184     Py_XDECREF(exc);
   5185     Py_XDECREF(errorHandler);
   5186     return NULL;
   5187 }
   5188 
   5189 PyObject *PyUnicode_Translate(PyObject *str,
   5190                               PyObject *mapping,
   5191                               const char *errors)
   5192 {
   5193     PyObject *result;
   5194 
   5195     str = PyUnicode_FromObject(str);
   5196     if (str == NULL)
   5197         goto onError;
   5198     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
   5199                                         PyUnicode_GET_SIZE(str),
   5200                                         mapping,
   5201                                         errors);
   5202     Py_DECREF(str);
   5203     return result;
   5204 
   5205   onError:
   5206     Py_XDECREF(str);
   5207     return NULL;
   5208 }
   5209 
   5210 /* --- Decimal Encoder ---------------------------------------------------- */
   5211 
   5212 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
   5213                             Py_ssize_t length,
   5214                             char *output,
   5215                             const char *errors)
   5216 {
   5217     Py_UNICODE *p, *end;
   5218     PyObject *errorHandler = NULL;
   5219     PyObject *exc = NULL;
   5220     const char *encoding = "decimal";
   5221     const char *reason = "invalid decimal Unicode string";
   5222     /* the following variable is used for caching string comparisons
   5223      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
   5224     int known_errorHandler = -1;
   5225 
   5226     if (output == NULL) {
   5227         PyErr_BadArgument();
   5228         return -1;
   5229     }
   5230 
   5231     p = s;
   5232     end = s + length;
   5233     while (p < end) {
   5234         register Py_UNICODE ch = *p;
   5235         int decimal;
   5236         PyObject *repunicode;
   5237         Py_ssize_t repsize;
   5238         Py_ssize_t newpos;
   5239         Py_UNICODE *uni2;
   5240         Py_UNICODE *collstart;
   5241         Py_UNICODE *collend;
   5242 
   5243         if (Py_UNICODE_ISSPACE(ch)) {
   5244             *output++ = ' ';
   5245             ++p;
   5246             continue;
   5247         }
   5248         decimal = Py_UNICODE_TODECIMAL(ch);
   5249         if (decimal >= 0) {
   5250             *output++ = '0' + decimal;
   5251             ++p;
   5252             continue;
   5253         }
   5254         if (0 < ch && ch < 256) {
   5255             *output++ = (char)ch;
   5256             ++p;
   5257             continue;
   5258         }
   5259         /* All other characters are considered unencodable */
   5260         collstart = p;
   5261         for (collend = p+1; collend < end; collend++) {
   5262             if ((0 < *collend && *collend < 256) ||
   5263                 Py_UNICODE_ISSPACE(*collend) ||
   5264                 0 <= Py_UNICODE_TODECIMAL(*collend))
   5265                 break;
   5266         }
   5267         /* cache callback name lookup
   5268          * (if not done yet, i.e. it's the first error) */
   5269         if (known_errorHandler==-1) {
   5270             if ((errors==NULL) || (!strcmp(errors, "strict")))
   5271                 known_errorHandler = 1;
   5272             else if (!strcmp(errors, "replace"))
   5273                 known_errorHandler = 2;
   5274             else if (!strcmp(errors, "ignore"))
   5275                 known_errorHandler = 3;
   5276             else if (!strcmp(errors, "xmlcharrefreplace"))
   5277                 known_errorHandler = 4;
   5278             else
   5279                 known_errorHandler = 0;
   5280         }
   5281         switch (known_errorHandler) {
   5282         case 1: /* strict */
   5283             raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
   5284             goto onError;
   5285         case 2: /* replace */
   5286             for (p = collstart; p < collend; ++p)
   5287                 *output++ = '?';
   5288             /* fall through */
   5289         case 3: /* ignore */
   5290             p = collend;
   5291             break;
   5292         case 4: /* xmlcharrefreplace */
   5293             /* generate replacement (temporarily (mis)uses p) */
   5294             for (p = collstart; p < collend;) {
   5295                 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
   5296                 output += sprintf(output, "&#%d;", ch);
   5297             }
   5298             p = collend;
   5299             break;
   5300         default:
   5301             repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
   5302                                                           encoding, reason, s, length, &exc,
   5303                                                           collstart-s, collend-s, &newpos);
   5304             if (repunicode == NULL)
   5305                 goto onError;
   5306             /* generate replacement  */
   5307             repsize = PyUnicode_GET_SIZE(repunicode);
   5308             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
   5309                 Py_UNICODE ch = *uni2;
   5310                 if (Py_UNICODE_ISSPACE(ch))
   5311                     *output++ = ' ';
   5312                 else {
   5313                     decimal = Py_UNICODE_TODECIMAL(ch);
   5314                     if (decimal >= 0)
   5315                         *output++ = '0' + decimal;
   5316                     else if (0 < ch && ch < 256)
   5317                         *output++ = (char)ch;
   5318                     else {
   5319                         Py_DECREF(repunicode);
   5320                         raise_encode_exception(&exc, encoding,
   5321                                                s, length, collstart-s, collend-s, reason);
   5322                         goto onError;
   5323                     }
   5324                 }
   5325             }
   5326             p = s + newpos;
   5327             Py_DECREF(repunicode);
   5328         }
   5329     }
   5330     /* 0-terminate the output string */
   5331     *output++ = '\0';
   5332     Py_XDECREF(exc);
   5333     Py_XDECREF(errorHandler);
   5334     return 0;
   5335 
   5336   onError:
   5337     Py_XDECREF(exc);
   5338     Py_XDECREF(errorHandler);
   5339     return -1;
   5340 }
   5341 
   5342 /* --- Helpers ------------------------------------------------------------ */
   5343 
   5344 #include "stringlib/unicodedefs.h"
   5345 #include "stringlib/fastsearch.h"
   5346 
   5347 #include "stringlib/count.h"
   5348 #include "stringlib/find.h"
   5349 #include "stringlib/partition.h"
   5350 #include "stringlib/split.h"
   5351 
/* Helper macro to fix up start/end slice values in place.

   end:   clamped to len when larger; when negative, len is added and a
          result that is still negative becomes 0.
   start: when negative, len is added and a result that is still negative
          becomes 0.  start is NOT clamped against len.

   start and end must be modifiable lvalues, and every argument is evaluated
   more than once, so pass plain variables/fields without side effects.
   The expansion is an if/else-if statement followed by a second if
   statement (it is not wrapped in do { } while (0)), so do not use it as
   the unbraced body of an if, else or loop.

   NOTE(review): stringlib/find.h, included just above, may define a macro
   of the same name -- confirm before changing the replacement list. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
   5366 
   5367 Py_ssize_t PyUnicode_Count(PyObject *str,
   5368                            PyObject *substr,
   5369                            Py_ssize_t start,
   5370                            Py_ssize_t end)
   5371 {
   5372     Py_ssize_t result;
   5373     PyUnicodeObject* str_obj;
   5374     PyUnicodeObject* sub_obj;
   5375 
   5376     str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
   5377     if (!str_obj)
   5378         return -1;
   5379     sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
   5380     if (!sub_obj) {
   5381         Py_DECREF(str_obj);
   5382         return -1;
   5383     }
   5384 
   5385     ADJUST_INDICES(start, end, str_obj->length);
   5386     result = stringlib_count(
   5387         str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
   5388         PY_SSIZE_T_MAX
   5389         );
   5390 
   5391     Py_DECREF(sub_obj);
   5392     Py_DECREF(str_obj);
   5393 
   5394     return result;
   5395 }
   5396 
   5397 Py_ssize_t PyUnicode_Find(PyObject *str,
   5398                           PyObject *sub,
   5399                           Py_ssize_t start,
   5400                           Py_ssize_t end,
   5401                           int direction)
   5402 {
   5403     Py_ssize_t result;
   5404 
   5405     str = PyUnicode_FromObject(str);
   5406     if (!str)
   5407         return -2;
   5408     sub = PyUnicode_FromObject(sub);
   5409     if (!sub) {
   5410         Py_DECREF(str);
   5411         return -2;
   5412     }
   5413 
   5414     if (direction > 0)
   5415         result = stringlib_find_slice(
   5416             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
   5417             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
   5418             start, end
   5419             );
   5420     else
   5421         result = stringlib_rfind_slice(
   5422             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
   5423             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
   5424             start, end
   5425             );
   5426 
   5427     Py_DECREF(str);
   5428     Py_DECREF(sub);
   5429 
   5430     return result;
   5431 }
   5432 
   5433 static
   5434 int tailmatch(PyUnicodeObject *self,
   5435               PyUnicodeObject *substring,
   5436               Py_ssize_t start,
   5437               Py_ssize_t end,
   5438               int direction)
   5439 {
   5440     if (substring->length == 0)
   5441         return 1;
   5442 
   5443     ADJUST_INDICES(start, end, self->length);
   5444     end -= substring->length;
   5445     if (end < start)
   5446         return 0;
   5447 
   5448     if (direction > 0) {
   5449         if (Py_UNICODE_MATCH(self, end, substring))
   5450             return 1;
   5451     } else {
   5452         if (Py_UNICODE_MATCH(self, start, substring))
   5453             return 1;
   5454     }
   5455 
   5456     return 0;
   5457 }
   5458 
   5459 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
   5460                                PyObject *substr,
   5461                                Py_ssize_t start,
   5462                                Py_ssize_t end,
   5463                                int direction)
   5464 {
   5465     Py_ssize_t result;
   5466 
   5467     str = PyUnicode_FromObject(str);
   5468     if (str == NULL)
   5469         return -1;
   5470     substr = PyUnicode_FromObject(substr);
   5471     if (substr == NULL) {
   5472         Py_DECREF(str);
   5473         return -1;
   5474     }
   5475 
   5476     result = tailmatch((PyUnicodeObject *)str,
   5477                        (PyUnicodeObject *)substr,
   5478                        start, end, direction);
   5479     Py_DECREF(str);
   5480     Py_DECREF(substr);
   5481     return result;
   5482 }
   5483 
   5484 /* Apply fixfct filter to the Unicode object self and return a
   5485    reference to the modified object */
   5486 
   5487 static
   5488 PyObject *fixup(PyUnicodeObject *self,
   5489                 int (*fixfct)(PyUnicodeObject *s))
   5490 {
   5491 
   5492     PyUnicodeObject *u;
   5493 
   5494     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
   5495     if (u == NULL)
   5496         return NULL;
   5497 
   5498     Py_UNICODE_COPY(u->str, self->str, self->length);
   5499 
   5500     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
   5501         /* fixfct should return TRUE if it modified the buffer. If
   5502            FALSE, return a reference to the original buffer instead
   5503            (to save space, not time) */
   5504         Py_INCREF(self);
   5505         Py_DECREF(u);
   5506         return (PyObject*) self;
   5507     }
   5508     return (PyObject*) u;
   5509 }
   5510 
   5511 static
   5512 int fixupper(PyUnicodeObject *self)
   5513 {
   5514     Py_ssize_t len = self->length;
   5515     Py_UNICODE *s = self->str;
   5516     int status = 0;
   5517 
   5518     while (len-- > 0) {
   5519         register Py_UNICODE ch;
   5520 
   5521         ch = Py_UNICODE_TOUPPER(*s);
   5522         if (ch != *s) {
   5523             status = 1;
   5524             *s = ch;
   5525         }
   5526         s++;
   5527     }
   5528 
   5529     return status;
   5530 }
   5531 
   5532 static
   5533 int fixlower(PyUnicodeObject *self)
   5534 {
   5535     Py_ssize_t len = self->length;
   5536     Py_UNICODE *s = self->str;
   5537     int status = 0;
   5538 
   5539     while (len-- > 0) {
   5540         register Py_UNICODE ch;
   5541 
   5542         ch = Py_UNICODE_TOLOWER(*s);
   5543         if (ch != *s) {
   5544             status = 1;
   5545             *s = ch;
   5546         }
   5547         s++;
   5548     }
   5549 
   5550     return status;
   5551 }
   5552 
   5553 static
   5554 int fixswapcase(PyUnicodeObject *self)
   5555 {
   5556     Py_ssize_t len = self->length;
   5557     Py_UNICODE *s = self->str;
   5558     int status = 0;
   5559 
   5560     while (len-- > 0) {
   5561         if (Py_UNICODE_ISUPPER(*s)) {
   5562             *s = Py_UNICODE_TOLOWER(*s);
   5563             status = 1;
   5564         } else if (Py_UNICODE_ISLOWER(*s)) {
   5565             *s = Py_UNICODE_TOUPPER(*s);
   5566             status = 1;
   5567         }
   5568         s++;
   5569     }
   5570 
   5571     return status;
   5572 }
   5573 
   5574 static
   5575 int fixcapitalize(PyUnicodeObject *self)
   5576 {
   5577     Py_ssize_t len = self->length;
   5578     Py_UNICODE *s = self->str;
   5579     int status = 0;
   5580 
   5581     if (len == 0)
   5582         return 0;
   5583     if (!Py_UNICODE_ISUPPER(*s)) {
   5584         *s = Py_UNICODE_TOUPPER(*s);
   5585         status = 1;
   5586     }
   5587     s++;
   5588     while (--len > 0) {
   5589         if (!Py_UNICODE_ISLOWER(*s)) {
   5590             *s = Py_UNICODE_TOLOWER(*s);
   5591             status = 1;
   5592         }
   5593         s++;
   5594     }
   5595     return status;
   5596 }
   5597 
   5598 static
   5599 int fixtitle(PyUnicodeObject *self)
   5600 {
   5601     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   5602     register Py_UNICODE *e;
   5603     int previous_is_cased;
   5604 
   5605     /* Shortcut for single character strings */
   5606     if (PyUnicode_GET_SIZE(self) == 1) {
   5607         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
   5608         if (*p != ch) {
   5609             *p = ch;
   5610             return 1;
   5611         }
   5612         else
   5613             return 0;
   5614     }
   5615 
   5616     e = p + PyUnicode_GET_SIZE(self);
   5617     previous_is_cased = 0;
   5618     for (; p < e; p++) {
   5619         register const Py_UNICODE ch = *p;
   5620 
   5621         if (previous_is_cased)
   5622             *p = Py_UNICODE_TOLOWER(ch);
   5623         else
   5624             *p = Py_UNICODE_TOTITLE(ch);
   5625 
   5626         if (Py_UNICODE_ISLOWER(ch) ||
   5627             Py_UNICODE_ISUPPER(ch) ||
   5628             Py_UNICODE_ISTITLE(ch))
   5629             previous_is_cased = 1;
   5630         else
   5631             previous_is_cased = 0;
   5632     }
   5633     return 1;
   5634 }
   5635 
   5636 PyObject *
   5637 PyUnicode_Join(PyObject *separator, PyObject *seq)
   5638 {
   5639     PyObject *internal_separator = NULL;
   5640     const Py_UNICODE blank = ' ';
   5641     const Py_UNICODE *sep = &blank;
   5642     Py_ssize_t seplen = 1;
   5643     PyUnicodeObject *res = NULL; /* the result */
   5644     Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
   5645     Py_ssize_t res_used;         /* # used bytes */
   5646     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
   5647     PyObject *fseq;          /* PySequence_Fast(seq) */
   5648     Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
   5649     PyObject *item;
   5650     Py_ssize_t i;
   5651 
   5652     fseq = PySequence_Fast(seq, "can only join an iterable");
   5653     if (fseq == NULL) {
   5654         return NULL;
   5655     }
   5656 
   5657     /* Grrrr.  A codec may be invoked to convert str objects to
   5658      * Unicode, and so it's possible to call back into Python code
   5659      * during PyUnicode_FromObject(), and so it's possible for a sick
   5660      * codec to change the size of fseq (if seq is a list).  Therefore
   5661      * we have to keep refetching the size -- can't assume seqlen
   5662      * is invariant.
   5663      */
   5664     seqlen = PySequence_Fast_GET_SIZE(fseq);
   5665     /* If empty sequence, return u"". */
   5666     if (seqlen == 0) {
   5667         res = _PyUnicode_New(0);  /* empty sequence; return u"" */
   5668         goto Done;
   5669     }
   5670     /* If singleton sequence with an exact Unicode, return that. */
   5671     if (seqlen == 1) {
   5672         item = PySequence_Fast_GET_ITEM(fseq, 0);
   5673         if (PyUnicode_CheckExact(item)) {
   5674             Py_INCREF(item);
   5675             res = (PyUnicodeObject *)item;
   5676             goto Done;
   5677         }
   5678     }
   5679 
   5680     /* At least two items to join, or one that isn't exact Unicode. */
   5681     if (seqlen > 1) {
   5682         /* Set up sep and seplen -- they're needed. */
   5683         if (separator == NULL) {
   5684             sep = &blank;
   5685             seplen = 1;
   5686         }
   5687         else {
   5688             internal_separator = PyUnicode_FromObject(separator);
   5689             if (internal_separator == NULL)
   5690                 goto onError;
   5691             sep = PyUnicode_AS_UNICODE(internal_separator);
   5692             seplen = PyUnicode_GET_SIZE(internal_separator);
   5693             /* In case PyUnicode_FromObject() mutated seq. */
   5694             seqlen = PySequence_Fast_GET_SIZE(fseq);
   5695         }
   5696     }
   5697 
   5698     /* Get space. */
   5699     res = _PyUnicode_New(res_alloc);
   5700     if (res == NULL)
   5701         goto onError;
   5702     res_p = PyUnicode_AS_UNICODE(res);
   5703     res_used = 0;
   5704 
   5705     for (i = 0; i < seqlen; ++i) {
   5706         Py_ssize_t itemlen;
   5707         Py_ssize_t new_res_used;
   5708 
   5709         item = PySequence_Fast_GET_ITEM(fseq, i);
   5710         /* Convert item to Unicode. */
   5711         if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
   5712             PyErr_Format(PyExc_TypeError,
   5713                          "sequence item %zd: expected string or Unicode,"
   5714                          " %.80s found",
   5715                          i, Py_TYPE(item)->tp_name);
   5716             goto onError;
   5717         }
   5718         item = PyUnicode_FromObject(item);
   5719         if (item == NULL)
   5720             goto onError;
   5721         /* We own a reference to item from here on. */
   5722 
   5723         /* In case PyUnicode_FromObject() mutated seq. */
   5724         seqlen = PySequence_Fast_GET_SIZE(fseq);
   5725 
   5726         /* Make sure we have enough space for the separator and the item. */
   5727         itemlen = PyUnicode_GET_SIZE(item);
   5728         new_res_used = res_used + itemlen;
   5729         if (new_res_used < 0)
   5730             goto Overflow;
   5731         if (i < seqlen - 1) {
   5732             new_res_used += seplen;
   5733             if (new_res_used < 0)
   5734                 goto Overflow;
   5735         }
   5736         if (new_res_used > res_alloc) {
   5737             /* double allocated size until it's big enough */
   5738             do {
   5739                 res_alloc += res_alloc;
   5740                 if (res_alloc <= 0)
   5741                     goto Overflow;
   5742             } while (new_res_used > res_alloc);
   5743             if (_PyUnicode_Resize(&res, res_alloc) < 0) {
   5744                 Py_DECREF(item);
   5745                 goto onError;
   5746             }
   5747             res_p = PyUnicode_AS_UNICODE(res) + res_used;
   5748         }
   5749 
   5750         /* Copy item, and maybe the separator. */
   5751         Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
   5752         res_p += itemlen;
   5753         if (i < seqlen - 1) {
   5754             Py_UNICODE_COPY(res_p, sep, seplen);
   5755             res_p += seplen;
   5756         }
   5757         Py_DECREF(item);
   5758         res_used = new_res_used;
   5759     }
   5760 
   5761     /* Shrink res to match the used area; this probably can't fail,
   5762      * but it's cheap to check.
   5763      */
   5764     if (_PyUnicode_Resize(&res, res_used) < 0)
   5765         goto onError;
   5766 
   5767   Done:
   5768     Py_XDECREF(internal_separator);
   5769     Py_DECREF(fseq);
   5770     return (PyObject *)res;
   5771 
   5772   Overflow:
   5773     PyErr_SetString(PyExc_OverflowError,
   5774                     "join() result is too long for a Python string");
   5775     Py_DECREF(item);
   5776     /* fall through */
   5777 
   5778   onError:
   5779     Py_XDECREF(internal_separator);
   5780     Py_DECREF(fseq);
   5781     Py_XDECREF(res);
   5782     return NULL;
   5783 }
   5784 
   5785 static
   5786 PyUnicodeObject *pad(PyUnicodeObject *self,
   5787                      Py_ssize_t left,
   5788                      Py_ssize_t right,
   5789                      Py_UNICODE fill)
   5790 {
   5791     PyUnicodeObject *u;
   5792 
   5793     if (left < 0)
   5794         left = 0;
   5795     if (right < 0)
   5796         right = 0;
   5797 
   5798     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
   5799         Py_INCREF(self);
   5800         return self;
   5801     }
   5802 
   5803     if (left > PY_SSIZE_T_MAX - self->length ||
   5804         right > PY_SSIZE_T_MAX - (left + self->length)) {
   5805         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
   5806         return NULL;
   5807     }
   5808     u = _PyUnicode_New(left + self->length + right);
   5809     if (u) {
   5810         if (left)
   5811             Py_UNICODE_FILL(u->str, fill, left);
   5812         Py_UNICODE_COPY(u->str + left, self->str, self->length);
   5813         if (right)
   5814             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
   5815     }
   5816 
   5817     return u;
   5818 }
   5819 
   5820 PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
   5821 {
   5822     PyObject *list;
   5823 
   5824     string = PyUnicode_FromObject(string);
   5825     if (string == NULL)
   5826         return NULL;
   5827 
   5828     list = stringlib_splitlines(
   5829         (PyObject*) string, PyUnicode_AS_UNICODE(string),
   5830         PyUnicode_GET_SIZE(string), keepends);
   5831 
   5832     Py_DECREF(string);
   5833     return list;
   5834 }
   5835 
   5836 static
   5837 PyObject *split(PyUnicodeObject *self,
   5838                 PyUnicodeObject *substring,
   5839                 Py_ssize_t maxcount)
   5840 {
   5841     if (maxcount < 0)
   5842         maxcount = PY_SSIZE_T_MAX;
   5843 
   5844     if (substring == NULL)
   5845         return stringlib_split_whitespace(
   5846             (PyObject*) self,  self->str, self->length, maxcount
   5847             );
   5848 
   5849     return stringlib_split(
   5850         (PyObject*) self,  self->str, self->length,
   5851         substring->str, substring->length,
   5852         maxcount
   5853         );
   5854 }
   5855 
   5856 static
   5857 PyObject *rsplit(PyUnicodeObject *self,
   5858                  PyUnicodeObject *substring,
   5859                  Py_ssize_t maxcount)
   5860 {
   5861     if (maxcount < 0)
   5862         maxcount = PY_SSIZE_T_MAX;
   5863 
   5864     if (substring == NULL)
   5865         return stringlib_rsplit_whitespace(
   5866             (PyObject*) self,  self->str, self->length, maxcount
   5867             );
   5868 
   5869     return stringlib_rsplit(
   5870         (PyObject*) self,  self->str, self->length,
   5871         substring->str, substring->length,
   5872         maxcount
   5873         );
   5874 }
   5875 
   5876 static
   5877 PyObject *replace(PyUnicodeObject *self,
   5878                   PyUnicodeObject *str1,
   5879                   PyUnicodeObject *str2,
   5880                   Py_ssize_t maxcount)
   5881 {
   5882     PyUnicodeObject *u;
   5883 
   5884     if (maxcount < 0)
   5885         maxcount = PY_SSIZE_T_MAX;
   5886     else if (maxcount == 0 || self->length == 0)
   5887         goto nothing;
   5888 
   5889     if (str1->length == str2->length) {
   5890         Py_ssize_t i;
   5891         /* same length */
   5892         if (str1->length == 0)
   5893             goto nothing;
   5894         if (str1->length == 1) {
   5895             /* replace characters */
   5896             Py_UNICODE u1, u2;
   5897             if (!findchar(self->str, self->length, str1->str[0]))
   5898                 goto nothing;
   5899             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
   5900             if (!u)
   5901                 return NULL;
   5902             Py_UNICODE_COPY(u->str, self->str, self->length);
   5903             u1 = str1->str[0];
   5904             u2 = str2->str[0];
   5905             for (i = 0; i < u->length; i++)
   5906                 if (u->str[i] == u1) {
   5907                     if (--maxcount < 0)
   5908                         break;
   5909                     u->str[i] = u2;
   5910                 }
   5911         } else {
   5912             i = stringlib_find(
   5913                 self->str, self->length, str1->str, str1->length, 0
   5914                 );
   5915             if (i < 0)
   5916                 goto nothing;
   5917             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
   5918             if (!u)
   5919                 return NULL;
   5920             Py_UNICODE_COPY(u->str, self->str, self->length);
   5921 
   5922             /* change everything in-place, starting with this one */
   5923             Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
   5924             i += str1->length;
   5925 
   5926             while ( --maxcount > 0) {
   5927                 i = stringlib_find(self->str+i, self->length-i,
   5928                                    str1->str, str1->length,
   5929                                    i);
   5930                 if (i == -1)
   5931                     break;
   5932                 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
   5933                 i += str1->length;
   5934             }
   5935         }
   5936     } else {
   5937 
   5938         Py_ssize_t n, i, j;
   5939         Py_ssize_t product, new_size, delta;
   5940         Py_UNICODE *p;
   5941 
   5942         /* replace strings */
   5943         n = stringlib_count(self->str, self->length, str1->str, str1->length,
   5944                             maxcount);
   5945         if (n == 0)
   5946             goto nothing;
   5947         /* new_size = self->length + n * (str2->length - str1->length)); */
   5948         delta = (str2->length - str1->length);
   5949         if (delta == 0) {
   5950             new_size = self->length;
   5951         } else {
   5952             product = n * (str2->length - str1->length);
   5953             if ((product / (str2->length - str1->length)) != n) {
   5954                 PyErr_SetString(PyExc_OverflowError,
   5955                                 "replace string is too long");
   5956                 return NULL;
   5957             }
   5958             new_size = self->length + product;
   5959             if (new_size < 0) {
   5960                 PyErr_SetString(PyExc_OverflowError,
   5961                                 "replace string is too long");
   5962                 return NULL;
   5963             }
   5964         }
   5965         u = _PyUnicode_New(new_size);
   5966         if (!u)
   5967             return NULL;
   5968         i = 0;
   5969         p = u->str;
   5970         if (str1->length > 0) {
   5971             while (n-- > 0) {
   5972                 /* look for next match */
   5973                 j = stringlib_find(self->str+i, self->length-i,
   5974                                    str1->str, str1->length,
   5975                                    i);
   5976                 if (j == -1)
   5977                     break;
   5978                 else if (j > i) {
   5979                     /* copy unchanged part [i:j] */
   5980                     Py_UNICODE_COPY(p, self->str+i, j-i);
   5981                     p += j - i;
   5982                 }
   5983                 /* copy substitution string */
   5984                 if (str2->length > 0) {
   5985                     Py_UNICODE_COPY(p, str2->str, str2->length);
   5986                     p += str2->length;
   5987                 }
   5988                 i = j + str1->length;
   5989             }
   5990             if (i < self->length)
   5991                 /* copy tail [i:] */
   5992                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
   5993         } else {
   5994             /* interleave */
   5995             while (n > 0) {
   5996                 Py_UNICODE_COPY(p, str2->str, str2->length);
   5997                 p += str2->length;
   5998                 if (--n <= 0)
   5999                     break;
   6000                 *p++ = self->str[i++];
   6001             }
   6002             Py_UNICODE_COPY(p, self->str+i, self->length-i);
   6003         }
   6004     }
   6005     return (PyObject *) u;
   6006 
   6007   nothing:
   6008     /* nothing to replace; return original string (when possible) */
   6009     if (PyUnicode_CheckExact(self)) {
   6010         Py_INCREF(self);
   6011         return (PyObject *) self;
   6012     }
   6013     return PyUnicode_FromUnicode(self->str, self->length);
   6014 }
   6015 
   6016 /* --- Unicode Object Methods --------------------------------------------- */
   6017 
   6018 PyDoc_STRVAR(title__doc__,
   6019              "S.title() -> unicode\n\
   6020 \n\
   6021 Return a titlecased version of S, i.e. words start with title case\n\
   6022 characters, all remaining cased characters have lower case.");
   6023 
   6024 static PyObject*
   6025 unicode_title(PyUnicodeObject *self)
   6026 {
   6027     return fixup(self, fixtitle);
   6028 }
   6029 
   6030 PyDoc_STRVAR(capitalize__doc__,
   6031              "S.capitalize() -> unicode\n\
   6032 \n\
   6033 Return a capitalized version of S, i.e. make the first character\n\
   6034 have upper case and the rest lower case.");
   6035 
   6036 static PyObject*
   6037 unicode_capitalize(PyUnicodeObject *self)
   6038 {
   6039     return fixup(self, fixcapitalize);
   6040 }
   6041 
   6042 #if 0
   6043 PyDoc_STRVAR(capwords__doc__,
   6044              "S.capwords() -> unicode\n\
   6045 \n\
   6046 Apply .capitalize() to all words in S and return the result with\n\
   6047 normalized whitespace (all whitespace strings are replaced by ' ').");
   6048 
   6049 static PyObject*
   6050 unicode_capwords(PyUnicodeObject *self)
   6051 {
   6052     PyObject *list;
   6053     PyObject *item;
   6054     Py_ssize_t i;
   6055 
   6056     /* Split into words */
   6057     list = split(self, NULL, -1);
   6058     if (!list)
   6059         return NULL;
   6060 
   6061     /* Capitalize each word */
   6062     for (i = 0; i < PyList_GET_SIZE(list); i++) {
   6063         item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
   6064                      fixcapitalize);
   6065         if (item == NULL)
   6066             goto onError;
   6067         Py_DECREF(PyList_GET_ITEM(list, i));
   6068         PyList_SET_ITEM(list, i, item);
   6069     }
   6070 
   6071     /* Join the words to form a new string */
   6072     item = PyUnicode_Join(NULL, list);
   6073 
   6074   onError:
   6075     Py_DECREF(list);
   6076     return (PyObject *)item;
   6077 }
   6078 #endif
   6079 
   6080 /* Argument converter.  Coerces to a single unicode character */
   6081 
   6082 static int
   6083 convert_uc(PyObject *obj, void *addr)
   6084 {
   6085     Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
   6086     PyObject *uniobj;
   6087     Py_UNICODE *unistr;
   6088 
   6089     uniobj = PyUnicode_FromObject(obj);
   6090     if (uniobj == NULL) {
   6091         PyErr_SetString(PyExc_TypeError,
   6092                         "The fill character cannot be converted to Unicode");
   6093         return 0;
   6094     }
   6095     if (PyUnicode_GET_SIZE(uniobj) != 1) {
   6096         PyErr_SetString(PyExc_TypeError,
   6097                         "The fill character must be exactly one character long");
   6098         Py_DECREF(uniobj);
   6099         return 0;
   6100     }
   6101     unistr = PyUnicode_AS_UNICODE(uniobj);
   6102     *fillcharloc = unistr[0];
   6103     Py_DECREF(uniobj);
   6104     return 1;
   6105 }
   6106 
   6107 PyDoc_STRVAR(center__doc__,
   6108              "S.center(width[, fillchar]) -> unicode\n\
   6109 \n\
   6110 Return S centered in a Unicode string of length width. Padding is\n\
   6111 done using the specified fill character (default is a space)");
   6112 
   6113 static PyObject *
   6114 unicode_center(PyUnicodeObject *self, PyObject *args)
   6115 {
   6116     Py_ssize_t marg, left;
   6117     Py_ssize_t width;
   6118     Py_UNICODE fillchar = ' ';
   6119 
   6120     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
   6121         return NULL;
   6122 
   6123     if (self->length >= width && PyUnicode_CheckExact(self)) {
   6124         Py_INCREF(self);
   6125         return (PyObject*) self;
   6126     }
   6127 
   6128     marg = width - self->length;
   6129     left = marg / 2 + (marg & width & 1);
   6130 
   6131     return (PyObject*) pad(self, left, marg - left, fillchar);
   6132 }
   6133 
   6134 #if 0
   6135 
   6136 /* This code should go into some future Unicode collation support
   6137    module. The basic comparison should compare ordinals on a naive
   6138    basis (this is what Java does and thus Jython too). */
   6139 
   6140 /* speedy UTF-16 code point order comparison */
   6141 /* gleaned from: */
   6142 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
   6143 
   6144 static short utf16Fixup[32] =
   6145 {
   6146     0, 0, 0, 0, 0, 0, 0, 0,
   6147     0, 0, 0, 0, 0, 0, 0, 0,
   6148     0, 0, 0, 0, 0, 0, 0, 0,
   6149     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
   6150 };
   6151 
   6152 static int
   6153 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
   6154 {
   6155     Py_ssize_t len1, len2;
   6156 
   6157     Py_UNICODE *s1 = str1->str;
   6158     Py_UNICODE *s2 = str2->str;
   6159 
   6160     len1 = str1->length;
   6161     len2 = str2->length;
   6162 
   6163     while (len1 > 0 && len2 > 0) {
   6164         Py_UNICODE c1, c2;
   6165 
   6166         c1 = *s1++;
   6167         c2 = *s2++;
   6168 
   6169         if (c1 > (1<<11) * 26)
   6170             c1 += utf16Fixup[c1>>11];
   6171         if (c2 > (1<<11) * 26)
   6172             c2 += utf16Fixup[c2>>11];
   6173         /* now c1 and c2 are in UTF-32-compatible order */
   6174 
   6175         if (c1 != c2)
   6176             return (c1 < c2) ? -1 : 1;
   6177 
   6178         len1--; len2--;
   6179     }
   6180 
   6181     return (len1 < len2) ? -1 : (len1 != len2);
   6182 }
   6183 
   6184 #else
   6185 
   6186 static int
   6187 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
   6188 {
   6189     register Py_ssize_t len1, len2;
   6190 
   6191     Py_UNICODE *s1 = str1->str;
   6192     Py_UNICODE *s2 = str2->str;
   6193 
   6194     len1 = str1->length;
   6195     len2 = str2->length;
   6196 
   6197     while (len1 > 0 && len2 > 0) {
   6198         Py_UNICODE c1, c2;
   6199 
   6200         c1 = *s1++;
   6201         c2 = *s2++;
   6202 
   6203         if (c1 != c2)
   6204             return (c1 < c2) ? -1 : 1;
   6205 
   6206         len1--; len2--;
   6207     }
   6208 
   6209     return (len1 < len2) ? -1 : (len1 != len2);
   6210 }
   6211 
   6212 #endif
   6213 
   6214 int PyUnicode_Compare(PyObject *left,
   6215                       PyObject *right)
   6216 {
   6217     PyUnicodeObject *u = NULL, *v = NULL;
   6218     int result;
   6219 
   6220     /* Coerce the two arguments */
   6221     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
   6222     if (u == NULL)
   6223         goto onError;
   6224     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
   6225     if (v == NULL)
   6226         goto onError;
   6227 
   6228     /* Shortcut for empty or interned objects */
   6229     if (v == u) {
   6230         Py_DECREF(u);
   6231         Py_DECREF(v);
   6232         return 0;
   6233     }
   6234 
   6235     result = unicode_compare(u, v);
   6236 
   6237     Py_DECREF(u);
   6238     Py_DECREF(v);
   6239     return result;
   6240 
   6241   onError:
   6242     Py_XDECREF(u);
   6243     Py_XDECREF(v);
   6244     return -1;
   6245 }
   6246 
   6247 PyObject *PyUnicode_RichCompare(PyObject *left,
   6248                                 PyObject *right,
   6249                                 int op)
   6250 {
   6251     int result;
   6252 
   6253     result = PyUnicode_Compare(left, right);
   6254     if (result == -1 && PyErr_Occurred())
   6255         goto onError;
   6256 
   6257     /* Convert the return value to a Boolean */
   6258     switch (op) {
   6259     case Py_EQ:
   6260         result = (result == 0);
   6261         break;
   6262     case Py_NE:
   6263         result = (result != 0);
   6264         break;
   6265     case Py_LE:
   6266         result = (result <= 0);
   6267         break;
   6268     case Py_GE:
   6269         result = (result >= 0);
   6270         break;
   6271     case Py_LT:
   6272         result = (result == -1);
   6273         break;
   6274     case Py_GT:
   6275         result = (result == 1);
   6276         break;
   6277     }
   6278     return PyBool_FromLong(result);
   6279 
   6280   onError:
   6281 
   6282     /* Standard case
   6283 
   6284        Type errors mean that PyUnicode_FromObject() could not convert
   6285        one of the arguments (usually the right hand side) to Unicode,
   6286        ie. we can't handle the comparison request. However, it is
   6287        possible that the other object knows a comparison method, which
   6288        is why we return Py_NotImplemented to give the other object a
   6289        chance.
   6290 
   6291     */
   6292     if (PyErr_ExceptionMatches(PyExc_TypeError)) {
   6293         PyErr_Clear();
   6294         Py_INCREF(Py_NotImplemented);
   6295         return Py_NotImplemented;
   6296     }
   6297     if (op != Py_EQ && op != Py_NE)
   6298         return NULL;
   6299 
   6300     /* Equality comparison.
   6301 
   6302        This is a special case: we silence any PyExc_UnicodeDecodeError
   6303        and instead turn it into a PyErr_UnicodeWarning.
   6304 
   6305     */
   6306     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
   6307         return NULL;
   6308     PyErr_Clear();
   6309     if (PyErr_Warn(PyExc_UnicodeWarning,
   6310                    (op == Py_EQ) ?
   6311                    "Unicode equal comparison "
   6312                    "failed to convert both arguments to Unicode - "
   6313                    "interpreting them as being unequal" :
   6314                    "Unicode unequal comparison "
   6315                    "failed to convert both arguments to Unicode - "
   6316                    "interpreting them as being unequal"
   6317             ) < 0)
   6318         return NULL;
   6319     result = (op == Py_NE);
   6320     return PyBool_FromLong(result);
   6321 }
   6322 
   6323 int PyUnicode_Contains(PyObject *container,
   6324                        PyObject *element)
   6325 {
   6326     PyObject *str, *sub;
   6327     int result;
   6328 
   6329     /* Coerce the two arguments */
   6330     sub = PyUnicode_FromObject(element);
   6331     if (!sub) {
   6332         return -1;
   6333     }
   6334 
   6335     str = PyUnicode_FromObject(container);
   6336     if (!str) {
   6337         Py_DECREF(sub);
   6338         return -1;
   6339     }
   6340 
   6341     result = stringlib_contains_obj(str, sub);
   6342 
   6343     Py_DECREF(str);
   6344     Py_DECREF(sub);
   6345 
   6346     return result;
   6347 }
   6348 
   6349 /* Concat to string or Unicode object giving a new Unicode object. */
   6350 
   6351 PyObject *PyUnicode_Concat(PyObject *left,
   6352                            PyObject *right)
   6353 {
   6354     PyUnicodeObject *u = NULL, *v = NULL, *w;
   6355 
   6356     /* Coerce the two arguments */
   6357     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
   6358     if (u == NULL)
   6359         goto onError;
   6360     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
   6361     if (v == NULL)
   6362         goto onError;
   6363 
   6364     /* Shortcuts */
   6365     if (v == unicode_empty) {
   6366         Py_DECREF(v);
   6367         return (PyObject *)u;
   6368     }
   6369     if (u == unicode_empty) {
   6370         Py_DECREF(u);
   6371         return (PyObject *)v;
   6372     }
   6373 
   6374     /* Concat the two Unicode strings */
   6375     w = _PyUnicode_New(u->length + v->length);
   6376     if (w == NULL)
   6377         goto onError;
   6378     Py_UNICODE_COPY(w->str, u->str, u->length);
   6379     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
   6380 
   6381     Py_DECREF(u);
   6382     Py_DECREF(v);
   6383     return (PyObject *)w;
   6384 
   6385   onError:
   6386     Py_XDECREF(u);
   6387     Py_XDECREF(v);
   6388     return NULL;
   6389 }
   6390 
   6391 PyDoc_STRVAR(count__doc__,
   6392              "S.count(sub[, start[, end]]) -> int\n\
   6393 \n\
   6394 Return the number of non-overlapping occurrences of substring sub in\n\
   6395 Unicode string S[start:end].  Optional arguments start and end are\n\
   6396 interpreted as in slice notation.");
   6397 
   6398 static PyObject *
   6399 unicode_count(PyUnicodeObject *self, PyObject *args)
   6400 {
   6401     PyUnicodeObject *substring;
   6402     Py_ssize_t start = 0;
   6403     Py_ssize_t end = PY_SSIZE_T_MAX;
   6404     PyObject *result;
   6405 
   6406     if (!stringlib_parse_args_finds_unicode("count", args, &substring,
   6407                                             &start, &end))
   6408         return NULL;
   6409 
   6410     ADJUST_INDICES(start, end, self->length);
   6411     result = PyInt_FromSsize_t(
   6412         stringlib_count(self->str + start, end - start,
   6413                         substring->str, substring->length,
   6414                         PY_SSIZE_T_MAX)
   6415         );
   6416 
   6417     Py_DECREF(substring);
   6418 
   6419     return result;
   6420 }
   6421 
   6422 PyDoc_STRVAR(encode__doc__,
   6423              "S.encode([encoding[,errors]]) -> string or unicode\n\
   6424 \n\
   6425 Encodes S using the codec registered for encoding. encoding defaults\n\
   6426 to the default encoding. errors may be given to set a different error\n\
   6427 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
   6428 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
   6429 'xmlcharrefreplace' as well as any other name registered with\n\
   6430 codecs.register_error that can handle UnicodeEncodeErrors.");
   6431 
   6432 static PyObject *
   6433 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
   6434 {
   6435     static char *kwlist[] = {"encoding", "errors", 0};
   6436     char *encoding = NULL;
   6437     char *errors = NULL;
   6438     PyObject *v;
   6439 
   6440     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
   6441                                      kwlist, &encoding, &errors))
   6442         return NULL;
   6443     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
   6444     if (v == NULL)
   6445         goto onError;
   6446     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
   6447         PyErr_Format(PyExc_TypeError,
   6448                      "encoder did not return a string/unicode object "
   6449                      "(type=%.400s)",
   6450                      Py_TYPE(v)->tp_name);
   6451         Py_DECREF(v);
   6452         return NULL;
   6453     }
   6454     return v;
   6455 
   6456   onError:
   6457     return NULL;
   6458 }
   6459 
   6460 PyDoc_STRVAR(decode__doc__,
   6461              "S.decode([encoding[,errors]]) -> string or unicode\n\
   6462 \n\
   6463 Decodes S using the codec registered for encoding. encoding defaults\n\
   6464 to the default encoding. errors may be given to set a different error\n\
   6465 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
   6466 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
   6467 as well as any other name registered with codecs.register_error that is\n\
   6468 able to handle UnicodeDecodeErrors.");
   6469 
   6470 static PyObject *
   6471 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
   6472 {
   6473     static char *kwlist[] = {"encoding", "errors", 0};
   6474     char *encoding = NULL;
   6475     char *errors = NULL;
   6476     PyObject *v;
   6477 
   6478     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
   6479                                      kwlist, &encoding, &errors))
   6480         return NULL;
   6481     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
   6482     if (v == NULL)
   6483         goto onError;
   6484     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
   6485         PyErr_Format(PyExc_TypeError,
   6486                      "decoder did not return a string/unicode object "
   6487                      "(type=%.400s)",
   6488                      Py_TYPE(v)->tp_name);
   6489         Py_DECREF(v);
   6490         return NULL;
   6491     }
   6492     return v;
   6493 
   6494   onError:
   6495     return NULL;
   6496 }
   6497 
   6498 PyDoc_STRVAR(expandtabs__doc__,
   6499              "S.expandtabs([tabsize]) -> unicode\n\
   6500 \n\
   6501 Return a copy of S where all tab characters are expanded using spaces.\n\
   6502 If tabsize is not given, a tab size of 8 characters is assumed.");
   6503 
   6504 static PyObject*
   6505 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
   6506 {
   6507     Py_UNICODE *e;
   6508     Py_UNICODE *p;
   6509     Py_UNICODE *q;
   6510     Py_UNICODE *qe;
   6511     Py_ssize_t i, j, incr;
   6512     PyUnicodeObject *u;
   6513     int tabsize = 8;
   6514 
   6515     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
   6516         return NULL;
   6517 
   6518     /* First pass: determine size of output string */
   6519     i = 0; /* chars up to and including most recent \n or \r */
   6520     j = 0; /* chars since most recent \n or \r (use in tab calculations) */
   6521     e = self->str + self->length; /* end of input */
   6522     for (p = self->str; p < e; p++)
   6523         if (*p == '\t') {
   6524             if (tabsize > 0) {
   6525                 incr = tabsize - (j % tabsize); /* cannot overflow */
   6526                 if (j > PY_SSIZE_T_MAX - incr)
   6527                     goto overflow1;
   6528                 j += incr;
   6529             }
   6530         }
   6531         else {
   6532             if (j > PY_SSIZE_T_MAX - 1)
   6533                 goto overflow1;
   6534             j++;
   6535             if (*p == '\n' || *p == '\r') {
   6536                 if (i > PY_SSIZE_T_MAX - j)
   6537                     goto overflow1;
   6538                 i += j;
   6539                 j = 0;
   6540             }
   6541         }
   6542 
   6543     if (i > PY_SSIZE_T_MAX - j)
   6544         goto overflow1;
   6545 
   6546     /* Second pass: create output string and fill it */
   6547     u = _PyUnicode_New(i + j);
   6548     if (!u)
   6549         return NULL;
   6550 
   6551     j = 0; /* same as in first pass */
   6552     q = u->str; /* next output char */
   6553     qe = u->str + u->length; /* end of output */
   6554 
   6555     for (p = self->str; p < e; p++)
   6556         if (*p == '\t') {
   6557             if (tabsize > 0) {
   6558                 i = tabsize - (j % tabsize);
   6559                 j += i;
   6560                 while (i--) {
   6561                     if (q >= qe)
   6562                         goto overflow2;
   6563                     *q++ = ' ';
   6564                 }
   6565             }
   6566         }
   6567         else {
   6568             if (q >= qe)
   6569                 goto overflow2;
   6570             *q++ = *p;
   6571             j++;
   6572             if (*p == '\n' || *p == '\r')
   6573                 j = 0;
   6574         }
   6575 
   6576     return (PyObject*) u;
   6577 
   6578   overflow2:
   6579     Py_DECREF(u);
   6580   overflow1:
   6581     PyErr_SetString(PyExc_OverflowError, "new string is too long");
   6582     return NULL;
   6583 }
   6584 
   6585 PyDoc_STRVAR(find__doc__,
   6586              "S.find(sub [,start [,end]]) -> int\n\
   6587 \n\
   6588 Return the lowest index in S where substring sub is found,\n\
   6589 such that sub is contained within S[start:end].  Optional\n\
   6590 arguments start and end are interpreted as in slice notation.\n\
   6591 \n\
   6592 Return -1 on failure.");
   6593 
   6594 static PyObject *
   6595 unicode_find(PyUnicodeObject *self, PyObject *args)
   6596 {
   6597     PyUnicodeObject *substring;
   6598     Py_ssize_t start;
   6599     Py_ssize_t end;
   6600     Py_ssize_t result;
   6601 
   6602     if (!stringlib_parse_args_finds_unicode("find", args, &substring,
   6603                                             &start, &end))
   6604         return NULL;
   6605 
   6606     result = stringlib_find_slice(
   6607         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
   6608         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
   6609         start, end
   6610         );
   6611 
   6612     Py_DECREF(substring);
   6613 
   6614     return PyInt_FromSsize_t(result);
   6615 }
   6616 
   6617 static PyObject *
   6618 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
   6619 {
   6620     if (index < 0 || index >= self->length) {
   6621         PyErr_SetString(PyExc_IndexError, "string index out of range");
   6622         return NULL;
   6623     }
   6624 
   6625     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
   6626 }
   6627 
   6628 static long
   6629 unicode_hash(PyUnicodeObject *self)
   6630 {
   6631     /* Since Unicode objects compare equal to their ASCII string
   6632        counterparts, they should use the individual character values
   6633        as basis for their hash value.  This is needed to assure that
   6634        strings and Unicode objects behave in the same way as
   6635        dictionary keys. */
   6636 
   6637     register Py_ssize_t len;
   6638     register Py_UNICODE *p;
   6639     register long x;
   6640 
   6641 #ifdef Py_DEBUG
   6642     assert(_Py_HashSecret_Initialized);
   6643 #endif
   6644     if (self->hash != -1)
   6645         return self->hash;
   6646     len = PyUnicode_GET_SIZE(self);
   6647     /*
   6648       We make the hash of the empty string be 0, rather than using
   6649       (prefix ^ suffix), since this slightly obfuscates the hash secret
   6650     */
   6651     if (len == 0) {
   6652         self->hash = 0;
   6653         return 0;
   6654     }
   6655     p = PyUnicode_AS_UNICODE(self);
   6656     x = _Py_HashSecret.prefix;
   6657     x ^= *p << 7;
   6658     while (--len >= 0)
   6659         x = (1000003*x) ^ *p++;
   6660     x ^= PyUnicode_GET_SIZE(self);
   6661     x ^= _Py_HashSecret.suffix;
   6662     if (x == -1)
   6663         x = -2;
   6664     self->hash = x;
   6665     return x;
   6666 }
   6667 
   6668 PyDoc_STRVAR(index__doc__,
   6669              "S.index(sub [,start [,end]]) -> int\n\
   6670 \n\
   6671 Like S.find() but raise ValueError when the substring is not found.");
   6672 
   6673 static PyObject *
   6674 unicode_index(PyUnicodeObject *self, PyObject *args)
   6675 {
   6676     Py_ssize_t result;
   6677     PyUnicodeObject *substring;
   6678     Py_ssize_t start;
   6679     Py_ssize_t end;
   6680 
   6681     if (!stringlib_parse_args_finds_unicode("index", args, &substring,
   6682                                             &start, &end))
   6683         return NULL;
   6684 
   6685     result = stringlib_find_slice(
   6686         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
   6687         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
   6688         start, end
   6689         );
   6690 
   6691     Py_DECREF(substring);
   6692 
   6693     if (result < 0) {
   6694         PyErr_SetString(PyExc_ValueError, "substring not found");
   6695         return NULL;
   6696     }
   6697 
   6698     return PyInt_FromSsize_t(result);
   6699 }
   6700 
   6701 PyDoc_STRVAR(islower__doc__,
   6702              "S.islower() -> bool\n\
   6703 \n\
   6704 Return True if all cased characters in S are lowercase and there is\n\
   6705 at least one cased character in S, False otherwise.");
   6706 
   6707 static PyObject*
   6708 unicode_islower(PyUnicodeObject *self)
   6709 {
   6710     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6711     register const Py_UNICODE *e;
   6712     int cased;
   6713 
   6714     /* Shortcut for single character strings */
   6715     if (PyUnicode_GET_SIZE(self) == 1)
   6716         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
   6717 
   6718     /* Special case for empty strings */
   6719     if (PyUnicode_GET_SIZE(self) == 0)
   6720         return PyBool_FromLong(0);
   6721 
   6722     e = p + PyUnicode_GET_SIZE(self);
   6723     cased = 0;
   6724     for (; p < e; p++) {
   6725         register const Py_UNICODE ch = *p;
   6726 
   6727         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
   6728             return PyBool_FromLong(0);
   6729         else if (!cased && Py_UNICODE_ISLOWER(ch))
   6730             cased = 1;
   6731     }
   6732     return PyBool_FromLong(cased);
   6733 }
   6734 
   6735 PyDoc_STRVAR(isupper__doc__,
   6736              "S.isupper() -> bool\n\
   6737 \n\
   6738 Return True if all cased characters in S are uppercase and there is\n\
   6739 at least one cased character in S, False otherwise.");
   6740 
   6741 static PyObject*
   6742 unicode_isupper(PyUnicodeObject *self)
   6743 {
   6744     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6745     register const Py_UNICODE *e;
   6746     int cased;
   6747 
   6748     /* Shortcut for single character strings */
   6749     if (PyUnicode_GET_SIZE(self) == 1)
   6750         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
   6751 
   6752     /* Special case for empty strings */
   6753     if (PyUnicode_GET_SIZE(self) == 0)
   6754         return PyBool_FromLong(0);
   6755 
   6756     e = p + PyUnicode_GET_SIZE(self);
   6757     cased = 0;
   6758     for (; p < e; p++) {
   6759         register const Py_UNICODE ch = *p;
   6760 
   6761         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
   6762             return PyBool_FromLong(0);
   6763         else if (!cased && Py_UNICODE_ISUPPER(ch))
   6764             cased = 1;
   6765     }
   6766     return PyBool_FromLong(cased);
   6767 }
   6768 
   6769 PyDoc_STRVAR(istitle__doc__,
   6770              "S.istitle() -> bool\n\
   6771 \n\
   6772 Return True if S is a titlecased string and there is at least one\n\
   6773 character in S, i.e. upper- and titlecase characters may only\n\
   6774 follow uncased characters and lowercase characters only cased ones.\n\
   6775 Return False otherwise.");
   6776 
   6777 static PyObject*
   6778 unicode_istitle(PyUnicodeObject *self)
   6779 {
   6780     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6781     register const Py_UNICODE *e;
   6782     int cased, previous_is_cased;
   6783 
   6784     /* Shortcut for single character strings */
   6785     if (PyUnicode_GET_SIZE(self) == 1)
   6786         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
   6787                                (Py_UNICODE_ISUPPER(*p) != 0));
   6788 
   6789     /* Special case for empty strings */
   6790     if (PyUnicode_GET_SIZE(self) == 0)
   6791         return PyBool_FromLong(0);
   6792 
   6793     e = p + PyUnicode_GET_SIZE(self);
   6794     cased = 0;
   6795     previous_is_cased = 0;
   6796     for (; p < e; p++) {
   6797         register const Py_UNICODE ch = *p;
   6798 
   6799         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
   6800             if (previous_is_cased)
   6801                 return PyBool_FromLong(0);
   6802             previous_is_cased = 1;
   6803             cased = 1;
   6804         }
   6805         else if (Py_UNICODE_ISLOWER(ch)) {
   6806             if (!previous_is_cased)
   6807                 return PyBool_FromLong(0);
   6808             previous_is_cased = 1;
   6809             cased = 1;
   6810         }
   6811         else
   6812             previous_is_cased = 0;
   6813     }
   6814     return PyBool_FromLong(cased);
   6815 }
   6816 
   6817 PyDoc_STRVAR(isspace__doc__,
   6818              "S.isspace() -> bool\n\
   6819 \n\
   6820 Return True if all characters in S are whitespace\n\
   6821 and there is at least one character in S, False otherwise.");
   6822 
   6823 static PyObject*
   6824 unicode_isspace(PyUnicodeObject *self)
   6825 {
   6826     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6827     register const Py_UNICODE *e;
   6828 
   6829     /* Shortcut for single character strings */
   6830     if (PyUnicode_GET_SIZE(self) == 1 &&
   6831         Py_UNICODE_ISSPACE(*p))
   6832         return PyBool_FromLong(1);
   6833 
   6834     /* Special case for empty strings */
   6835     if (PyUnicode_GET_SIZE(self) == 0)
   6836         return PyBool_FromLong(0);
   6837 
   6838     e = p + PyUnicode_GET_SIZE(self);
   6839     for (; p < e; p++) {
   6840         if (!Py_UNICODE_ISSPACE(*p))
   6841             return PyBool_FromLong(0);
   6842     }
   6843     return PyBool_FromLong(1);
   6844 }
   6845 
   6846 PyDoc_STRVAR(isalpha__doc__,
   6847              "S.isalpha() -> bool\n\
   6848 \n\
   6849 Return True if all characters in S are alphabetic\n\
   6850 and there is at least one character in S, False otherwise.");
   6851 
   6852 static PyObject*
   6853 unicode_isalpha(PyUnicodeObject *self)
   6854 {
   6855     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6856     register const Py_UNICODE *e;
   6857 
   6858     /* Shortcut for single character strings */
   6859     if (PyUnicode_GET_SIZE(self) == 1 &&
   6860         Py_UNICODE_ISALPHA(*p))
   6861         return PyBool_FromLong(1);
   6862 
   6863     /* Special case for empty strings */
   6864     if (PyUnicode_GET_SIZE(self) == 0)
   6865         return PyBool_FromLong(0);
   6866 
   6867     e = p + PyUnicode_GET_SIZE(self);
   6868     for (; p < e; p++) {
   6869         if (!Py_UNICODE_ISALPHA(*p))
   6870             return PyBool_FromLong(0);
   6871     }
   6872     return PyBool_FromLong(1);
   6873 }
   6874 
   6875 PyDoc_STRVAR(isalnum__doc__,
   6876              "S.isalnum() -> bool\n\
   6877 \n\
   6878 Return True if all characters in S are alphanumeric\n\
   6879 and there is at least one character in S, False otherwise.");
   6880 
   6881 static PyObject*
   6882 unicode_isalnum(PyUnicodeObject *self)
   6883 {
   6884     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6885     register const Py_UNICODE *e;
   6886 
   6887     /* Shortcut for single character strings */
   6888     if (PyUnicode_GET_SIZE(self) == 1 &&
   6889         Py_UNICODE_ISALNUM(*p))
   6890         return PyBool_FromLong(1);
   6891 
   6892     /* Special case for empty strings */
   6893     if (PyUnicode_GET_SIZE(self) == 0)
   6894         return PyBool_FromLong(0);
   6895 
   6896     e = p + PyUnicode_GET_SIZE(self);
   6897     for (; p < e; p++) {
   6898         if (!Py_UNICODE_ISALNUM(*p))
   6899             return PyBool_FromLong(0);
   6900     }
   6901     return PyBool_FromLong(1);
   6902 }
   6903 
   6904 PyDoc_STRVAR(isdecimal__doc__,
   6905              "S.isdecimal() -> bool\n\
   6906 \n\
   6907 Return True if there are only decimal characters in S,\n\
   6908 False otherwise.");
   6909 
   6910 static PyObject*
   6911 unicode_isdecimal(PyUnicodeObject *self)
   6912 {
   6913     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6914     register const Py_UNICODE *e;
   6915 
   6916     /* Shortcut for single character strings */
   6917     if (PyUnicode_GET_SIZE(self) == 1 &&
   6918         Py_UNICODE_ISDECIMAL(*p))
   6919         return PyBool_FromLong(1);
   6920 
   6921     /* Special case for empty strings */
   6922     if (PyUnicode_GET_SIZE(self) == 0)
   6923         return PyBool_FromLong(0);
   6924 
   6925     e = p + PyUnicode_GET_SIZE(self);
   6926     for (; p < e; p++) {
   6927         if (!Py_UNICODE_ISDECIMAL(*p))
   6928             return PyBool_FromLong(0);
   6929     }
   6930     return PyBool_FromLong(1);
   6931 }
   6932 
   6933 PyDoc_STRVAR(isdigit__doc__,
   6934              "S.isdigit() -> bool\n\
   6935 \n\
   6936 Return True if all characters in S are digits\n\
   6937 and there is at least one character in S, False otherwise.");
   6938 
   6939 static PyObject*
   6940 unicode_isdigit(PyUnicodeObject *self)
   6941 {
   6942     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6943     register const Py_UNICODE *e;
   6944 
   6945     /* Shortcut for single character strings */
   6946     if (PyUnicode_GET_SIZE(self) == 1 &&
   6947         Py_UNICODE_ISDIGIT(*p))
   6948         return PyBool_FromLong(1);
   6949 
   6950     /* Special case for empty strings */
   6951     if (PyUnicode_GET_SIZE(self) == 0)
   6952         return PyBool_FromLong(0);
   6953 
   6954     e = p + PyUnicode_GET_SIZE(self);
   6955     for (; p < e; p++) {
   6956         if (!Py_UNICODE_ISDIGIT(*p))
   6957             return PyBool_FromLong(0);
   6958     }
   6959     return PyBool_FromLong(1);
   6960 }
   6961 
   6962 PyDoc_STRVAR(isnumeric__doc__,
   6963              "S.isnumeric() -> bool\n\
   6964 \n\
   6965 Return True if there are only numeric characters in S,\n\
   6966 False otherwise.");
   6967 
   6968 static PyObject*
   6969 unicode_isnumeric(PyUnicodeObject *self)
   6970 {
   6971     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6972     register const Py_UNICODE *e;
   6973 
   6974     /* Shortcut for single character strings */
   6975     if (PyUnicode_GET_SIZE(self) == 1 &&
   6976         Py_UNICODE_ISNUMERIC(*p))
   6977         return PyBool_FromLong(1);
   6978 
   6979     /* Special case for empty strings */
   6980     if (PyUnicode_GET_SIZE(self) == 0)
   6981         return PyBool_FromLong(0);
   6982 
   6983     e = p + PyUnicode_GET_SIZE(self);
   6984     for (; p < e; p++) {
   6985         if (!Py_UNICODE_ISNUMERIC(*p))
   6986             return PyBool_FromLong(0);
   6987     }
   6988     return PyBool_FromLong(1);
   6989 }
   6990 
   6991 PyDoc_STRVAR(join__doc__,
   6992              "S.join(iterable) -> unicode\n\
   6993 \n\
   6994 Return a string which is the concatenation of the strings in the\n\
   6995 iterable.  The separator between elements is S.");
   6996 
   6997 static PyObject*
   6998 unicode_join(PyObject *self, PyObject *data)
   6999 {
   7000     return PyUnicode_Join(self, data);
   7001 }
   7002 
   7003 static Py_ssize_t
   7004 unicode_length(PyUnicodeObject *self)
   7005 {
   7006     return self->length;
   7007 }
   7008 
   7009 PyDoc_STRVAR(ljust__doc__,
   7010              "S.ljust(width[, fillchar]) -> int\n\
   7011 \n\
   7012 Return S left-justified in a Unicode string of length width. Padding is\n\
   7013 done using the specified fill character (default is a space).");
   7014 
   7015 static PyObject *
   7016 unicode_ljust(PyUnicodeObject *self, PyObject *args)
   7017 {
   7018     Py_ssize_t width;
   7019     Py_UNICODE fillchar = ' ';
   7020 
   7021     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
   7022         return NULL;
   7023 
   7024     if (self->length >= width && PyUnicode_CheckExact(self)) {
   7025         Py_INCREF(self);
   7026         return (PyObject*) self;
   7027     }
   7028 
   7029     return (PyObject*) pad(self, 0, width - self->length, fillchar);
   7030 }
   7031 
   7032 PyDoc_STRVAR(lower__doc__,
   7033              "S.lower() -> unicode\n\
   7034 \n\
   7035 Return a copy of the string S converted to lowercase.");
   7036 
   7037 static PyObject*
   7038 unicode_lower(PyUnicodeObject *self)
   7039 {
   7040     return fixup(self, fixlower);
   7041 }
   7042 
   /* Which end(s) of the string a strip operation works on.  The values
      double as indices into stripformat[] below. */
   #define LEFTSTRIP 0
   #define RIGHTSTRIP 1
   #define BOTHSTRIP 2

   /* PyArg_ParseTuple format strings, indexed by the selectors above */
   static const char *stripformat[] = {
       "|O:lstrip",    /* LEFTSTRIP */
       "|O:rstrip",    /* RIGHTSTRIP */
       "|O:strip"      /* BOTHSTRIP */
   };

   /* Method name embedded in a format string: skips the leading "|O:" */
   #define STRIPNAME(i) (stripformat[i]+3)
   7051 
   7052 /* externally visible for str.strip(unicode) */
   7053 PyObject *
   7054 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
   7055 {
   7056     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
   7057     Py_ssize_t len = PyUnicode_GET_SIZE(self);
   7058     Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
   7059     Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
   7060     Py_ssize_t i, j;
   7061 
   7062     BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
   7063 
   7064     i = 0;
   7065     if (striptype != RIGHTSTRIP) {
   7066         while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
   7067             i++;
   7068         }
   7069     }
   7070 
   7071     j = len;
   7072     if (striptype != LEFTSTRIP) {
   7073         do {
   7074             j--;
   7075         } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
   7076         j++;
   7077     }
   7078 
   7079     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
   7080         Py_INCREF(self);
   7081         return (PyObject*)self;
   7082     }
   7083     else
   7084         return PyUnicode_FromUnicode(s+i, j-i);
   7085 }
   7086 
   7087 
   7088 static PyObject *
   7089 do_strip(PyUnicodeObject *self, int striptype)
   7090 {
   7091     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
   7092     Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
   7093 
   7094     i = 0;
   7095     if (striptype != RIGHTSTRIP) {
   7096         while (i < len && Py_UNICODE_ISSPACE(s[i])) {
   7097             i++;
   7098         }
   7099     }
   7100 
   7101     j = len;
   7102     if (striptype != LEFTSTRIP) {
   7103         do {
   7104             j--;
   7105         } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
   7106         j++;
   7107     }
   7108 
   7109     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
   7110         Py_INCREF(self);
   7111         return (PyObject*)self;
   7112     }
   7113     else
   7114         return PyUnicode_FromUnicode(s+i, j-i);
   7115 }
   7116 
   7117 
   7118 static PyObject *
   7119 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
   7120 {
   7121     PyObject *sep = NULL;
   7122 
   7123     if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
   7124         return NULL;
   7125 
   7126     if (sep != NULL && sep != Py_None) {
   7127         if (PyUnicode_Check(sep))
   7128             return _PyUnicode_XStrip(self, striptype, sep);
   7129         else if (PyString_Check(sep)) {
   7130             PyObject *res;
   7131             sep = PyUnicode_FromObject(sep);
   7132             if (sep==NULL)
   7133                 return NULL;
   7134             res = _PyUnicode_XStrip(self, striptype, sep);
   7135             Py_DECREF(sep);
   7136             return res;
   7137         }
   7138         else {
   7139             PyErr_Format(PyExc_TypeError,
   7140                          "%s arg must be None, unicode or str",
   7141                          STRIPNAME(striptype));
   7142             return NULL;
   7143         }
   7144     }
   7145 
   7146     return do_strip(self, striptype);
   7147 }
   7148 
   7149 
   7150 PyDoc_STRVAR(strip__doc__,
   7151              "S.strip([chars]) -> unicode\n\
   7152 \n\
   7153 Return a copy of the string S with leading and trailing\n\
   7154 whitespace removed.\n\
   7155 If chars is given and not None, remove characters in chars instead.\n\
   7156 If chars is a str, it will be converted to unicode before stripping");
   7157 
   7158 static PyObject *
   7159 unicode_strip(PyUnicodeObject *self, PyObject *args)
   7160 {
   7161     if (PyTuple_GET_SIZE(args) == 0)
   7162         return do_strip(self, BOTHSTRIP); /* Common case */
   7163     else
   7164         return do_argstrip(self, BOTHSTRIP, args);
   7165 }
   7166 
   7167 
   7168 PyDoc_STRVAR(lstrip__doc__,
   7169              "S.lstrip([chars]) -> unicode\n\
   7170 \n\
   7171 Return a copy of the string S with leading whitespace removed.\n\
   7172 If chars is given and not None, remove characters in chars instead.\n\
   7173 If chars is a str, it will be converted to unicode before stripping");
   7174 
   7175 static PyObject *
   7176 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
   7177 {
   7178     if (PyTuple_GET_SIZE(args) == 0)
   7179         return do_strip(self, LEFTSTRIP); /* Common case */
   7180     else
   7181         return do_argstrip(self, LEFTSTRIP, args);
   7182 }
   7183 
   7184 
   7185 PyDoc_STRVAR(rstrip__doc__,
   7186              "S.rstrip([chars]) -> unicode\n\
   7187 \n\
   7188 Return a copy of the string S with trailing whitespace removed.\n\
   7189 If chars is given and not None, remove characters in chars instead.\n\
   7190 If chars is a str, it will be converted to unicode before stripping");
   7191 
   7192 static PyObject *
   7193 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
   7194 {
   7195     if (PyTuple_GET_SIZE(args) == 0)
   7196         return do_strip(self, RIGHTSTRIP); /* Common case */
   7197     else
   7198         return do_argstrip(self, RIGHTSTRIP, args);
   7199 }
   7200 
   7201 
   7202 static PyObject*
   7203 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
   7204 {
   7205     PyUnicodeObject *u;
   7206     Py_UNICODE *p;
   7207     Py_ssize_t nchars;
   7208     size_t nbytes;
   7209 
   7210     if (len < 0)
   7211         len = 0;
   7212 
   7213     if (len == 1 && PyUnicode_CheckExact(str)) {
   7214         /* no repeat, return original string */
   7215         Py_INCREF(str);
   7216         return (PyObject*) str;
   7217     }
   7218 
   7219     /* ensure # of chars needed doesn't overflow int and # of bytes
   7220      * needed doesn't overflow size_t
   7221      */
   7222     nchars = len * str->length;
   7223     if (len && nchars / len != str->length) {
   7224         PyErr_SetString(PyExc_OverflowError,
   7225                         "repeated string is too long");
   7226         return NULL;
   7227     }
   7228     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
   7229     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
   7230         PyErr_SetString(PyExc_OverflowError,
   7231                         "repeated string is too long");
   7232         return NULL;
   7233     }
   7234     u = _PyUnicode_New(nchars);
   7235     if (!u)
   7236         return NULL;
   7237 
   7238     p = u->str;
   7239 
   7240     if (str->length == 1 && len > 0) {
   7241         Py_UNICODE_FILL(p, str->str[0], len);
   7242     } else {
   7243         Py_ssize_t done = 0; /* number of characters copied this far */
   7244         if (done < nchars) {
   7245             Py_UNICODE_COPY(p, str->str, str->length);
   7246             done = str->length;
   7247         }
   7248         while (done < nchars) {
   7249             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
   7250             Py_UNICODE_COPY(p+done, p, n);
   7251             done += n;
   7252         }
   7253     }
   7254 
   7255     return (PyObject*) u;
   7256 }
   7257 
   7258 PyObject *PyUnicode_Replace(PyObject *obj,
   7259                             PyObject *subobj,
   7260                             PyObject *replobj,
   7261                             Py_ssize_t maxcount)
   7262 {
   7263     PyObject *self;
   7264     PyObject *str1;
   7265     PyObject *str2;
   7266     PyObject *result;
   7267 
   7268     self = PyUnicode_FromObject(obj);
   7269     if (self == NULL)
   7270         return NULL;
   7271     str1 = PyUnicode_FromObject(subobj);
   7272     if (str1 == NULL) {
   7273         Py_DECREF(self);
   7274         return NULL;
   7275     }
   7276     str2 = PyUnicode_FromObject(replobj);
   7277     if (str2 == NULL) {
   7278         Py_DECREF(self);
   7279         Py_DECREF(str1);
   7280         return NULL;
   7281     }
   7282     result = replace((PyUnicodeObject *)self,
   7283                      (PyUnicodeObject *)str1,
   7284                      (PyUnicodeObject *)str2,
   7285                      maxcount);
   7286     Py_DECREF(self);
   7287     Py_DECREF(str1);
   7288     Py_DECREF(str2);
   7289     return result;
   7290 }
   7291 
   7292 PyDoc_STRVAR(replace__doc__,
   7293              "S.replace(old, new[, count]) -> unicode\n\
   7294 \n\
   7295 Return a copy of S with all occurrences of substring\n\
   7296 old replaced by new.  If the optional argument count is\n\
   7297 given, only the first count occurrences are replaced.");
   7298 
   7299 static PyObject*
   7300 unicode_replace(PyUnicodeObject *self, PyObject *args)
   7301 {
   7302     PyUnicodeObject *str1;
   7303     PyUnicodeObject *str2;
   7304     Py_ssize_t maxcount = -1;
   7305     PyObject *result;
   7306 
   7307     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
   7308         return NULL;
   7309     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
   7310     if (str1 == NULL)
   7311         return NULL;
   7312     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
   7313     if (str2 == NULL) {
   7314         Py_DECREF(str1);
   7315         return NULL;
   7316     }
   7317 
   7318     result = replace(self, str1, str2, maxcount);
   7319 
   7320     Py_DECREF(str1);
   7321     Py_DECREF(str2);
   7322     return result;
   7323 }
   7324 
   7325 static
   7326 PyObject *unicode_repr(PyObject *unicode)
   7327 {
   7328     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
   7329                                 PyUnicode_GET_SIZE(unicode),
   7330                                 1);
   7331 }
   7332 
   7333 PyDoc_STRVAR(rfind__doc__,
   7334              "S.rfind(sub [,start [,end]]) -> int\n\
   7335 \n\
   7336 Return the highest index in S where substring sub is found,\n\
   7337 such that sub is contained within S[start:end].  Optional\n\
   7338 arguments start and end are interpreted as in slice notation.\n\
   7339 \n\
   7340 Return -1 on failure.");
   7341 
   7342 static PyObject *
   7343 unicode_rfind(PyUnicodeObject *self, PyObject *args)
   7344 {
   7345     PyUnicodeObject *substring;
   7346     Py_ssize_t start;
   7347     Py_ssize_t end;
   7348     Py_ssize_t result;
   7349 
   7350     if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
   7351                                             &start, &end))
   7352         return NULL;
   7353 
   7354     result = stringlib_rfind_slice(
   7355         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
   7356         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
   7357         start, end
   7358         );
   7359 
   7360     Py_DECREF(substring);
   7361 
   7362     return PyInt_FromSsize_t(result);
   7363 }
   7364 
   7365 PyDoc_STRVAR(rindex__doc__,
   7366              "S.rindex(sub [,start [,end]]) -> int\n\
   7367 \n\
   7368 Like S.rfind() but raise ValueError when the substring is not found.");
   7369 
   7370 static PyObject *
   7371 unicode_rindex(PyUnicodeObject *self, PyObject *args)
   7372 {
   7373     PyUnicodeObject *substring;
   7374     Py_ssize_t start;
   7375     Py_ssize_t end;
   7376     Py_ssize_t result;
   7377 
   7378     if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
   7379                                             &start, &end))
   7380         return NULL;
   7381 
   7382     result = stringlib_rfind_slice(
   7383         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
   7384         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
   7385         start, end
   7386         );
   7387 
   7388     Py_DECREF(substring);
   7389 
   7390     if (result < 0) {
   7391         PyErr_SetString(PyExc_ValueError, "substring not found");
   7392         return NULL;
   7393     }
   7394     return PyInt_FromSsize_t(result);
   7395 }
   7396 
   7397 PyDoc_STRVAR(rjust__doc__,
   7398              "S.rjust(width[, fillchar]) -> unicode\n\
   7399 \n\
   7400 Return S right-justified in a Unicode string of length width. Padding is\n\
   7401 done using the specified fill character (default is a space).");
   7402 
   7403 static PyObject *
   7404 unicode_rjust(PyUnicodeObject *self, PyObject *args)
   7405 {
   7406     Py_ssize_t width;
   7407     Py_UNICODE fillchar = ' ';
   7408 
   7409     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
   7410         return NULL;
   7411 
   7412     if (self->length >= width && PyUnicode_CheckExact(self)) {
   7413         Py_INCREF(self);
   7414         return (PyObject*) self;
   7415     }
   7416 
   7417     return (PyObject*) pad(self, width - self->length, 0, fillchar);
   7418 }
   7419 
   7420 static PyObject*
   7421 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
   7422 {
   7423     /* standard clamping */
   7424     if (start < 0)
   7425         start = 0;
   7426     if (end < 0)
   7427         end = 0;
   7428     if (end > self->length)
   7429         end = self->length;
   7430     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
   7431         /* full slice, return original string */
   7432         Py_INCREF(self);
   7433         return (PyObject*) self;
   7434     }
   7435     if (start > end)
   7436         start = end;
   7437     /* copy slice */
   7438     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
   7439                                              end - start);
   7440 }
   7441 
   7442 PyObject *PyUnicode_Split(PyObject *s,
   7443                           PyObject *sep,
   7444                           Py_ssize_t maxsplit)
   7445 {
   7446     PyObject *result;
   7447 
   7448     s = PyUnicode_FromObject(s);
   7449     if (s == NULL)
   7450         return NULL;
   7451     if (sep != NULL) {
   7452         sep = PyUnicode_FromObject(sep);
   7453         if (sep == NULL) {
   7454             Py_DECREF(s);
   7455             return NULL;
   7456         }
   7457     }
   7458 
   7459     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
   7460 
   7461     Py_DECREF(s);
   7462     Py_XDECREF(sep);
   7463     return result;
   7464 }
   7465 
   7466 PyDoc_STRVAR(split__doc__,
   7467              "S.split([sep [,maxsplit]]) -> list of strings\n\
   7468 \n\
   7469 Return a list of the words in S, using sep as the\n\
   7470 delimiter string.  If maxsplit is given, at most maxsplit\n\
   7471 splits are done. If sep is not specified or is None, any\n\
   7472 whitespace string is a separator and empty strings are\n\
   7473 removed from the result.");
   7474 
   7475 static PyObject*
   7476 unicode_split(PyUnicodeObject *self, PyObject *args)
   7477 {
   7478     PyObject *substring = Py_None;
   7479     Py_ssize_t maxcount = -1;
   7480 
   7481     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
   7482         return NULL;
   7483 
   7484     if (substring == Py_None)
   7485         return split(self, NULL, maxcount);
   7486     else if (PyUnicode_Check(substring))
   7487         return split(self, (PyUnicodeObject *)substring, maxcount);
   7488     else
   7489         return PyUnicode_Split((PyObject *)self, substring, maxcount);
   7490 }
   7491 
   7492 PyObject *
   7493 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
   7494 {
   7495     PyObject* str_obj;
   7496     PyObject* sep_obj;
   7497     PyObject* out;
   7498 
   7499     str_obj = PyUnicode_FromObject(str_in);
   7500     if (!str_obj)
   7501         return NULL;
   7502     sep_obj = PyUnicode_FromObject(sep_in);
   7503     if (!sep_obj) {
   7504         Py_DECREF(str_obj);
   7505         return NULL;
   7506     }
   7507 
   7508     out = stringlib_partition(
   7509         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
   7510         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
   7511         );
   7512 
   7513     Py_DECREF(sep_obj);
   7514     Py_DECREF(str_obj);
   7515 
   7516     return out;
   7517 }
   7518 
   7519 
   7520 PyObject *
   7521 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
   7522 {
   7523     PyObject* str_obj;
   7524     PyObject* sep_obj;
   7525     PyObject* out;
   7526 
   7527     str_obj = PyUnicode_FromObject(str_in);
   7528     if (!str_obj)
   7529         return NULL;
   7530     sep_obj = PyUnicode_FromObject(sep_in);
   7531     if (!sep_obj) {
   7532         Py_DECREF(str_obj);
   7533         return NULL;
   7534     }
   7535 
   7536     out = stringlib_rpartition(
   7537         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
   7538         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
   7539         );
   7540 
   7541     Py_DECREF(sep_obj);
   7542     Py_DECREF(str_obj);
   7543 
   7544     return out;
   7545 }
   7546 
   7547 PyDoc_STRVAR(partition__doc__,
   7548              "S.partition(sep) -> (head, sep, tail)\n\
   7549 \n\
   7550 Search for the separator sep in S, and return the part before it,\n\
   7551 the separator itself, and the part after it.  If the separator is not\n\
   7552 found, return S and two empty strings.");
   7553 
   7554 static PyObject*
   7555 unicode_partition(PyUnicodeObject *self, PyObject *separator)
   7556 {
   7557     return PyUnicode_Partition((PyObject *)self, separator);
   7558 }
   7559 
   7560 PyDoc_STRVAR(rpartition__doc__,
   7561              "S.rpartition(sep) -> (head, sep, tail)\n\
   7562 \n\
   7563 Search for the separator sep in S, starting at the end of S, and return\n\
   7564 the part before it, the separator itself, and the part after it.  If the\n\
   7565 separator is not found, return two empty strings and S.");
   7566 
   7567 static PyObject*
   7568 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
   7569 {
   7570     return PyUnicode_RPartition((PyObject *)self, separator);
   7571 }
   7572 
   7573 PyObject *PyUnicode_RSplit(PyObject *s,
   7574                            PyObject *sep,
   7575                            Py_ssize_t maxsplit)
   7576 {
   7577     PyObject *result;
   7578 
   7579     s = PyUnicode_FromObject(s);
   7580     if (s == NULL)
   7581         return NULL;
   7582     if (sep != NULL) {
   7583         sep = PyUnicode_FromObject(sep);
   7584         if (sep == NULL) {
   7585             Py_DECREF(s);
   7586             return NULL;
   7587         }
   7588     }
   7589 
   7590     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
   7591 
   7592     Py_DECREF(s);
   7593     Py_XDECREF(sep);
   7594     return result;
   7595 }
   7596 
   7597 PyDoc_STRVAR(rsplit__doc__,
   7598              "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
   7599 \n\
   7600 Return a list of the words in S, using sep as the\n\
   7601 delimiter string, starting at the end of the string and\n\
   7602 working to the front.  If maxsplit is given, at most maxsplit\n\
   7603 splits are done. If sep is not specified, any whitespace string\n\
   7604 is a separator.");
   7605 
   7606 static PyObject*
   7607 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
   7608 {
   7609     PyObject *substring = Py_None;
   7610     Py_ssize_t maxcount = -1;
   7611 
   7612     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
   7613         return NULL;
   7614 
   7615     if (substring == Py_None)
   7616         return rsplit(self, NULL, maxcount);
   7617     else if (PyUnicode_Check(substring))
   7618         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
   7619     else
   7620         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
   7621 }
   7622 
   7623 PyDoc_STRVAR(splitlines__doc__,
   7624              "S.splitlines(keepends=False) -> list of strings\n\
   7625 \n\
   7626 Return a list of the lines in S, breaking at line boundaries.\n\
   7627 Line breaks are not included in the resulting list unless keepends\n\
   7628 is given and true.");
   7629 
   7630 static PyObject*
   7631 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
   7632 {
   7633     int keepends = 0;
   7634 
   7635     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
   7636         return NULL;
   7637 
   7638     return PyUnicode_Splitlines((PyObject *)self, keepends);
   7639 }
   7640 
   7641 static
   7642 PyObject *unicode_str(PyUnicodeObject *self)
   7643 {
   7644     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
   7645 }
   7646 
   7647 PyDoc_STRVAR(swapcase__doc__,
   7648              "S.swapcase() -> unicode\n\
   7649 \n\
   7650 Return a copy of S with uppercase characters converted to lowercase\n\
   7651 and vice versa.");
   7652 
   7653 static PyObject*
   7654 unicode_swapcase(PyUnicodeObject *self)
   7655 {
   7656     return fixup(self, fixswapcase);
   7657 }
   7658 
   7659 PyDoc_STRVAR(translate__doc__,
   7660              "S.translate(table) -> unicode\n\
   7661 \n\
   7662 Return a copy of the string S, where all characters have been mapped\n\
   7663 through the given translation table, which must be a mapping of\n\
   7664 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
   7665 Unmapped characters are left untouched. Characters mapped to None\n\
   7666 are deleted.");
   7667 
   7668 static PyObject*
   7669 unicode_translate(PyUnicodeObject *self, PyObject *table)
   7670 {
   7671     return PyUnicode_TranslateCharmap(self->str,
   7672                                       self->length,
   7673                                       table,
   7674                                       "ignore");
   7675 }
   7676 
   7677 PyDoc_STRVAR(upper__doc__,
   7678              "S.upper() -> unicode\n\
   7679 \n\
   7680 Return a copy of S converted to uppercase.");
   7681 
   7682 static PyObject*
   7683 unicode_upper(PyUnicodeObject *self)
   7684 {
   7685     return fixup(self, fixupper);
   7686 }
   7687 
   7688 PyDoc_STRVAR(zfill__doc__,
   7689              "S.zfill(width) -> unicode\n\
   7690 \n\
   7691 Pad a numeric string S with zeros on the left, to fill a field\n\
   7692 of the specified width. The string S is never truncated.");
   7693 
   7694 static PyObject *
   7695 unicode_zfill(PyUnicodeObject *self, PyObject *args)
   7696 {
   7697     Py_ssize_t fill;
   7698     PyUnicodeObject *u;
   7699 
   7700     Py_ssize_t width;
   7701     if (!PyArg_ParseTuple(args, "n:zfill", &width))
   7702         return NULL;
   7703 
   7704     if (self->length >= width) {
   7705         if (PyUnicode_CheckExact(self)) {
   7706             Py_INCREF(self);
   7707             return (PyObject*) self;
   7708         }
   7709         else
   7710             return PyUnicode_FromUnicode(
   7711                 PyUnicode_AS_UNICODE(self),
   7712                 PyUnicode_GET_SIZE(self)
   7713                 );
   7714     }
   7715 
   7716     fill = width - self->length;
   7717 
   7718     u = pad(self, fill, 0, '0');
   7719 
   7720     if (u == NULL)
   7721         return NULL;
   7722 
   7723     if (u->str[fill] == '+' || u->str[fill] == '-') {
   7724         /* move sign to beginning of string */
   7725         u->str[0] = u->str[fill];
   7726         u->str[fill] = '0';
   7727     }
   7728 
   7729     return (PyObject*) u;
   7730 }
   7731 
   7732 #if 0
   7733 static PyObject*
   7734 free_listsize(PyUnicodeObject *self)
   7735 {
   7736     return PyInt_FromLong(numfree);
   7737 }
   7738 #endif
   7739 
   7740 PyDoc_STRVAR(startswith__doc__,
   7741              "S.startswith(prefix[, start[, end]]) -> bool\n\
   7742 \n\
   7743 Return True if S starts with the specified prefix, False otherwise.\n\
   7744 With optional start, test S beginning at that position.\n\
   7745 With optional end, stop comparing S at that position.\n\
   7746 prefix can also be a tuple of strings to try.");
   7747 
   7748 static PyObject *
   7749 unicode_startswith(PyUnicodeObject *self,
   7750                    PyObject *args)
   7751 {
   7752     PyObject *subobj;
   7753     PyUnicodeObject *substring;
   7754     Py_ssize_t start = 0;
   7755     Py_ssize_t end = PY_SSIZE_T_MAX;
   7756     int result;
   7757 
   7758     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
   7759         return NULL;
   7760     if (PyTuple_Check(subobj)) {
   7761         Py_ssize_t i;
   7762         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
   7763             substring = (PyUnicodeObject *)PyUnicode_FromObject(
   7764                 PyTuple_GET_ITEM(subobj, i));
   7765             if (substring == NULL)
   7766                 return NULL;
   7767             result = tailmatch(self, substring, start, end, -1);
   7768             Py_DECREF(substring);
   7769             if (result) {
   7770                 Py_RETURN_TRUE;
   7771             }
   7772         }
   7773         /* nothing matched */
   7774         Py_RETURN_FALSE;
   7775     }
   7776     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
   7777     if (substring == NULL) {
   7778         if (PyErr_ExceptionMatches(PyExc_TypeError))
   7779             PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
   7780                          "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
   7781         return NULL;
   7782     }
   7783     result = tailmatch(self, substring, start, end, -1);
   7784     Py_DECREF(substring);
   7785     return PyBool_FromLong(result);
   7786 }
   7787 
   7788 
   7789 PyDoc_STRVAR(endswith__doc__,
   7790              "S.endswith(suffix[, start[, end]]) -> bool\n\
   7791 \n\
   7792 Return True if S ends with the specified suffix, False otherwise.\n\
   7793 With optional start, test S beginning at that position.\n\
   7794 With optional end, stop comparing S at that position.\n\
   7795 suffix can also be a tuple of strings to try.");
   7796 
   7797 static PyObject *
   7798 unicode_endswith(PyUnicodeObject *self,
   7799                  PyObject *args)
   7800 {
   7801     PyObject *subobj;
   7802     PyUnicodeObject *substring;
   7803     Py_ssize_t start = 0;
   7804     Py_ssize_t end = PY_SSIZE_T_MAX;
   7805     int result;
   7806 
   7807     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
   7808         return NULL;
   7809     if (PyTuple_Check(subobj)) {
   7810         Py_ssize_t i;
   7811         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
   7812             substring = (PyUnicodeObject *)PyUnicode_FromObject(
   7813                 PyTuple_GET_ITEM(subobj, i));
   7814             if (substring == NULL)
   7815                 return NULL;
   7816             result = tailmatch(self, substring, start, end, +1);
   7817             Py_DECREF(substring);
   7818             if (result) {
   7819                 Py_RETURN_TRUE;
   7820             }
   7821         }
   7822         Py_RETURN_FALSE;
   7823     }
   7824     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
   7825     if (substring == NULL) {
   7826         if (PyErr_ExceptionMatches(PyExc_TypeError))
   7827             PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
   7828                          "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
   7829         return NULL;
   7830     }
   7831     result = tailmatch(self, substring, start, end, +1);
   7832     Py_DECREF(substring);
   7833     return PyBool_FromLong(result);
   7834 }
   7835 
   7836 
   7837 /* Implements do_string_format, which is unicode because of stringlib */
   7838 #include "stringlib/string_format.h"
   7839 
   7840 PyDoc_STRVAR(format__doc__,
   7841              "S.format(*args, **kwargs) -> unicode\n\
   7842 \n\
   7843 Return a formatted version of S, using substitutions from args and kwargs.\n\
   7844 The substitutions are identified by braces ('{' and '}').");
   7845 
   7846 static PyObject *
   7847 unicode__format__(PyObject *self, PyObject *args)
   7848 {
   7849     PyObject *format_spec;
   7850     PyObject *result = NULL;
   7851     PyObject *tmp = NULL;
   7852 
   7853     /* If 2.x, convert format_spec to the same type as value */
   7854     /* This is to allow things like u''.format('') */
   7855     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
   7856         goto done;
   7857     if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
   7858         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
   7859                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
   7860         goto done;
   7861     }
   7862     tmp = PyObject_Unicode(format_spec);
   7863     if (tmp == NULL)
   7864         goto done;
   7865     format_spec = tmp;
   7866 
   7867     result = _PyUnicode_FormatAdvanced(self,
   7868                                        PyUnicode_AS_UNICODE(format_spec),
   7869                                        PyUnicode_GET_SIZE(format_spec));
   7870   done:
   7871     Py_XDECREF(tmp);
   7872     return result;
   7873 }
   7874 
   7875 PyDoc_STRVAR(p_format__doc__,
   7876              "S.__format__(format_spec) -> unicode\n\
   7877 \n\
   7878 Return a formatted version of S as described by format_spec.");
   7879 
   7880 static PyObject *
   7881 unicode__sizeof__(PyUnicodeObject *v)
   7882 {
   7883     return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
   7884                              sizeof(Py_UNICODE) * (v->length + 1));
   7885 }
   7886 
   7887 PyDoc_STRVAR(sizeof__doc__,
   7888              "S.__sizeof__() -> size of S in memory, in bytes\n\
   7889 \n\
   7890 ");
   7891 
   7892 static PyObject *
   7893 unicode_getnewargs(PyUnicodeObject *v)
   7894 {
   7895     return Py_BuildValue("(u#)", v->str, v->length);
   7896 }
   7897 
   7898 
   7899 static PyMethodDef unicode_methods[] = {
   7900     {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
   7901     {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
   7902     {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
   7903     {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
   7904     {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
   7905     {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
   7906     {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
   7907     {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
   7908     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
   7909     {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
   7910     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
   7911     {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
   7912     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
   7913     {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
   7914     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
   7915     {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
   7916     {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
   7917 /*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
   7918     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
   7919     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
   7920     {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
   7921     {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
   7922     {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
   7923     {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
   7924     {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
   7925     {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
   7926     {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
   7927     {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
   7928     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
   7929     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
   7930     {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
   7931     {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
   7932     {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
   7933     {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
   7934     {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
   7935     {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
   7936     {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
   7937     {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
   7938     {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
   7939     {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
   7940     {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
   7941     {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
   7942     {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
   7943     {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
   7944     {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
   7945 #if 0
   7946     {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
   7947 #endif
   7948 
   7949 #if 0
   7950     /* This one is just used for debugging the implementation. */
   7951     {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
   7952 #endif
   7953 
   7954     {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
   7955     {NULL, NULL}
   7956 };
   7957 
   7958 static PyObject *
   7959 unicode_mod(PyObject *v, PyObject *w)
   7960 {
   7961     if (!PyUnicode_Check(v)) {
   7962         Py_INCREF(Py_NotImplemented);
   7963         return Py_NotImplemented;
   7964     }
   7965     return PyUnicode_Format(v, w);
   7966 }
   7967 
   7968 static PyNumberMethods unicode_as_number = {
   7969     0,              /*nb_add*/
   7970     0,              /*nb_subtract*/
   7971     0,              /*nb_multiply*/
   7972     0,              /*nb_divide*/
   7973     unicode_mod,            /*nb_remainder*/
   7974 };
   7975 
   7976 static PySequenceMethods unicode_as_sequence = {
   7977     (lenfunc) unicode_length,       /* sq_length */
   7978     PyUnicode_Concat,           /* sq_concat */
   7979     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
   7980     (ssizeargfunc) unicode_getitem,     /* sq_item */
   7981     (ssizessizeargfunc) unicode_slice,  /* sq_slice */
   7982     0,                  /* sq_ass_item */
   7983     0,                  /* sq_ass_slice */
   7984     PyUnicode_Contains,         /* sq_contains */
   7985 };
   7986 
   7987 static PyObject*
   7988 unicode_subscript(PyUnicodeObject* self, PyObject* item)
   7989 {
   7990     if (PyIndex_Check(item)) {
   7991         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
   7992         if (i == -1 && PyErr_Occurred())
   7993             return NULL;
   7994         if (i < 0)
   7995             i += PyUnicode_GET_SIZE(self);
   7996         return unicode_getitem(self, i);
   7997     } else if (PySlice_Check(item)) {
   7998         Py_ssize_t start, stop, step, slicelength, cur, i;
   7999         Py_UNICODE* source_buf;
   8000         Py_UNICODE* result_buf;
   8001         PyObject* result;
   8002 
   8003         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
   8004                                  &start, &stop, &step, &slicelength) < 0) {
   8005             return NULL;
   8006         }
   8007 
   8008         if (slicelength <= 0) {
   8009             return PyUnicode_FromUnicode(NULL, 0);
   8010         } else if (start == 0 && step == 1 && slicelength == self->length &&
   8011                    PyUnicode_CheckExact(self)) {
   8012             Py_INCREF(self);
   8013             return (PyObject *)self;
   8014         } else if (step == 1) {
   8015             return PyUnicode_FromUnicode(self->str + start, slicelength);
   8016         } else {
   8017             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
   8018             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
   8019                                                        sizeof(Py_UNICODE));
   8020 
   8021             if (result_buf == NULL)
   8022                 return PyErr_NoMemory();
   8023 
   8024             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
   8025                 result_buf[i] = source_buf[cur];
   8026             }
   8027 
   8028             result = PyUnicode_FromUnicode(result_buf, slicelength);
   8029             PyObject_FREE(result_buf);
   8030             return result;
   8031         }
   8032     } else {
   8033         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
   8034         return NULL;
   8035     }
   8036 }
   8037 
   8038 static PyMappingMethods unicode_as_mapping = {
   8039     (lenfunc)unicode_length,        /* mp_length */
   8040     (binaryfunc)unicode_subscript,  /* mp_subscript */
   8041     (objobjargproc)0,           /* mp_ass_subscript */
   8042 };
   8043 
   8044 static Py_ssize_t
   8045 unicode_buffer_getreadbuf(PyUnicodeObject *self,
   8046                           Py_ssize_t index,
   8047                           const void **ptr)
   8048 {
   8049     if (index != 0) {
   8050         PyErr_SetString(PyExc_SystemError,
   8051                         "accessing non-existent unicode segment");
   8052         return -1;
   8053     }
   8054     *ptr = (void *) self->str;
   8055     return PyUnicode_GET_DATA_SIZE(self);
   8056 }
   8057 
   8058 static Py_ssize_t
   8059 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
   8060                            const void **ptr)
   8061 {
   8062     PyErr_SetString(PyExc_TypeError,
   8063                     "cannot use unicode as modifiable buffer");
   8064     return -1;
   8065 }
   8066 
   8067 static int
   8068 unicode_buffer_getsegcount(PyUnicodeObject *self,
   8069                            Py_ssize_t *lenp)
   8070 {
   8071     if (lenp)
   8072         *lenp = PyUnicode_GET_DATA_SIZE(self);
   8073     return 1;
   8074 }
   8075 
   8076 static Py_ssize_t
   8077 unicode_buffer_getcharbuf(PyUnicodeObject *self,
   8078                           Py_ssize_t index,
   8079                           const void **ptr)
   8080 {
   8081     PyObject *str;
   8082 
   8083     if (index != 0) {
   8084         PyErr_SetString(PyExc_SystemError,
   8085                         "accessing non-existent unicode segment");
   8086         return -1;
   8087     }
   8088     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
   8089     if (str == NULL)
   8090         return -1;
   8091     *ptr = (void *) PyString_AS_STRING(str);
   8092     return PyString_GET_SIZE(str);
   8093 }
   8094 
   8095 /* Helpers for PyUnicode_Format() */
   8096 
   8097 static PyObject *
   8098 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
   8099 {
   8100     Py_ssize_t argidx = *p_argidx;
   8101     if (argidx < arglen) {
   8102         (*p_argidx)++;
   8103         if (arglen < 0)
   8104             return args;
   8105         else
   8106             return PyTuple_GetItem(args, argidx);
   8107     }
   8108     PyErr_SetString(PyExc_TypeError,
   8109                     "not enough arguments for format string");
   8110     return NULL;
   8111 }
   8112 
   8113 #define F_LJUST (1<<0)
   8114 #define F_SIGN  (1<<1)
   8115 #define F_BLANK (1<<2)
   8116 #define F_ALT   (1<<3)
   8117 #define F_ZERO  (1<<4)
   8118 
   8119 static Py_ssize_t
   8120 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
   8121 {
   8122     register Py_ssize_t i;
   8123     Py_ssize_t len = strlen(charbuffer);
   8124     for (i = len - 1; i >= 0; i--)
   8125         buffer[i] = (Py_UNICODE) charbuffer[i];
   8126 
   8127     return len;
   8128 }
   8129 
   8130 static int
   8131 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
   8132 {
   8133     Py_ssize_t result;
   8134 
   8135     PyOS_snprintf((char *)buffer, len, format, x);
   8136     result = strtounicode(buffer, (char *)buffer);
   8137     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
   8138 }
   8139 
   8140 /* XXX To save some code duplication, formatfloat/long/int could have been
   8141    shared with stringobject.c, converting from 8-bit to Unicode after the
   8142    formatting is done. */
   8143 
   8144 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
   8145 
   8146 static PyObject *
   8147 formatfloat(PyObject *v, int flags, int prec, int type)
   8148 {
   8149     char *p;
   8150     PyObject *result;
   8151     double x;
   8152 
   8153     x = PyFloat_AsDouble(v);
   8154     if (x == -1.0 && PyErr_Occurred())
   8155         return NULL;
   8156 
   8157     if (prec < 0)
   8158         prec = 6;
   8159 
   8160     p = PyOS_double_to_string(x, type, prec,
   8161                               (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
   8162     if (p == NULL)
   8163         return NULL;
   8164     result = PyUnicode_FromStringAndSize(p, strlen(p));
   8165     PyMem_Free(p);
   8166     return result;
   8167 }
   8168 
   8169 static PyObject*
   8170 formatlong(PyObject *val, int flags, int prec, int type)
   8171 {
   8172     char *buf;
   8173     int i, len;
   8174     PyObject *str; /* temporary string object. */
   8175     PyUnicodeObject *result;
   8176 
   8177     str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
   8178     if (!str)
   8179         return NULL;
   8180     result = _PyUnicode_New(len);
   8181     if (!result) {
   8182         Py_DECREF(str);
   8183         return NULL;
   8184     }
   8185     for (i = 0; i < len; i++)
   8186         result->str[i] = buf[i];
   8187     result->str[len] = 0;
   8188     Py_DECREF(str);
   8189     return (PyObject*)result;
   8190 }
   8191 
   8192 static int
   8193 formatint(Py_UNICODE *buf,
   8194           size_t buflen,
   8195           int flags,
   8196           int prec,
   8197           int type,
   8198           PyObject *v)
   8199 {
   8200     /* fmt = '%#.' + `prec` + 'l' + `type`
   8201      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
   8202      *                     + 1 + 1
   8203      *                   = 24
   8204      */
   8205     char fmt[64]; /* plenty big enough! */
   8206     char *sign;
   8207     long x;
   8208 
   8209     x = PyInt_AsLong(v);
   8210     if (x == -1 && PyErr_Occurred())
   8211         return -1;
   8212     if (x < 0 && type == 'u') {
   8213         type = 'd';
   8214     }
   8215     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
   8216         sign = "-";
   8217     else
   8218         sign = "";
   8219     if (prec < 0)
   8220         prec = 1;
   8221 
   8222     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
   8223      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
   8224      */
   8225     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
   8226         PyErr_SetString(PyExc_OverflowError,
   8227                         "formatted integer is too long (precision too large?)");
   8228         return -1;
   8229     }
   8230 
   8231     if ((flags & F_ALT) &&
   8232         (type == 'x' || type == 'X')) {
   8233         /* When converting under %#x or %#X, there are a number
   8234          * of issues that cause pain:
   8235          * - when 0 is being converted, the C standard leaves off
   8236          *   the '0x' or '0X', which is inconsistent with other
   8237          *   %#x/%#X conversions and inconsistent with Python's
   8238          *   hex() function
   8239          * - there are platforms that violate the standard and
   8240          *   convert 0 with the '0x' or '0X'
   8241          *   (Metrowerks, Compaq Tru64)
   8242          * - there are platforms that give '0x' when converting
   8243          *   under %#X, but convert 0 in accordance with the
   8244          *   standard (OS/2 EMX)
   8245          *
   8246          * We can achieve the desired consistency by inserting our
   8247          * own '0x' or '0X' prefix, and substituting %x/%X in place
   8248          * of %#x/%#X.
   8249          *
   8250          * Note that this is the same approach as used in
   8251          * formatint() in stringobject.c
   8252          */
   8253         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
   8254                       sign, type, prec, type);
   8255     }
   8256     else {
   8257         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
   8258                       sign, (flags&F_ALT) ? "#" : "",
   8259                       prec, type);
   8260     }
   8261     if (sign[0])
   8262         return longtounicode(buf, buflen, fmt, -x);
   8263     else
   8264         return longtounicode(buf, buflen, fmt, x);
   8265 }
   8266 
   8267 static int
   8268 formatchar(Py_UNICODE *buf,
   8269            size_t buflen,
   8270            PyObject *v)
   8271 {
   8272     PyObject *unistr;
   8273     char *str;
   8274     /* presume that the buffer is at least 2 characters long */
   8275     if (PyUnicode_Check(v)) {
   8276         if (PyUnicode_GET_SIZE(v) != 1)
   8277             goto onError;
   8278         buf[0] = PyUnicode_AS_UNICODE(v)[0];
   8279     }
   8280 
   8281     else if (PyString_Check(v)) {
   8282         if (PyString_GET_SIZE(v) != 1)
   8283             goto onError;
   8284         /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
   8285            with a UnicodeDecodeError if 'char' is not decodable with the
   8286            default encoding (usually ASCII, but it might be something else) */
   8287         str = PyString_AS_STRING(v);
   8288         if ((unsigned char)str[0] > 0x7F) {
   8289             /* the char is not ASCII; try to decode the string using the
   8290                default encoding and return -1 to let the UnicodeDecodeError
   8291                be raised if the string can't be decoded */
   8292             unistr = PyUnicode_Decode(str, 1, NULL, "strict");
   8293             if (unistr == NULL)
   8294                 return -1;
   8295             buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
   8296             Py_DECREF(unistr);
   8297         }
   8298         else
   8299             buf[0] = (Py_UNICODE)str[0];
   8300     }
   8301 
   8302     else {
   8303         /* Integer input truncated to a character */
   8304         long x;
   8305         x = PyInt_AsLong(v);
   8306         if (x == -1 && PyErr_Occurred())
   8307             goto onError;
   8308 #ifdef Py_UNICODE_WIDE
   8309         if (x < 0 || x > 0x10ffff) {
   8310             PyErr_SetString(PyExc_OverflowError,
   8311                             "%c arg not in range(0x110000) "
   8312                             "(wide Python build)");
   8313             return -1;
   8314         }
   8315 #else
   8316         if (x < 0 || x > 0xffff) {
   8317             PyErr_SetString(PyExc_OverflowError,
   8318                             "%c arg not in range(0x10000) "
   8319                             "(narrow Python build)");
   8320             return -1;
   8321         }
   8322 #endif
   8323         buf[0] = (Py_UNICODE) x;
   8324     }
   8325     buf[1] = '\0';
   8326     return 1;
   8327 
   8328   onError:
   8329     PyErr_SetString(PyExc_TypeError,
   8330                     "%c requires int or char");
   8331     return -1;
   8332 }
   8333 
   8334 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   8335 
   8336    FORMATBUFLEN is the length of the buffer in which the ints &
   8337    chars are formatted. XXX This is a magic number. Each formatting
   8338    routine does bounds checking to ensure no overflow, but a better
   8339    solution may be to malloc a buffer of appropriate size for each
   8340    format. For now, the current solution is sufficient.
   8341 */
   8342 #define FORMATBUFLEN (size_t)120
   8343 
   8344 PyObject *PyUnicode_Format(PyObject *format,
   8345                            PyObject *args)
   8346 {
   8347     Py_UNICODE *fmt, *res;
   8348     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
   8349     int args_owned = 0;
   8350     PyUnicodeObject *result = NULL;
   8351     PyObject *dict = NULL;
   8352     PyObject *uformat;
   8353 
   8354     if (format == NULL || args == NULL) {
   8355         PyErr_BadInternalCall();
   8356         return NULL;
   8357     }
   8358     uformat = PyUnicode_FromObject(format);
   8359     if (uformat == NULL)
   8360         return NULL;
   8361     fmt = PyUnicode_AS_UNICODE(uformat);
   8362     fmtcnt = PyUnicode_GET_SIZE(uformat);
   8363 
   8364     reslen = rescnt = fmtcnt + 100;
   8365     result = _PyUnicode_New(reslen);
   8366     if (result == NULL)
   8367         goto onError;
   8368     res = PyUnicode_AS_UNICODE(result);
   8369 
   8370     if (PyTuple_Check(args)) {
   8371         arglen = PyTuple_Size(args);
   8372         argidx = 0;
   8373     }
   8374     else {
   8375         arglen = -1;
   8376         argidx = -2;
   8377     }
   8378     if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
   8379         !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
   8380         dict = args;
   8381 
   8382     while (--fmtcnt >= 0) {
   8383         if (*fmt != '%') {
   8384             if (--rescnt < 0) {
   8385                 rescnt = fmtcnt + 100;
   8386                 reslen += rescnt;
   8387                 if (_PyUnicode_Resize(&result, reslen) < 0)
   8388                     goto onError;
   8389                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
   8390                 --rescnt;
   8391             }
   8392             *res++ = *fmt++;
   8393         }
   8394         else {
   8395             /* Got a format specifier */
   8396             int flags = 0;
   8397             Py_ssize_t width = -1;
   8398             int prec = -1;
   8399             Py_UNICODE c = '\0';
   8400             Py_UNICODE fill;
   8401             int isnumok;
   8402             PyObject *v = NULL;
   8403             PyObject *temp = NULL;
   8404             Py_UNICODE *pbuf;
   8405             Py_UNICODE sign;
   8406             Py_ssize_t len;
   8407             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
   8408 
   8409             fmt++;
   8410             if (*fmt == '(') {
   8411                 Py_UNICODE *keystart;
   8412                 Py_ssize_t keylen;
   8413                 PyObject *key;
   8414                 int pcount = 1;
   8415 
   8416                 if (dict == NULL) {
   8417                     PyErr_SetString(PyExc_TypeError,
   8418                                     "format requires a mapping");
   8419                     goto onError;
   8420                 }
   8421                 ++fmt;
   8422                 --fmtcnt;
   8423                 keystart = fmt;
   8424                 /* Skip over balanced parentheses */
   8425                 while (pcount > 0 && --fmtcnt >= 0) {
   8426                     if (*fmt == ')')
   8427                         --pcount;
   8428                     else if (*fmt == '(')
   8429                         ++pcount;
   8430                     fmt++;
   8431                 }
   8432                 keylen = fmt - keystart - 1;
   8433                 if (fmtcnt < 0 || pcount > 0) {
   8434                     PyErr_SetString(PyExc_ValueError,
   8435                                     "incomplete format key");
   8436                     goto onError;
   8437                 }
   8438 #if 0
   8439                 /* keys are converted to strings using UTF-8 and
   8440                    then looked up since Python uses strings to hold
   8441                    variables names etc. in its namespaces and we
   8442                    wouldn't want to break common idioms. */
   8443                 key = PyUnicode_EncodeUTF8(keystart,
   8444                                            keylen,
   8445                                            NULL);
   8446 #else
   8447                 key = PyUnicode_FromUnicode(keystart, keylen);
   8448 #endif
   8449                 if (key == NULL)
   8450                     goto onError;
   8451                 if (args_owned) {
   8452                     Py_DECREF(args);
   8453                     args_owned = 0;
   8454                 }
   8455                 args = PyObject_GetItem(dict, key);
   8456                 Py_DECREF(key);
   8457                 if (args == NULL) {
   8458                     goto onError;
   8459                 }
   8460                 args_owned = 1;
   8461                 arglen = -1;
   8462                 argidx = -2;
   8463             }
   8464             while (--fmtcnt >= 0) {
   8465                 switch (c = *fmt++) {
   8466                 case '-': flags |= F_LJUST; continue;
   8467                 case '+': flags |= F_SIGN; continue;
   8468                 case ' ': flags |= F_BLANK; continue;
   8469                 case '#': flags |= F_ALT; continue;
   8470                 case '0': flags |= F_ZERO; continue;
   8471                 }
   8472                 break;
   8473             }
   8474             if (c == '*') {
   8475                 v = getnextarg(args, arglen, &argidx);
   8476                 if (v == NULL)
   8477                     goto onError;
   8478                 if (!PyInt_Check(v)) {
   8479                     PyErr_SetString(PyExc_TypeError,
   8480                                     "* wants int");
   8481                     goto onError;
   8482                 }
   8483                 width = PyInt_AsSsize_t(v);
   8484                 if (width == -1 && PyErr_Occurred())
   8485                     goto onError;
   8486                 if (width < 0) {
   8487                     flags |= F_LJUST;
   8488                     width = -width;
   8489                 }
   8490                 if (--fmtcnt >= 0)
   8491                     c = *fmt++;
   8492             }
   8493             else if (c >= '0' && c <= '9') {
   8494                 width = c - '0';
   8495                 while (--fmtcnt >= 0) {
   8496                     c = *fmt++;
   8497                     if (c < '0' || c > '9')
   8498                         break;
   8499                     if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
   8500                         PyErr_SetString(PyExc_ValueError,
   8501                                         "width too big");
   8502                         goto onError;
   8503                     }
   8504                     width = width*10 + (c - '0');
   8505                 }
   8506             }
   8507             if (c == '.') {
   8508                 prec = 0;
   8509                 if (--fmtcnt >= 0)
   8510                     c = *fmt++;
   8511                 if (c == '*') {
   8512                     v = getnextarg(args, arglen, &argidx);
   8513                     if (v == NULL)
   8514                         goto onError;
   8515                     if (!PyInt_Check(v)) {
   8516                         PyErr_SetString(PyExc_TypeError,
   8517                                         "* wants int");
   8518                         goto onError;
   8519                     }
   8520                     prec = _PyInt_AsInt(v);
   8521                     if (prec == -1 && PyErr_Occurred())
   8522                         goto onError;
   8523                     if (prec < 0)
   8524                         prec = 0;
   8525                     if (--fmtcnt >= 0)
   8526                         c = *fmt++;
   8527                 }
   8528                 else if (c >= '0' && c <= '9') {
   8529                     prec = c - '0';
   8530                     while (--fmtcnt >= 0) {
   8531                         c = *fmt++;
   8532                         if (c < '0' || c > '9')
   8533                             break;
   8534                         if (prec > (INT_MAX - ((int)c - '0')) / 10) {
   8535                             PyErr_SetString(PyExc_ValueError,
   8536                                             "prec too big");
   8537                             goto onError;
   8538                         }
   8539                         prec = prec*10 + (c - '0');
   8540                     }
   8541                 }
   8542             } /* prec */
   8543             if (fmtcnt >= 0) {
   8544                 if (c == 'h' || c == 'l' || c == 'L') {
   8545                     if (--fmtcnt >= 0)
   8546                         c = *fmt++;
   8547                 }
   8548             }
   8549             if (fmtcnt < 0) {
   8550                 PyErr_SetString(PyExc_ValueError,
   8551                                 "incomplete format");
   8552                 goto onError;
   8553             }
   8554             if (c != '%') {
   8555                 v = getnextarg(args, arglen, &argidx);
   8556                 if (v == NULL)
   8557                     goto onError;
   8558             }
   8559             sign = 0;
   8560             fill = ' ';
   8561             switch (c) {
   8562 
   8563             case '%':
   8564                 pbuf = formatbuf;
   8565                 /* presume that buffer length is at least 1 */
   8566                 pbuf[0] = '%';
   8567                 len = 1;
   8568                 break;
   8569 
   8570             case 's':
   8571             case 'r':
   8572                 if (PyUnicode_CheckExact(v) && c == 's') {
   8573                     temp = v;
   8574                     Py_INCREF(temp);
   8575                 }
   8576                 else {
   8577                     PyObject *unicode;
   8578                     if (c == 's')
   8579                         temp = PyObject_Unicode(v);
   8580                     else
   8581                         temp = PyObject_Repr(v);
   8582                     if (temp == NULL)
   8583                         goto onError;
   8584                     if (PyUnicode_Check(temp))
   8585                         /* nothing to do */;
   8586                     else if (PyString_Check(temp)) {
   8587                         /* convert to string to Unicode */
   8588                         unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
   8589                                                    PyString_GET_SIZE(temp),
   8590                                                    NULL,
   8591                                                    "strict");
   8592                         Py_DECREF(temp);
   8593                         temp = unicode;
   8594                         if (temp == NULL)
   8595                             goto onError;
   8596                     }
   8597                     else {
   8598                         Py_DECREF(temp);
   8599                         PyErr_SetString(PyExc_TypeError,
   8600                                         "%s argument has non-string str()");
   8601                         goto onError;
   8602                     }
   8603                 }
   8604                 pbuf = PyUnicode_AS_UNICODE(temp);
   8605                 len = PyUnicode_GET_SIZE(temp);
   8606                 if (prec >= 0 && len > prec)
   8607                     len = prec;
   8608                 break;
   8609 
   8610             case 'i':
   8611             case 'd':
   8612             case 'u':
   8613             case 'o':
   8614             case 'x':
   8615             case 'X':
   8616                 if (c == 'i')
   8617                     c = 'd';
   8618                 isnumok = 0;
   8619                 if (PyNumber_Check(v)) {
   8620                     PyObject *iobj=NULL;
   8621 
   8622                     if (PyInt_Check(v) || (PyLong_Check(v))) {
   8623                         iobj = v;
   8624                         Py_INCREF(iobj);
   8625                     }
   8626                     else {
   8627                         iobj = PyNumber_Int(v);
   8628                         if (iobj==NULL) iobj = PyNumber_Long(v);
   8629                     }
   8630                     if (iobj!=NULL) {
   8631                         if (PyInt_Check(iobj)) {
   8632                             isnumok = 1;
   8633                             pbuf = formatbuf;
   8634                             len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
   8635                                             flags, prec, c, iobj);
   8636                             Py_DECREF(iobj);
   8637                             if (len < 0)
   8638                                 goto onError;
   8639                             sign = 1;
   8640                         }
   8641                         else if (PyLong_Check(iobj)) {
   8642                             isnumok = 1;
   8643                             temp = formatlong(iobj, flags, prec, c);
   8644                             Py_DECREF(iobj);
   8645                             if (!temp)
   8646                                 goto onError;
   8647                             pbuf = PyUnicode_AS_UNICODE(temp);
   8648                             len = PyUnicode_GET_SIZE(temp);
   8649                             sign = 1;
   8650                         }
   8651                         else {
   8652                             Py_DECREF(iobj);
   8653                         }
   8654                     }
   8655                 }
   8656                 if (!isnumok) {
   8657                     PyErr_Format(PyExc_TypeError,
   8658                                  "%%%c format: a number is required, "
   8659                                  "not %.200s", (char)c, Py_TYPE(v)->tp_name);
   8660                     goto onError;
   8661                 }
   8662                 if (flags & F_ZERO)
   8663                     fill = '0';
   8664                 break;
   8665 
   8666             case 'e':
   8667             case 'E':
   8668             case 'f':
   8669             case 'F':
   8670             case 'g':
   8671             case 'G':
   8672                 temp = formatfloat(v, flags, prec, c);
   8673                 if (temp == NULL)
   8674                     goto onError;
   8675                 pbuf = PyUnicode_AS_UNICODE(temp);
   8676                 len = PyUnicode_GET_SIZE(temp);
   8677                 sign = 1;
   8678                 if (flags & F_ZERO)
   8679                     fill = '0';
   8680                 break;
   8681 
   8682             case 'c':
   8683                 pbuf = formatbuf;
   8684                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
   8685                 if (len < 0)
   8686                     goto onError;
   8687                 break;
   8688 
   8689             default:
   8690                 PyErr_Format(PyExc_ValueError,
   8691                              "unsupported format character '%c' (0x%x) "
   8692                              "at index %zd",
   8693                              (31<=c && c<=126) ? (char)c : '?',
   8694                              (int)c,
   8695                              (Py_ssize_t)(fmt - 1 -
   8696                                           PyUnicode_AS_UNICODE(uformat)));
   8697                 goto onError;
   8698             }
   8699             if (sign) {
   8700                 if (*pbuf == '-' || *pbuf == '+') {
   8701                     sign = *pbuf++;
   8702                     len--;
   8703                 }
   8704                 else if (flags & F_SIGN)
   8705                     sign = '+';
   8706                 else if (flags & F_BLANK)
   8707                     sign = ' ';
   8708                 else
   8709                     sign = 0;
   8710             }
   8711             if (width < len)
   8712                 width = len;
   8713             if (rescnt - (sign != 0) < width) {
   8714                 reslen -= rescnt;
   8715                 rescnt = width + fmtcnt + 100;
   8716                 reslen += rescnt;
   8717                 if (reslen < 0) {
   8718                     Py_XDECREF(temp);
   8719                     PyErr_NoMemory();
   8720                     goto onError;
   8721                 }
   8722                 if (_PyUnicode_Resize(&result, reslen) < 0) {
   8723                     Py_XDECREF(temp);
   8724                     goto onError;
   8725                 }
   8726                 res = PyUnicode_AS_UNICODE(result)
   8727                     + reslen - rescnt;
   8728             }
   8729             if (sign) {
   8730                 if (fill != ' ')
   8731                     *res++ = sign;
   8732                 rescnt--;
   8733                 if (width > len)
   8734                     width--;
   8735             }
   8736             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
   8737                 assert(pbuf[0] == '0');
   8738                 assert(pbuf[1] == c);
   8739                 if (fill != ' ') {
   8740                     *res++ = *pbuf++;
   8741                     *res++ = *pbuf++;
   8742                 }
   8743                 rescnt -= 2;
   8744                 width -= 2;
   8745                 if (width < 0)
   8746                     width = 0;
   8747                 len -= 2;
   8748             }
   8749             if (width > len && !(flags & F_LJUST)) {
   8750                 do {
   8751                     --rescnt;
   8752                     *res++ = fill;
   8753                 } while (--width > len);
   8754             }
   8755             if (fill == ' ') {
   8756                 if (sign)
   8757                     *res++ = sign;
   8758                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
   8759                     assert(pbuf[0] == '0');
   8760                     assert(pbuf[1] == c);
   8761                     *res++ = *pbuf++;
   8762                     *res++ = *pbuf++;
   8763                 }
   8764             }
   8765             Py_UNICODE_COPY(res, pbuf, len);
   8766             res += len;
   8767             rescnt -= len;
   8768             while (--width >= len) {
   8769                 --rescnt;
   8770                 *res++ = ' ';
   8771             }
   8772             if (dict && (argidx < arglen) && c != '%') {
   8773                 PyErr_SetString(PyExc_TypeError,
   8774                                 "not all arguments converted during string formatting");
   8775                 Py_XDECREF(temp);
   8776                 goto onError;
   8777             }
   8778             Py_XDECREF(temp);
   8779         } /* '%' */
   8780     } /* until end */
   8781     if (argidx < arglen && !dict) {
   8782         PyErr_SetString(PyExc_TypeError,
   8783                         "not all arguments converted during string formatting");
   8784         goto onError;
   8785     }
   8786 
   8787     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
   8788         goto onError;
   8789     if (args_owned) {
   8790         Py_DECREF(args);
   8791     }
   8792     Py_DECREF(uformat);
   8793     return (PyObject *)result;
   8794 
   8795   onError:
   8796     Py_XDECREF(result);
   8797     Py_DECREF(uformat);
   8798     if (args_owned) {
   8799         Py_DECREF(args);
   8800     }
   8801     return NULL;
   8802 }
   8803 
   8804 static PyBufferProcs unicode_as_buffer = {
   8805     (readbufferproc) unicode_buffer_getreadbuf,
   8806     (writebufferproc) unicode_buffer_getwritebuf,
   8807     (segcountproc) unicode_buffer_getsegcount,
   8808     (charbufferproc) unicode_buffer_getcharbuf,
   8809 };
   8810 
   8811 static PyObject *
   8812 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
   8813 
   8814 static PyObject *
   8815 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
   8816 {
   8817     PyObject *x = NULL;
   8818     static char *kwlist[] = {"string", "encoding", "errors", 0};
   8819     char *encoding = NULL;
   8820     char *errors = NULL;
   8821 
   8822     if (type != &PyUnicode_Type)
   8823         return unicode_subtype_new(type, args, kwds);
   8824     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
   8825                                      kwlist, &x, &encoding, &errors))
   8826         return NULL;
   8827     if (x == NULL)
   8828         return (PyObject *)_PyUnicode_New(0);
   8829     if (encoding == NULL && errors == NULL)
   8830         return PyObject_Unicode(x);
   8831     else
   8832         return PyUnicode_FromEncodedObject(x, encoding, errors);
   8833 }
   8834 
   8835 static PyObject *
   8836 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
   8837 {
   8838     PyUnicodeObject *tmp, *pnew;
   8839     Py_ssize_t n;
   8840 
   8841     assert(PyType_IsSubtype(type, &PyUnicode_Type));
   8842     tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
   8843     if (tmp == NULL)
   8844         return NULL;
   8845     assert(PyUnicode_Check(tmp));
   8846     pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
   8847     if (pnew == NULL) {
   8848         Py_DECREF(tmp);
   8849         return NULL;
   8850     }
   8851     pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
   8852     if (pnew->str == NULL) {
   8853         _Py_ForgetReference((PyObject *)pnew);
   8854         PyObject_Del(pnew);
   8855         Py_DECREF(tmp);
   8856         return PyErr_NoMemory();
   8857     }
   8858     Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
   8859     pnew->length = n;
   8860     pnew->hash = tmp->hash;
   8861     Py_DECREF(tmp);
   8862     return (PyObject *)pnew;
   8863 }
   8864 
   8865 PyDoc_STRVAR(unicode_doc,
   8866              "unicode(object='') -> unicode object\n\
   8867 unicode(string[, encoding[, errors]]) -> unicode object\n\
   8868 \n\
   8869 Create a new Unicode object from the given encoded string.\n\
   8870 encoding defaults to the current default string encoding.\n\
   8871 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
   8872 
   8873 PyTypeObject PyUnicode_Type = {
   8874     PyVarObject_HEAD_INIT(&PyType_Type, 0)
   8875     "unicode",              /* tp_name */
   8876     sizeof(PyUnicodeObject),        /* tp_size */
   8877     0,                  /* tp_itemsize */
   8878     /* Slots */
   8879     (destructor)unicode_dealloc,    /* tp_dealloc */
   8880     0,                  /* tp_print */
   8881     0,                  /* tp_getattr */
   8882     0,                  /* tp_setattr */
   8883     0,                  /* tp_compare */
   8884     unicode_repr,           /* tp_repr */
   8885     &unicode_as_number,         /* tp_as_number */
   8886     &unicode_as_sequence,       /* tp_as_sequence */
   8887     &unicode_as_mapping,        /* tp_as_mapping */
   8888     (hashfunc) unicode_hash,        /* tp_hash*/
   8889     0,                  /* tp_call*/
   8890     (reprfunc) unicode_str,     /* tp_str */
   8891     PyObject_GenericGetAttr,        /* tp_getattro */
   8892     0,                  /* tp_setattro */
   8893     &unicode_as_buffer,         /* tp_as_buffer */
   8894     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
   8895     Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
   8896     unicode_doc,            /* tp_doc */
   8897     0,                  /* tp_traverse */
   8898     0,                  /* tp_clear */
   8899     PyUnicode_RichCompare,      /* tp_richcompare */
   8900     0,                  /* tp_weaklistoffset */
   8901     0,                  /* tp_iter */
   8902     0,                  /* tp_iternext */
   8903     unicode_methods,            /* tp_methods */
   8904     0,                  /* tp_members */
   8905     0,                  /* tp_getset */
   8906     &PyBaseString_Type,         /* tp_base */
   8907     0,                  /* tp_dict */
   8908     0,                  /* tp_descr_get */
   8909     0,                  /* tp_descr_set */
   8910     0,                  /* tp_dictoffset */
   8911     0,                  /* tp_init */
   8912     0,                  /* tp_alloc */
   8913     unicode_new,            /* tp_new */
   8914     PyObject_Del,           /* tp_free */
   8915 };
   8916 
   8917 /* Initialize the Unicode implementation */
   8918 
   8919 void _PyUnicode_Init(void)
   8920 {
   8921     /* XXX - move this array to unicodectype.c ? */
   8922     Py_UNICODE linebreak[] = {
   8923         0x000A, /* LINE FEED */
   8924         0x000D, /* CARRIAGE RETURN */
   8925         0x001C, /* FILE SEPARATOR */
   8926         0x001D, /* GROUP SEPARATOR */
   8927         0x001E, /* RECORD SEPARATOR */
   8928         0x0085, /* NEXT LINE */
   8929         0x2028, /* LINE SEPARATOR */
   8930         0x2029, /* PARAGRAPH SEPARATOR */
   8931     };
   8932 
   8933     /* Init the implementation */
   8934     if (!unicode_empty) {
   8935         unicode_empty = _PyUnicode_New(0);
   8936         if (!unicode_empty)
   8937             return;
   8938     }
   8939 
   8940     if (PyType_Ready(&PyUnicode_Type) < 0)
   8941         Py_FatalError("Can't initialize 'unicode'");
   8942 
   8943     /* initialize the linebreak bloom filter */
   8944     bloom_linebreak = make_bloom_mask(
   8945         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
   8946         );
   8947 
   8948     PyType_Ready(&EncodingMapType);
   8949 
   8950     if (PyType_Ready(&PyFieldNameIter_Type) < 0)
   8951         Py_FatalError("Can't initialize field name iterator type");
   8952 
   8953     if (PyType_Ready(&PyFormatterIter_Type) < 0)
   8954         Py_FatalError("Can't initialize formatter iter type");
   8955 }
   8956 
   8957 /* Finalize the Unicode implementation */
   8958 
   8959 int
   8960 PyUnicode_ClearFreeList(void)
   8961 {
   8962     int freelist_size = numfree;
   8963     PyUnicodeObject *u;
   8964 
   8965     for (u = free_list; u != NULL;) {
   8966         PyUnicodeObject *v = u;
   8967         u = *(PyUnicodeObject **)u;
   8968         if (v->str)
   8969             PyObject_DEL(v->str);
   8970         Py_XDECREF(v->defenc);
   8971         PyObject_Del(v);
   8972         numfree--;
   8973     }
   8974     free_list = NULL;
   8975     assert(numfree == 0);
   8976     return freelist_size;
   8977 }
   8978 
   8979 void
   8980 _PyUnicode_Fini(void)
   8981 {
   8982     int i;
   8983 
   8984     Py_CLEAR(unicode_empty);
   8985 
   8986     for (i = 0; i < 256; i++)
   8987         Py_CLEAR(unicode_latin1[i]);
   8988 
   8989     (void)PyUnicode_ClearFreeList();
   8990 }
   8991 
   8992 #ifdef __cplusplus
   8993 }
   8994 #endif
   8995