Home | History | Annotate | Download | only in Objects
      1 /*
      2 
      3 Unicode implementation based on original code by Fredrik Lundh,
      4 modified by Marc-Andre Lemburg <mal (at) lemburg.com> according to the
      5 Unicode Integration Proposal (see file Misc/unicode.txt).
      6 
      7 Major speed upgrades to the method implementations at the Reykjavik
      8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
      9 
     10 Copyright (c) Corporation for National Research Initiatives.
     11 
     12 --------------------------------------------------------------------
     13 The original string type implementation is:
     14 
     15   Copyright (c) 1999 by Secret Labs AB
     16   Copyright (c) 1999 by Fredrik Lundh
     17 
     18 By obtaining, using, and/or copying this software and/or its
     19 associated documentation, you agree that you have read, understood,
     20 and will comply with the following terms and conditions:
     21 
     22 Permission to use, copy, modify, and distribute this software and its
     23 associated documentation for any purpose and without fee is hereby
     24 granted, provided that the above copyright notice appears in all
     25 copies, and that both that copyright notice and this permission notice
     26 appear in supporting documentation, and that the name of Secret Labs
     27 AB or the author not be used in advertising or publicity pertaining to
     28 distribution of the software without specific, written prior
     29 permission.
     30 
     31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
     32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
     33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
     34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
     37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     38 --------------------------------------------------------------------
     39 
     40 */
     41 
     42 #define PY_SSIZE_T_CLEAN
     43 #include "Python.h"
     44 
     45 #include "unicodeobject.h"
     46 #include "ucnhash.h"
     47 
     48 #ifdef MS_WINDOWS
     49 #include <windows.h>
     50 #endif
     51 
     52 /* Limit for the Unicode object free list */
     53 
     54 #define PyUnicode_MAXFREELIST       1024
     55 
     56 /* Limit for the Unicode object free list stay alive optimization.
     57 
     58    The implementation will keep allocated Unicode memory intact for
     59    all objects on the free list having a size less than this
     60    limit. This reduces malloc() overhead for small Unicode objects.
     61 
     62    At worst this will result in PyUnicode_MAXFREELIST *
     63    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
     64    malloc()-overhead) bytes of unused garbage.
     65 
     66    Setting the limit to 0 effectively turns the feature off.
     67 
     68    Note: This is an experimental feature ! If you get core dumps when
     69    using Unicode objects, turn this feature off.
     70 
     71 */
     72 
     73 #define KEEPALIVE_SIZE_LIMIT       9
     74 
     75 /* Endianness switches; defaults to little endian */
     76 
     77 #ifdef WORDS_BIGENDIAN
     78 # define BYTEORDER_IS_BIG_ENDIAN
     79 #else
     80 # define BYTEORDER_IS_LITTLE_ENDIAN
     81 #endif
     82 
     83 /* --- Globals ------------------------------------------------------------
     84 
     85    The globals are initialized by the _PyUnicode_Init() API and should
     86    not be used before calling that API.
     87 
     88 */
     89 
     90 
     91 #ifdef __cplusplus
     92 extern "C" {
     93 #endif
     94 
     95 /* Free list for Unicode objects */
     96 static PyUnicodeObject *free_list;
     97 static int numfree;
     98 
     99 /* The empty Unicode object is shared to improve performance. */
    100 static PyUnicodeObject *unicode_empty;
    101 
    102 /* Single character Unicode strings in the Latin-1 range are being
    103    shared as well. */
    104 static PyUnicodeObject *unicode_latin1[256];
    105 
    106 /* Default encoding to use and assume when NULL is passed as encoding
    107    parameter; it is initialized by _PyUnicode_Init().
    108 
    109    Always use the PyUnicode_SetDefaultEncoding() and
    110    PyUnicode_GetDefaultEncoding() APIs to access this global.
    111 
    112 */
    113 static char unicode_default_encoding[100];
    114 
    115 /* Fast detection of the most frequent whitespace characters */
    116 const unsigned char _Py_ascii_whitespace[] = {
    117     0, 0, 0, 0, 0, 0, 0, 0,
    118 /*     case 0x0009: * CHARACTER TABULATION */
    119 /*     case 0x000A: * LINE FEED */
    120 /*     case 0x000B: * LINE TABULATION */
    121 /*     case 0x000C: * FORM FEED */
    122 /*     case 0x000D: * CARRIAGE RETURN */
    123     0, 1, 1, 1, 1, 1, 0, 0,
    124     0, 0, 0, 0, 0, 0, 0, 0,
    125 /*     case 0x001C: * FILE SEPARATOR */
    126 /*     case 0x001D: * GROUP SEPARATOR */
    127 /*     case 0x001E: * RECORD SEPARATOR */
    128 /*     case 0x001F: * UNIT SEPARATOR */
    129     0, 0, 0, 0, 1, 1, 1, 1,
    130 /*     case 0x0020: * SPACE */
    131     1, 0, 0, 0, 0, 0, 0, 0,
    132     0, 0, 0, 0, 0, 0, 0, 0,
    133     0, 0, 0, 0, 0, 0, 0, 0,
    134     0, 0, 0, 0, 0, 0, 0, 0,
    135 
    136     0, 0, 0, 0, 0, 0, 0, 0,
    137     0, 0, 0, 0, 0, 0, 0, 0,
    138     0, 0, 0, 0, 0, 0, 0, 0,
    139     0, 0, 0, 0, 0, 0, 0, 0,
    140     0, 0, 0, 0, 0, 0, 0, 0,
    141     0, 0, 0, 0, 0, 0, 0, 0,
    142     0, 0, 0, 0, 0, 0, 0, 0,
    143     0, 0, 0, 0, 0, 0, 0, 0
    144 };
    145 
    146 /* Same for linebreaks */
    147 static unsigned char ascii_linebreak[] = {
    148     0, 0, 0, 0, 0, 0, 0, 0,
    149 /*         0x000A, * LINE FEED */
    150 /*         0x000B, * LINE TABULATION */
    151 /*         0x000C, * FORM FEED */
    152 /*         0x000D, * CARRIAGE RETURN */
    153     0, 0, 1, 1, 1, 1, 0, 0,
    154     0, 0, 0, 0, 0, 0, 0, 0,
    155 /*         0x001C, * FILE SEPARATOR */
    156 /*         0x001D, * GROUP SEPARATOR */
    157 /*         0x001E, * RECORD SEPARATOR */
    158     0, 0, 0, 0, 1, 1, 1, 0,
    159     0, 0, 0, 0, 0, 0, 0, 0,
    160     0, 0, 0, 0, 0, 0, 0, 0,
    161     0, 0, 0, 0, 0, 0, 0, 0,
    162     0, 0, 0, 0, 0, 0, 0, 0,
    163 
    164     0, 0, 0, 0, 0, 0, 0, 0,
    165     0, 0, 0, 0, 0, 0, 0, 0,
    166     0, 0, 0, 0, 0, 0, 0, 0,
    167     0, 0, 0, 0, 0, 0, 0, 0,
    168     0, 0, 0, 0, 0, 0, 0, 0,
    169     0, 0, 0, 0, 0, 0, 0, 0,
    170     0, 0, 0, 0, 0, 0, 0, 0,
    171     0, 0, 0, 0, 0, 0, 0, 0
    172 };
    173 
    174 
    175 Py_UNICODE
    176 PyUnicode_GetMax(void)
    177 {
    178 #ifdef Py_UNICODE_WIDE
    179     return 0x10FFFF;
    180 #else
    181     /* This is actually an illegal character, so it should
    182        not be passed to unichr. */
    183     return 0xFFFF;
    184 #endif
    185 }
    186 
    187 /* --- Bloom Filters ----------------------------------------------------- */
    188 
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each unicode character (as many as are needed to index
   BLOOM_WIDTH bits, i.e. 5, 6 or 7) as the bit index. */
    192 
    193 /* the linebreak mask is set up by Unicode_Init below */
    194 
    195 #if LONG_BIT >= 128
    196 #define BLOOM_WIDTH 128
    197 #elif LONG_BIT >= 64
    198 #define BLOOM_WIDTH 64
    199 #elif LONG_BIT >= 32
    200 #define BLOOM_WIDTH 32
    201 #else
    202 #error "LONG_BIT is smaller than 32"
    203 #endif
    204 
    205 #define BLOOM_MASK unsigned long
    206 
    207 static BLOOM_MASK bloom_linebreak;
    208 
    209 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
    210 #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
    211 
    212 #define BLOOM_LINEBREAK(ch)                                             \
    213     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
    214      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
    215 
    216 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
    217 {
    218     /* calculate simple bloom-style bitmask for a given unicode string */
    219 
    220     BLOOM_MASK mask;
    221     Py_ssize_t i;
    222 
    223     mask = 0;
    224     for (i = 0; i < len; i++)
    225         BLOOM_ADD(mask, ptr[i]);
    226 
    227     return mask;
    228 }
    229 
    230 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
    231 {
    232     Py_ssize_t i;
    233 
    234     for (i = 0; i < setlen; i++)
    235         if (set[i] == chr)
    236             return 1;
    237 
    238     return 0;
    239 }
    240 
    241 #define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    242     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
    243 
    244 /* --- Unicode Object ----------------------------------------------------- */
    245 
    246 static
    247 int unicode_resize(register PyUnicodeObject *unicode,
    248                    Py_ssize_t length)
    249 {
    250     void *oldstr;
    251 
    252     /* Shortcut if there's nothing much to do. */
    253     if (unicode->length == length)
    254         goto reset;
    255 
    256     /* Resizing shared object (unicode_empty or single character
    257        objects) in-place is not allowed. Use PyUnicode_Resize()
    258        instead ! */
    259 
    260     if (unicode == unicode_empty ||
    261         (unicode->length == 1 &&
    262          unicode->str[0] < 256U &&
    263          unicode_latin1[unicode->str[0]] == unicode)) {
    264         PyErr_SetString(PyExc_SystemError,
    265                         "can't resize shared unicode objects");
    266         return -1;
    267     }
    268 
    269     /* We allocate one more byte to make sure the string is Ux0000 terminated.
    270        The overallocation is also used by fastsearch, which assumes that it's
    271        safe to look at str[length] (without making any assumptions about what
    272        it contains). */
    273 
    274     oldstr = unicode->str;
    275     unicode->str = PyObject_REALLOC(unicode->str,
    276                                     sizeof(Py_UNICODE) * (length + 1));
    277     if (!unicode->str) {
    278         unicode->str = (Py_UNICODE *)oldstr;
    279         PyErr_NoMemory();
    280         return -1;
    281     }
    282     unicode->str[length] = 0;
    283     unicode->length = length;
    284 
    285   reset:
    286     /* Reset the object caches */
    287     if (unicode->defenc) {
    288         Py_CLEAR(unicode->defenc);
    289     }
    290     unicode->hash = -1;
    291 
    292     return 0;
    293 }
    294 
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code relies on that.
    297 
    298    XXX This allocator could further be enhanced by assuring that the
    299    free list never reduces its size below 1.
    300 
    301 */
    302 
    303 static
    304 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
    305 {
    306     register PyUnicodeObject *unicode;
    307 
    308     /* Optimization for empty strings */
    309     if (length == 0 && unicode_empty != NULL) {
    310         Py_INCREF(unicode_empty);
    311         return unicode_empty;
    312     }
    313 
    314     /* Ensure we won't overflow the size. */
    315     if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
    316         return (PyUnicodeObject *)PyErr_NoMemory();
    317     }
    318 
    319     /* Unicode freelist & memory allocation */
    320     if (free_list) {
    321         unicode = free_list;
    322         free_list = *(PyUnicodeObject **)unicode;
    323         numfree--;
    324         if (unicode->str) {
    325             /* Keep-Alive optimization: we only upsize the buffer,
    326                never downsize it. */
    327             if ((unicode->length < length) &&
    328                 unicode_resize(unicode, length) < 0) {
    329                 PyObject_DEL(unicode->str);
    330                 unicode->str = NULL;
    331             }
    332         }
    333         else {
    334             size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
    335             unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
    336         }
    337         PyObject_INIT(unicode, &PyUnicode_Type);
    338     }
    339     else {
    340         size_t new_size;
    341         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
    342         if (unicode == NULL)
    343             return NULL;
    344         new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
    345         unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
    346     }
    347 
    348     if (!unicode->str) {
    349         PyErr_NoMemory();
    350         goto onError;
    351     }
    352     /* Initialize the first element to guard against cases where
    353      * the caller fails before initializing str -- unicode_resize()
    354      * reads str[0], and the Keep-Alive optimization can keep memory
    355      * allocated for str alive across a call to unicode_dealloc(unicode).
    356      * We don't want unicode_resize to read uninitialized memory in
    357      * that case.
    358      */
    359     unicode->str[0] = 0;
    360     unicode->str[length] = 0;
    361     unicode->length = length;
    362     unicode->hash = -1;
    363     unicode->defenc = NULL;
    364     return unicode;
    365 
    366   onError:
    367     /* XXX UNREF/NEWREF interface should be more symmetrical */
    368     _Py_DEC_REFTOTAL;
    369     _Py_ForgetReference((PyObject *)unicode);
    370     PyObject_Del(unicode);
    371     return NULL;
    372 }
    373 
    374 static
    375 void unicode_dealloc(register PyUnicodeObject *unicode)
    376 {
    377     if (PyUnicode_CheckExact(unicode) &&
    378         numfree < PyUnicode_MAXFREELIST) {
    379         /* Keep-Alive optimization */
    380         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
    381             PyObject_DEL(unicode->str);
    382             unicode->str = NULL;
    383             unicode->length = 0;
    384         }
    385         if (unicode->defenc) {
    386             Py_CLEAR(unicode->defenc);
    387         }
    388         /* Add to free list */
    389         *(PyUnicodeObject **)unicode = free_list;
    390         free_list = unicode;
    391         numfree++;
    392     }
    393     else {
    394         PyObject_DEL(unicode->str);
    395         Py_XDECREF(unicode->defenc);
    396         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
    397     }
    398 }
    399 
    400 static
    401 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
    402 {
    403     register PyUnicodeObject *v;
    404 
    405     /* Argument checks */
    406     if (unicode == NULL) {
    407         PyErr_BadInternalCall();
    408         return -1;
    409     }
    410     v = *unicode;
    411     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
    412         PyErr_BadInternalCall();
    413         return -1;
    414     }
    415 
    416     /* Resizing unicode_empty and single character objects is not
    417        possible since these are being shared. We simply return a fresh
    418        copy with the same Unicode content. */
    419     if (v->length != length &&
    420         (v == unicode_empty || v->length == 1)) {
    421         PyUnicodeObject *w = _PyUnicode_New(length);
    422         if (w == NULL)
    423             return -1;
    424         Py_UNICODE_COPY(w->str, v->str,
    425                         length < v->length ? length : v->length);
    426         Py_DECREF(*unicode);
    427         *unicode = w;
    428         return 0;
    429     }
    430 
    431     /* Note that we don't have to modify *unicode for unshared Unicode
    432        objects, since we can modify them in-place. */
    433     return unicode_resize(v, length);
    434 }
    435 
    436 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
    437 {
    438     return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
    439 }
    440 
    441 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
    442                                 Py_ssize_t size)
    443 {
    444     PyUnicodeObject *unicode;
    445 
    446     /* If the Unicode data is known at construction time, we can apply
    447        some optimizations which share commonly used objects. */
    448     if (u != NULL) {
    449 
    450         /* Optimization for empty strings */
    451         if (size == 0 && unicode_empty != NULL) {
    452             Py_INCREF(unicode_empty);
    453             return (PyObject *)unicode_empty;
    454         }
    455 
    456         /* Single character Unicode objects in the Latin-1 range are
    457            shared when using this constructor */
    458         if (size == 1 && *u < 256) {
    459             unicode = unicode_latin1[*u];
    460             if (!unicode) {
    461                 unicode = _PyUnicode_New(1);
    462                 if (!unicode)
    463                     return NULL;
    464                 unicode->str[0] = *u;
    465                 unicode_latin1[*u] = unicode;
    466             }
    467             Py_INCREF(unicode);
    468             return (PyObject *)unicode;
    469         }
    470     }
    471 
    472     unicode = _PyUnicode_New(size);
    473     if (!unicode)
    474         return NULL;
    475 
    476     /* Copy the Unicode data into the new object */
    477     if (u != NULL)
    478         Py_UNICODE_COPY(unicode->str, u, size);
    479 
    480     return (PyObject *)unicode;
    481 }
    482 
    483 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
    484 {
    485     PyUnicodeObject *unicode;
    486 
    487     if (size < 0) {
    488         PyErr_SetString(PyExc_SystemError,
    489                         "Negative size passed to PyUnicode_FromStringAndSize");
    490         return NULL;
    491     }
    492 
    493     /* If the Unicode data is known at construction time, we can apply
    494        some optimizations which share commonly used objects.
    495        Also, this means the input must be UTF-8, so fall back to the
    496        UTF-8 decoder at the end. */
    497     if (u != NULL) {
    498 
    499         /* Optimization for empty strings */
    500         if (size == 0 && unicode_empty != NULL) {
    501             Py_INCREF(unicode_empty);
    502             return (PyObject *)unicode_empty;
    503         }
    504 
    505         /* Single characters are shared when using this constructor.
    506            Restrict to ASCII, since the input must be UTF-8. */
    507         if (size == 1 && Py_CHARMASK(*u) < 128) {
    508             unicode = unicode_latin1[Py_CHARMASK(*u)];
    509             if (!unicode) {
    510                 unicode = _PyUnicode_New(1);
    511                 if (!unicode)
    512                     return NULL;
    513                 unicode->str[0] = Py_CHARMASK(*u);
    514                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
    515             }
    516             Py_INCREF(unicode);
    517             return (PyObject *)unicode;
    518         }
    519 
    520         return PyUnicode_DecodeUTF8(u, size, NULL);
    521     }
    522 
    523     unicode = _PyUnicode_New(size);
    524     if (!unicode)
    525         return NULL;
    526 
    527     return (PyObject *)unicode;
    528 }
    529 
    530 PyObject *PyUnicode_FromString(const char *u)
    531 {
    532     size_t size = strlen(u);
    533     if (size > PY_SSIZE_T_MAX) {
    534         PyErr_SetString(PyExc_OverflowError, "input too long");
    535         return NULL;
    536     }
    537 
    538     return PyUnicode_FromStringAndSize(u, size);
    539 }
    540 
    541 #ifdef HAVE_WCHAR_H
    542 
    543 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
    544 # define CONVERT_WCHAR_TO_SURROGATES
    545 #endif
    546 
    547 #ifdef CONVERT_WCHAR_TO_SURROGATES
    548 
    549 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
    550    to convert from UTF32 to UTF16. */
    551 
    552 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
    553                                  Py_ssize_t size)
    554 {
    555     PyUnicodeObject *unicode;
    556     register Py_ssize_t i;
    557     Py_ssize_t alloc;
    558     const wchar_t *orig_w;
    559 
    560     if (w == NULL) {
    561         PyErr_BadInternalCall();
    562         return NULL;
    563     }
    564 
    565     alloc = size;
    566     orig_w = w;
    567     for (i = size; i > 0; i--) {
    568         if (*w > 0xFFFF)
    569             alloc++;
    570         w++;
    571     }
    572     w = orig_w;
    573     unicode = _PyUnicode_New(alloc);
    574     if (!unicode)
    575         return NULL;
    576 
    577     /* Copy the wchar_t data into the new object */
    578     {
    579         register Py_UNICODE *u;
    580         u = PyUnicode_AS_UNICODE(unicode);
    581         for (i = size; i > 0; i--) {
    582             if (*w > 0xFFFF) {
    583                 wchar_t ordinal = *w++;
    584                 ordinal -= 0x10000;
    585                 *u++ = 0xD800 | (ordinal >> 10);
    586                 *u++ = 0xDC00 | (ordinal & 0x3FF);
    587             }
    588             else
    589                 *u++ = *w++;
    590         }
    591     }
    592     return (PyObject *)unicode;
    593 }
    594 
    595 #else
    596 
    597 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
    598                                  Py_ssize_t size)
    599 {
    600     PyUnicodeObject *unicode;
    601 
    602     if (w == NULL) {
    603         PyErr_BadInternalCall();
    604         return NULL;
    605     }
    606 
    607     unicode = _PyUnicode_New(size);
    608     if (!unicode)
    609         return NULL;
    610 
    611     /* Copy the wchar_t data into the new object */
    612 #ifdef HAVE_USABLE_WCHAR_T
    613     memcpy(unicode->str, w, size * sizeof(wchar_t));
    614 #else
    615     {
    616         register Py_UNICODE *u;
    617         register Py_ssize_t i;
    618         u = PyUnicode_AS_UNICODE(unicode);
    619         for (i = size; i > 0; i--)
    620             *u++ = *w++;
    621     }
    622 #endif
    623 
    624     return (PyObject *)unicode;
    625 }
    626 
    627 #endif /* CONVERT_WCHAR_TO_SURROGATES */
    628 
    629 #undef CONVERT_WCHAR_TO_SURROGATES
    630 
    631 static void
    632 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
    633 {
    634     *fmt++ = '%';
    635     if (width) {
    636         if (zeropad)
    637             *fmt++ = '0';
    638         fmt += sprintf(fmt, "%d", width);
    639     }
    640     if (precision)
    641         fmt += sprintf(fmt, ".%d", precision);
    642     if (longflag)
    643         *fmt++ = 'l';
    644     else if (size_tflag) {
    645         char *f = PY_FORMAT_SIZE_T;
    646         while (*f)
    647             *fmt++ = *f++;
    648     }
    649     *fmt++ = c;
    650     *fmt = '\0';
    651 }
    652 
    653 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
    654 
    655 PyObject *
    656 PyUnicode_FromFormatV(const char *format, va_list vargs)
    657 {
    658     va_list count;
    659     Py_ssize_t callcount = 0;
    660     PyObject **callresults = NULL;
    661     PyObject **callresult = NULL;
    662     Py_ssize_t n = 0;
    663     int width = 0;
    664     int precision = 0;
    665     int zeropad;
    666     const char* f;
    667     Py_UNICODE *s;
    668     PyObject *string;
    669     /* used by sprintf */
    670     char buffer[21];
    671     /* use abuffer instead of buffer, if we need more space
    672      * (which can happen if there's a format specifier with width). */
    673     char *abuffer = NULL;
    674     char *realbuffer;
    675     Py_ssize_t abuffersize = 0;
    676     char fmt[60]; /* should be enough for %0width.precisionld */
    677     const char *copy;
    678 
    679 #ifdef VA_LIST_IS_ARRAY
    680     Py_MEMCPY(count, vargs, sizeof(va_list));
    681 #else
    682 #ifdef  __va_copy
    683     __va_copy(count, vargs);
    684 #else
    685     count = vargs;
    686 #endif
    687 #endif
    688      /* step 1: count the number of %S/%R/%s format specifications
    689       * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
    690       * objects once during step 3 and put the result in an array) */
    691     for (f = format; *f; f++) {
    692          if (*f == '%') {
    693              if (*(f+1)=='%')
    694                  continue;
    695              if (*(f+1)=='S' || *(f+1)=='R')
    696                  ++callcount;
    697              while (isdigit((unsigned)*f))
    698                  width = (width*10) + *f++ - '0';
    699              while (*++f && *f != '%' && !isalpha((unsigned)*f))
    700                  ;
    701              if (*f == 's')
    702                  ++callcount;
    703          }
    704     }
    705     /* step 2: allocate memory for the results of
    706      * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
    707     if (callcount) {
    708         callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
    709         if (!callresults) {
    710             PyErr_NoMemory();
    711             return NULL;
    712         }
    713         callresult = callresults;
    714     }
    715     /* step 3: figure out how large a buffer we need */
    716     for (f = format; *f; f++) {
    717         if (*f == '%') {
    718             const char* p = f;
    719             width = 0;
    720             while (isdigit((unsigned)*f))
    721                 width = (width*10) + *f++ - '0';
    722             while (*++f && *f != '%' && !isalpha((unsigned)*f))
    723                 ;
    724 
    725             /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
    726              * they don't affect the amount of space we reserve.
    727              */
    728             if ((*f == 'l' || *f == 'z') &&
    729                 (f[1] == 'd' || f[1] == 'u'))
    730                 ++f;
    731 
    732             switch (*f) {
    733             case 'c':
    734                 (void)va_arg(count, int);
    735                 /* fall through... */
    736             case '%':
    737                 n++;
    738                 break;
    739             case 'd': case 'u': case 'i': case 'x':
    740                 (void) va_arg(count, int);
    741                 /* 20 bytes is enough to hold a 64-bit
    742                    integer.  Decimal takes the most space.
    743                    This isn't enough for octal.
    744                    If a width is specified we need more
    745                    (which we allocate later). */
    746                 if (width < 20)
    747                     width = 20;
    748                 n += width;
    749                 if (abuffersize < width)
    750                     abuffersize = width;
    751                 break;
    752             case 's':
    753             {
    754                 /* UTF-8 */
    755                 const char *s = va_arg(count, const char*);
    756                 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
    757                 if (!str)
    758                     goto fail;
    759                 n += PyUnicode_GET_SIZE(str);
    760                 /* Remember the str and switch to the next slot */
    761                 *callresult++ = str;
    762                 break;
    763             }
    764             case 'U':
    765             {
    766                 PyObject *obj = va_arg(count, PyObject *);
    767                 assert(obj && PyUnicode_Check(obj));
    768                 n += PyUnicode_GET_SIZE(obj);
    769                 break;
    770             }
    771             case 'V':
    772             {
    773                 PyObject *obj = va_arg(count, PyObject *);
    774                 const char *str = va_arg(count, const char *);
    775                 assert(obj || str);
    776                 assert(!obj || PyUnicode_Check(obj));
    777                 if (obj)
    778                     n += PyUnicode_GET_SIZE(obj);
    779                 else
    780                     n += strlen(str);
    781                 break;
    782             }
    783             case 'S':
    784             {
    785                 PyObject *obj = va_arg(count, PyObject *);
    786                 PyObject *str;
    787                 assert(obj);
    788                 str = PyObject_Str(obj);
    789                 if (!str)
    790                     goto fail;
    791                 n += PyUnicode_GET_SIZE(str);
    792                 /* Remember the str and switch to the next slot */
    793                 *callresult++ = str;
    794                 break;
    795             }
    796             case 'R':
    797             {
    798                 PyObject *obj = va_arg(count, PyObject *);
    799                 PyObject *repr;
    800                 assert(obj);
    801                 repr = PyObject_Repr(obj);
    802                 if (!repr)
    803                     goto fail;
    804                 n += PyUnicode_GET_SIZE(repr);
    805                 /* Remember the repr and switch to the next slot */
    806                 *callresult++ = repr;
    807                 break;
    808             }
    809             case 'p':
    810                 (void) va_arg(count, int);
    811                 /* maximum 64-bit pointer representation:
    812                  * 0xffffffffffffffff
    813                  * so 19 characters is enough.
    814                  * XXX I count 18 -- what's the extra for?
    815                  */
    816                 n += 19;
    817                 break;
    818             default:
    819                 /* if we stumble upon an unknown
    820                    formatting code, copy the rest of
    821                    the format string to the output
    822                    string. (we cannot just skip the
    823                    code, since there's no way to know
    824                    what's in the argument list) */
    825                 n += strlen(p);
    826                 goto expand;
    827             }
    828         } else
    829             n++;
    830     }
    831   expand:
    832     if (abuffersize > 20) {
    833         abuffer = PyObject_Malloc(abuffersize);
    834         if (!abuffer) {
    835             PyErr_NoMemory();
    836             goto fail;
    837         }
    838         realbuffer = abuffer;
    839     }
    840     else
    841         realbuffer = buffer;
    842     /* step 4: fill the buffer */
    843     /* Since we've analyzed how much space we need for the worst case,
    844        we don't have to resize the string.
    845        There can be no errors beyond this point. */
    846     string = PyUnicode_FromUnicode(NULL, n);
    847     if (!string)
    848         goto fail;
    849 
    850     s = PyUnicode_AS_UNICODE(string);
    851     callresult = callresults;
    852 
    853     for (f = format; *f; f++) {
    854         if (*f == '%') {
    855             const char* p = f++;
    856             int longflag = 0;
    857             int size_tflag = 0;
    858             zeropad = (*f == '0');
    859             /* parse the width.precision part */
    860             width = 0;
    861             while (isdigit((unsigned)*f))
    862                 width = (width*10) + *f++ - '0';
    863             precision = 0;
    864             if (*f == '.') {
    865                 f++;
    866                 while (isdigit((unsigned)*f))
    867                     precision = (precision*10) + *f++ - '0';
    868             }
    869             /* handle the long flag, but only for %ld and %lu.
    870                others can be added when necessary. */
    871             if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
    872                 longflag = 1;
    873                 ++f;
    874             }
    875             /* handle the size_t flag. */
    876             if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
    877                 size_tflag = 1;
    878                 ++f;
    879             }
    880 
    881             switch (*f) {
    882             case 'c':
    883                 *s++ = va_arg(vargs, int);
    884                 break;
    885             case 'd':
    886                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
    887                 if (longflag)
    888                     sprintf(realbuffer, fmt, va_arg(vargs, long));
    889                 else if (size_tflag)
    890                     sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
    891                 else
    892                     sprintf(realbuffer, fmt, va_arg(vargs, int));
    893                 appendstring(realbuffer);
    894                 break;
    895             case 'u':
    896                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
    897                 if (longflag)
    898                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
    899                 else if (size_tflag)
    900                     sprintf(realbuffer, fmt, va_arg(vargs, size_t));
    901                 else
    902                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
    903                 appendstring(realbuffer);
    904                 break;
    905             case 'i':
    906                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
    907                 sprintf(realbuffer, fmt, va_arg(vargs, int));
    908                 appendstring(realbuffer);
    909                 break;
    910             case 'x':
    911                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
    912                 sprintf(realbuffer, fmt, va_arg(vargs, int));
    913                 appendstring(realbuffer);
    914                 break;
    915             case 's':
    916             {
    917                 /* unused, since we already have the result */
    918                 (void) va_arg(vargs, char *);
    919                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
    920                                 PyUnicode_GET_SIZE(*callresult));
    921                 s += PyUnicode_GET_SIZE(*callresult);
    922                 /* We're done with the unicode()/repr() => forget it */
    923                 Py_DECREF(*callresult);
    924                 /* switch to next unicode()/repr() result */
    925                 ++callresult;
    926                 break;
    927             }
    928             case 'U':
    929             {
    930                 PyObject *obj = va_arg(vargs, PyObject *);
    931                 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
    932                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
    933                 s += size;
    934                 break;
    935             }
    936             case 'V':
    937             {
    938                 PyObject *obj = va_arg(vargs, PyObject *);
    939                 const char *str = va_arg(vargs, const char *);
    940                 if (obj) {
    941                     Py_ssize_t size = PyUnicode_GET_SIZE(obj);
    942                     Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
    943                     s += size;
    944                 } else {
    945                     appendstring(str);
    946                 }
    947                 break;
    948             }
    949             case 'S':
    950             case 'R':
    951             {
    952                 Py_UNICODE *ucopy;
    953                 Py_ssize_t usize;
    954                 Py_ssize_t upos;
    955                 /* unused, since we already have the result */
    956                 (void) va_arg(vargs, PyObject *);
    957                 ucopy = PyUnicode_AS_UNICODE(*callresult);
    958                 usize = PyUnicode_GET_SIZE(*callresult);
    959                 for (upos = 0; upos<usize;)
    960                     *s++ = ucopy[upos++];
    961                 /* We're done with the unicode()/repr() => forget it */
    962                 Py_DECREF(*callresult);
    963                 /* switch to next unicode()/repr() result */
    964                 ++callresult;
    965                 break;
    966             }
    967             case 'p':
    968                 sprintf(buffer, "%p", va_arg(vargs, void*));
    969                 /* %p is ill-defined:  ensure leading 0x. */
    970                 if (buffer[1] == 'X')
    971                     buffer[1] = 'x';
    972                 else if (buffer[1] != 'x') {
    973                     memmove(buffer+2, buffer, strlen(buffer)+1);
    974                     buffer[0] = '0';
    975                     buffer[1] = 'x';
    976                 }
    977                 appendstring(buffer);
    978                 break;
    979             case '%':
    980                 *s++ = '%';
    981                 break;
    982             default:
    983                 appendstring(p);
    984                 goto end;
    985             }
    986         } else
    987             *s++ = *f;
    988     }
    989 
    990   end:
    991     if (callresults)
    992         PyObject_Free(callresults);
    993     if (abuffer)
    994         PyObject_Free(abuffer);
    995     PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
    996     return string;
    997   fail:
    998     if (callresults) {
    999         PyObject **callresult2 = callresults;
   1000         while (callresult2 < callresult) {
   1001             Py_DECREF(*callresult2);
   1002             ++callresult2;
   1003         }
   1004         PyObject_Free(callresults);
   1005     }
   1006     if (abuffer)
   1007         PyObject_Free(abuffer);
   1008     return NULL;
   1009 }
   1010 
   1011 #undef appendstring
   1012 
   1013 PyObject *
   1014 PyUnicode_FromFormat(const char *format, ...)
   1015 {
   1016     PyObject* ret;
   1017     va_list vargs;
   1018 
   1019 #ifdef HAVE_STDARG_PROTOTYPES
   1020     va_start(vargs, format);
   1021 #else
   1022     va_start(vargs);
   1023 #endif
   1024     ret = PyUnicode_FromFormatV(format, vargs);
   1025     va_end(vargs);
   1026     return ret;
   1027 }
   1028 
   1029 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
   1030                                 wchar_t *w,
   1031                                 Py_ssize_t size)
   1032 {
   1033     if (unicode == NULL) {
   1034         PyErr_BadInternalCall();
   1035         return -1;
   1036     }
   1037 
   1038     /* If possible, try to copy the 0-termination as well */
   1039     if (size > PyUnicode_GET_SIZE(unicode))
   1040         size = PyUnicode_GET_SIZE(unicode) + 1;
   1041 
   1042 #ifdef HAVE_USABLE_WCHAR_T
   1043     memcpy(w, unicode->str, size * sizeof(wchar_t));
   1044 #else
   1045     {
   1046         register Py_UNICODE *u;
   1047         register Py_ssize_t i;
   1048         u = PyUnicode_AS_UNICODE(unicode);
   1049         for (i = size; i > 0; i--)
   1050             *w++ = *u++;
   1051     }
   1052 #endif
   1053 
   1054     if (size > PyUnicode_GET_SIZE(unicode))
   1055         return PyUnicode_GET_SIZE(unicode);
   1056     else
   1057         return size;
   1058 }
   1059 
   1060 #endif
   1061 
   1062 PyObject *PyUnicode_FromOrdinal(int ordinal)
   1063 {
   1064     Py_UNICODE s[1];
   1065 
   1066 #ifdef Py_UNICODE_WIDE
   1067     if (ordinal < 0 || ordinal > 0x10ffff) {
   1068         PyErr_SetString(PyExc_ValueError,
   1069                         "unichr() arg not in range(0x110000) "
   1070                         "(wide Python build)");
   1071         return NULL;
   1072     }
   1073 #else
   1074     if (ordinal < 0 || ordinal > 0xffff) {
   1075         PyErr_SetString(PyExc_ValueError,
   1076                         "unichr() arg not in range(0x10000) "
   1077                         "(narrow Python build)");
   1078         return NULL;
   1079     }
   1080 #endif
   1081 
   1082     s[0] = (Py_UNICODE)ordinal;
   1083     return PyUnicode_FromUnicode(s, 1);
   1084 }
   1085 
   1086 PyObject *PyUnicode_FromObject(register PyObject *obj)
   1087 {
   1088     /* XXX Perhaps we should make this API an alias of
   1089        PyObject_Unicode() instead ?! */
   1090     if (PyUnicode_CheckExact(obj)) {
   1091         Py_INCREF(obj);
   1092         return obj;
   1093     }
   1094     if (PyUnicode_Check(obj)) {
   1095         /* For a Unicode subtype that's not a Unicode object,
   1096            return a true Unicode object with the same data. */
   1097         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
   1098                                      PyUnicode_GET_SIZE(obj));
   1099     }
   1100     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
   1101 }
   1102 
   1103 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
   1104                                       const char *encoding,
   1105                                       const char *errors)
   1106 {
   1107     const char *s = NULL;
   1108     Py_ssize_t len;
   1109     PyObject *v;
   1110 
   1111     if (obj == NULL) {
   1112         PyErr_BadInternalCall();
   1113         return NULL;
   1114     }
   1115 
   1116 #if 0
   1117     /* For b/w compatibility we also accept Unicode objects provided
   1118        that no encodings is given and then redirect to
   1119        PyObject_Unicode() which then applies the additional logic for
   1120        Unicode subclasses.
   1121 
   1122        NOTE: This API should really only be used for object which
   1123        represent *encoded* Unicode !
   1124 
   1125     */
   1126     if (PyUnicode_Check(obj)) {
   1127         if (encoding) {
   1128             PyErr_SetString(PyExc_TypeError,
   1129                             "decoding Unicode is not supported");
   1130             return NULL;
   1131         }
   1132         return PyObject_Unicode(obj);
   1133     }
   1134 #else
   1135     if (PyUnicode_Check(obj)) {
   1136         PyErr_SetString(PyExc_TypeError,
   1137                         "decoding Unicode is not supported");
   1138         return NULL;
   1139     }
   1140 #endif
   1141 
   1142     /* Coerce object */
   1143     if (PyString_Check(obj)) {
   1144         s = PyString_AS_STRING(obj);
   1145         len = PyString_GET_SIZE(obj);
   1146     }
   1147     else if (PyByteArray_Check(obj)) {
   1148         /* Python 2.x specific */
   1149         PyErr_Format(PyExc_TypeError,
   1150                      "decoding bytearray is not supported");
   1151         return NULL;
   1152     }
   1153     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
   1154         /* Overwrite the error message with something more useful in
   1155            case of a TypeError. */
   1156         if (PyErr_ExceptionMatches(PyExc_TypeError))
   1157             PyErr_Format(PyExc_TypeError,
   1158                          "coercing to Unicode: need string or buffer, "
   1159                          "%.80s found",
   1160                          Py_TYPE(obj)->tp_name);
   1161         goto onError;
   1162     }
   1163 
   1164     /* Convert to Unicode */
   1165     if (len == 0) {
   1166         Py_INCREF(unicode_empty);
   1167         v = (PyObject *)unicode_empty;
   1168     }
   1169     else
   1170         v = PyUnicode_Decode(s, len, encoding, errors);
   1171 
   1172     return v;
   1173 
   1174   onError:
   1175     return NULL;
   1176 }
   1177 
   1178 PyObject *PyUnicode_Decode(const char *s,
   1179                            Py_ssize_t size,
   1180                            const char *encoding,
   1181                            const char *errors)
   1182 {
   1183     PyObject *buffer = NULL, *unicode;
   1184 
   1185     if (encoding == NULL)
   1186         encoding = PyUnicode_GetDefaultEncoding();
   1187 
   1188     /* Shortcuts for common default encodings */
   1189     if (strcmp(encoding, "utf-8") == 0)
   1190         return PyUnicode_DecodeUTF8(s, size, errors);
   1191     else if (strcmp(encoding, "latin-1") == 0)
   1192         return PyUnicode_DecodeLatin1(s, size, errors);
   1193 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
   1194     else if (strcmp(encoding, "mbcs") == 0)
   1195         return PyUnicode_DecodeMBCS(s, size, errors);
   1196 #endif
   1197     else if (strcmp(encoding, "ascii") == 0)
   1198         return PyUnicode_DecodeASCII(s, size, errors);
   1199 
   1200     /* Decode via the codec registry */
   1201     buffer = PyBuffer_FromMemory((void *)s, size);
   1202     if (buffer == NULL)
   1203         goto onError;
   1204     unicode = PyCodec_Decode(buffer, encoding, errors);
   1205     if (unicode == NULL)
   1206         goto onError;
   1207     if (!PyUnicode_Check(unicode)) {
   1208         PyErr_Format(PyExc_TypeError,
   1209                      "decoder did not return an unicode object (type=%.400s)",
   1210                      Py_TYPE(unicode)->tp_name);
   1211         Py_DECREF(unicode);
   1212         goto onError;
   1213     }
   1214     Py_DECREF(buffer);
   1215     return unicode;
   1216 
   1217   onError:
   1218     Py_XDECREF(buffer);
   1219     return NULL;
   1220 }
   1221 
   1222 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
   1223                                     const char *encoding,
   1224                                     const char *errors)
   1225 {
   1226     PyObject *v;
   1227 
   1228     if (!PyUnicode_Check(unicode)) {
   1229         PyErr_BadArgument();
   1230         goto onError;
   1231     }
   1232 
   1233     if (encoding == NULL)
   1234         encoding = PyUnicode_GetDefaultEncoding();
   1235 
   1236     /* Decode via the codec registry */
   1237     v = PyCodec_Decode(unicode, encoding, errors);
   1238     if (v == NULL)
   1239         goto onError;
   1240     return v;
   1241 
   1242   onError:
   1243     return NULL;
   1244 }
   1245 
   1246 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
   1247                            Py_ssize_t size,
   1248                            const char *encoding,
   1249                            const char *errors)
   1250 {
   1251     PyObject *v, *unicode;
   1252 
   1253     unicode = PyUnicode_FromUnicode(s, size);
   1254     if (unicode == NULL)
   1255         return NULL;
   1256     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
   1257     Py_DECREF(unicode);
   1258     return v;
   1259 }
   1260 
   1261 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
   1262                                     const char *encoding,
   1263                                     const char *errors)
   1264 {
   1265     PyObject *v;
   1266 
   1267     if (!PyUnicode_Check(unicode)) {
   1268         PyErr_BadArgument();
   1269         goto onError;
   1270     }
   1271 
   1272     if (encoding == NULL)
   1273         encoding = PyUnicode_GetDefaultEncoding();
   1274 
   1275     /* Encode via the codec registry */
   1276     v = PyCodec_Encode(unicode, encoding, errors);
   1277     if (v == NULL)
   1278         goto onError;
   1279     return v;
   1280 
   1281   onError:
   1282     return NULL;
   1283 }
   1284 
   1285 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
   1286                                     const char *encoding,
   1287                                     const char *errors)
   1288 {
   1289     PyObject *v;
   1290 
   1291     if (!PyUnicode_Check(unicode)) {
   1292         PyErr_BadArgument();
   1293         goto onError;
   1294     }
   1295 
   1296     if (encoding == NULL)
   1297         encoding = PyUnicode_GetDefaultEncoding();
   1298 
   1299     /* Shortcuts for common default encodings */
   1300     if (errors == NULL) {
   1301         if (strcmp(encoding, "utf-8") == 0)
   1302             return PyUnicode_AsUTF8String(unicode);
   1303         else if (strcmp(encoding, "latin-1") == 0)
   1304             return PyUnicode_AsLatin1String(unicode);
   1305 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
   1306         else if (strcmp(encoding, "mbcs") == 0)
   1307             return PyUnicode_AsMBCSString(unicode);
   1308 #endif
   1309         else if (strcmp(encoding, "ascii") == 0)
   1310             return PyUnicode_AsASCIIString(unicode);
   1311     }
   1312 
   1313     /* Encode via the codec registry */
   1314     v = PyCodec_Encode(unicode, encoding, errors);
   1315     if (v == NULL)
   1316         goto onError;
   1317     if (!PyString_Check(v)) {
   1318         PyErr_Format(PyExc_TypeError,
   1319                      "encoder did not return a string object (type=%.400s)",
   1320                      Py_TYPE(v)->tp_name);
   1321         Py_DECREF(v);
   1322         goto onError;
   1323     }
   1324     return v;
   1325 
   1326   onError:
   1327     return NULL;
   1328 }
   1329 
   1330 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
   1331                                             const char *errors)
   1332 {
   1333     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
   1334 
   1335     if (v)
   1336         return v;
   1337     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
   1338     if (v && errors == NULL)
   1339         ((PyUnicodeObject *)unicode)->defenc = v;
   1340     return v;
   1341 }
   1342 
   1343 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
   1344 {
   1345     if (!PyUnicode_Check(unicode)) {
   1346         PyErr_BadArgument();
   1347         goto onError;
   1348     }
   1349     return PyUnicode_AS_UNICODE(unicode);
   1350 
   1351   onError:
   1352     return NULL;
   1353 }
   1354 
   1355 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
   1356 {
   1357     if (!PyUnicode_Check(unicode)) {
   1358         PyErr_BadArgument();
   1359         goto onError;
   1360     }
   1361     return PyUnicode_GET_SIZE(unicode);
   1362 
   1363   onError:
   1364     return -1;
   1365 }
   1366 
   1367 const char *PyUnicode_GetDefaultEncoding(void)
   1368 {
   1369     return unicode_default_encoding;
   1370 }
   1371 
   1372 int PyUnicode_SetDefaultEncoding(const char *encoding)
   1373 {
   1374     PyObject *v;
   1375 
   1376     /* Make sure the encoding is valid. As side effect, this also
   1377        loads the encoding into the codec registry cache. */
   1378     v = _PyCodec_Lookup(encoding);
   1379     if (v == NULL)
   1380         goto onError;
   1381     Py_DECREF(v);
   1382     strncpy(unicode_default_encoding,
   1383             encoding,
   1384             sizeof(unicode_default_encoding));
   1385     return 0;
   1386 
   1387   onError:
   1388     return -1;
   1389 }
   1390 
   1391 /* error handling callback helper:
   1392    build arguments, call the callback and check the arguments,
   1393    if no exception occurred, copy the replacement to the output
   1394    and adjust various state variables.
   1395    return 0 on success, -1 on error
   1396 */
   1397 
   1398 static
   1399 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
   1400                                      const char *encoding, const char *reason,
   1401                                      const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
   1402                                      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
   1403                                      PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
   1404 {
   1405     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
   1406 
   1407     PyObject *restuple = NULL;
   1408     PyObject *repunicode = NULL;
   1409     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
   1410     Py_ssize_t requiredsize;
   1411     Py_ssize_t newpos;
   1412     Py_UNICODE *repptr;
   1413     Py_ssize_t repsize;
   1414     int res = -1;
   1415 
   1416     if (*errorHandler == NULL) {
   1417         *errorHandler = PyCodec_LookupError(errors);
   1418         if (*errorHandler == NULL)
   1419             goto onError;
   1420     }
   1421 
   1422     if (*exceptionObject == NULL) {
   1423         *exceptionObject = PyUnicodeDecodeError_Create(
   1424             encoding, input, insize, *startinpos, *endinpos, reason);
   1425         if (*exceptionObject == NULL)
   1426             goto onError;
   1427     }
   1428     else {
   1429         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
   1430             goto onError;
   1431         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
   1432             goto onError;
   1433         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
   1434             goto onError;
   1435     }
   1436 
   1437     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
   1438     if (restuple == NULL)
   1439         goto onError;
   1440     if (!PyTuple_Check(restuple)) {
   1441         PyErr_SetString(PyExc_TypeError, &argparse[4]);
   1442         goto onError;
   1443     }
   1444     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
   1445         goto onError;
   1446     if (newpos<0)
   1447         newpos = insize+newpos;
   1448     if (newpos<0 || newpos>insize) {
   1449         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
   1450         goto onError;
   1451     }
   1452 
   1453     /* need more space? (at least enough for what we
   1454        have+the replacement+the rest of the string (starting
   1455        at the new input position), so we won't have to check space
   1456        when there are no errors in the rest of the string) */
   1457     repptr = PyUnicode_AS_UNICODE(repunicode);
   1458     repsize = PyUnicode_GET_SIZE(repunicode);
   1459     requiredsize = *outpos + repsize + insize-newpos;
   1460     if (requiredsize > outsize) {
   1461         if (requiredsize<2*outsize)
   1462             requiredsize = 2*outsize;
   1463         if (_PyUnicode_Resize(output, requiredsize) < 0)
   1464             goto onError;
   1465         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
   1466     }
   1467     *endinpos = newpos;
   1468     *inptr = input + newpos;
   1469     Py_UNICODE_COPY(*outptr, repptr, repsize);
   1470     *outptr += repsize;
   1471     *outpos += repsize;
   1472     /* we made it! */
   1473     res = 0;
   1474 
   1475   onError:
   1476     Py_XDECREF(restuple);
   1477     return res;
   1478 }
   1479 
   1480 /* --- UTF-7 Codec -------------------------------------------------------- */
   1481 
   1482 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
   1483 
   1484 /* Three simple macros defining base-64. */
   1485 
   1486 /* Is c a base-64 character? */
   1487 
   1488 #define IS_BASE64(c) \
   1489     (isalnum(c) || (c) == '+' || (c) == '/')
   1490 
   1491 /* given that c is a base-64 character, what is its base-64 value? */
   1492 
   1493 #define FROM_BASE64(c)                                                  \
   1494     (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
   1495      ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
   1496      ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
   1497      (c) == '+' ? 62 : 63)
   1498 
   1499 /* What is the base-64 character of the bottom 6 bits of n? */
   1500 
   1501 #define TO_BASE64(n)  \
   1502     ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
   1503 
   1504 /* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
   1505  * decoded as itself.  We are permissive on decoding; the only ASCII
   1506  * byte not decoding to itself is the + which begins a base64
   1507  * string. */
   1508 
   1509 #define DECODE_DIRECT(c)                                \
   1510     ((c) <= 127 && (c) != '+')
   1511 
   1512 /* The UTF-7 encoder treats ASCII characters differently according to
   1513  * whether they are Set D, Set O, Whitespace, or special (i.e. none of
   1514  * the above).  See RFC2152.  This array identifies these different
   1515  * sets:
   1516  * 0 : "Set D"
   1517  *     alphanumeric and '(),-./:?
   1518  * 1 : "Set O"
   1519  *     !"#$%&*;<=>@[]^_`{|}
   1520  * 2 : "whitespace"
   1521  *     ht nl cr sp
   1522  * 3 : special (must be base64 encoded)
   1523  *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
   1524  */
   1525 
   1526 static
   1527 char utf7_category[128] = {
   1528 /* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
   1529     3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
   1530 /* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
   1531     3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
   1532 /* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
   1533     2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
   1534 /*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
   1535     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
   1536 /*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
   1537     1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   1538 /*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
   1539     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
   1540 /*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
   1541     1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   1542 /*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
   1543     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
   1544 };
   1545 
   1546 /* ENCODE_DIRECT: this character should be encoded as itself.  The
   1547  * answer depends on whether we are encoding set O as itself, and also
   1548  * on whether we are encoding whitespace as itself.  RFC2152 makes it
   1549  * clear that the answers to these questions vary between
   1550  * applications, so this code needs to be flexible.  */
   1551 
   1552 #define ENCODE_DIRECT(c, directO, directWS)             \
   1553     ((c) < 128 && (c) > 0 &&                            \
   1554      ((utf7_category[(c)] == 0) ||                      \
   1555       (directWS && (utf7_category[(c)] == 2)) ||        \
   1556       (directO && (utf7_category[(c)] == 1))))
   1557 
   1558 PyObject *PyUnicode_DecodeUTF7(const char *s,
   1559                                Py_ssize_t size,
   1560                                const char *errors)
   1561 {
   1562     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
   1563 }
   1564 
   1565 /* The decoder.  The only state we preserve is our read position,
   1566  * i.e. how many characters we have consumed.  So if we end in the
   1567  * middle of a shift sequence we have to back off the read position
   1568  * and the output to the beginning of the sequence, otherwise we lose
   1569  * all the shift state (seen bits, number of bits seen, high
   1570  * surrogate). */
   1571 
   1572 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
   1573                                        Py_ssize_t size,
   1574                                        const char *errors,
   1575                                        Py_ssize_t *consumed)
   1576 {
   1577     const char *starts = s;
   1578     Py_ssize_t startinpos;
   1579     Py_ssize_t endinpos;
   1580     Py_ssize_t outpos;
   1581     const char *e;
   1582     PyUnicodeObject *unicode;
   1583     Py_UNICODE *p;
   1584     const char *errmsg = "";
   1585     int inShift = 0;
   1586     Py_UNICODE *shiftOutStart;
   1587     unsigned int base64bits = 0;
   1588     unsigned long base64buffer = 0;
   1589     Py_UNICODE surrogate = 0;
   1590     PyObject *errorHandler = NULL;
   1591     PyObject *exc = NULL;
   1592 
   1593     unicode = _PyUnicode_New(size);
   1594     if (!unicode)
   1595         return NULL;
   1596     if (size == 0) {
   1597         if (consumed)
   1598             *consumed = 0;
   1599         return (PyObject *)unicode;
   1600     }
   1601 
   1602     p = unicode->str;
   1603     shiftOutStart = p;
   1604     e = s + size;
   1605 
   1606     while (s < e) {
   1607         Py_UNICODE ch = (unsigned char) *s;
   1608 
   1609         if (inShift) { /* in a base-64 section */
   1610             if (IS_BASE64(ch)) { /* consume a base-64 character */
   1611                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
   1612                 base64bits += 6;
   1613                 s++;
   1614                 if (base64bits >= 16) {
   1615                     /* we have enough bits for a UTF-16 value */
   1616                     Py_UNICODE outCh = (Py_UNICODE)
   1617                                        (base64buffer >> (base64bits-16));
   1618                     base64bits -= 16;
   1619                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
   1620                     if (surrogate) {
   1621                         /* expecting a second surrogate */
   1622                         if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
   1623 #ifdef Py_UNICODE_WIDE
   1624                             *p++ = (((surrogate & 0x3FF)<<10)
   1625                                     | (outCh & 0x3FF)) + 0x10000;
   1626 #else
   1627                             *p++ = surrogate;
   1628                             *p++ = outCh;
   1629 #endif
   1630                             surrogate = 0;
   1631                         }
   1632                         else {
   1633                             surrogate = 0;
   1634                             errmsg = "second surrogate missing";
   1635                             goto utf7Error;
   1636                         }
   1637                     }
   1638                     else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
   1639                         /* first surrogate */
   1640                         surrogate = outCh;
   1641                     }
   1642                     else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
   1643                         errmsg = "unexpected second surrogate";
   1644                         goto utf7Error;
   1645                     }
   1646                     else {
   1647                         *p++ = outCh;
   1648                     }
   1649                 }
   1650             }
   1651             else { /* now leaving a base-64 section */
   1652                 inShift = 0;
   1653                 s++;
   1654                 if (surrogate) {
   1655                     errmsg = "second surrogate missing at end of shift sequence";
   1656                     goto utf7Error;
   1657                 }
   1658                 if (base64bits > 0) { /* left-over bits */
   1659                     if (base64bits >= 6) {
   1660                         /* We've seen at least one base-64 character */
   1661                         errmsg = "partial character in shift sequence";
   1662                         goto utf7Error;
   1663                     }
   1664                     else {
   1665                         /* Some bits remain; they should be zero */
   1666                         if (base64buffer != 0) {
   1667                             errmsg = "non-zero padding bits in shift sequence";
   1668                             goto utf7Error;
   1669                         }
   1670                     }
   1671                 }
   1672                 if (ch != '-') {
   1673                     /* '-' is absorbed; other terminating
   1674                        characters are preserved */
   1675                     *p++ = ch;
   1676                 }
   1677             }
   1678         }
   1679         else if ( ch == '+' ) {
   1680             startinpos = s-starts;
   1681             s++; /* consume '+' */
   1682             if (s < e && *s == '-') { /* '+-' encodes '+' */
   1683                 s++;
   1684                 *p++ = '+';
   1685             }
   1686             else { /* begin base64-encoded section */
   1687                 inShift = 1;
   1688                 shiftOutStart = p;
   1689                 base64bits = 0;
   1690             }
   1691         }
   1692         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
   1693             *p++ = ch;
   1694             s++;
   1695         }
   1696         else {
   1697             startinpos = s-starts;
   1698             s++;
   1699             errmsg = "unexpected special character";
   1700             goto utf7Error;
   1701         }
   1702         continue;
   1703 utf7Error:
   1704         outpos = p-PyUnicode_AS_UNICODE(unicode);
   1705         endinpos = s-starts;
   1706         if (unicode_decode_call_errorhandler(
   1707                 errors, &errorHandler,
   1708                 "utf7", errmsg,
   1709                 starts, size, &startinpos, &endinpos, &exc, &s,
   1710                 &unicode, &outpos, &p))
   1711             goto onError;
   1712     }
   1713 
   1714     /* end of string */
   1715 
   1716     if (inShift && !consumed) { /* in shift sequence, no more to follow */
   1717         /* if we're in an inconsistent state, that's an error */
   1718         if (surrogate ||
   1719                 (base64bits >= 6) ||
   1720                 (base64bits > 0 && base64buffer != 0)) {
   1721             outpos = p-PyUnicode_AS_UNICODE(unicode);
   1722             endinpos = size;
   1723             if (unicode_decode_call_errorhandler(
   1724                     errors, &errorHandler,
   1725                     "utf7", "unterminated shift sequence",
   1726                     starts, size, &startinpos, &endinpos, &exc, &s,
   1727                     &unicode, &outpos, &p))
   1728                 goto onError;
   1729         }
   1730     }
   1731 
   1732     /* return state */
   1733     if (consumed) {
   1734         if (inShift) {
   1735             p = shiftOutStart; /* back off output */
   1736             *consumed = startinpos;
   1737         }
   1738         else {
   1739             *consumed = s-starts;
   1740         }
   1741     }
   1742 
   1743     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
   1744         goto onError;
   1745 
   1746     Py_XDECREF(errorHandler);
   1747     Py_XDECREF(exc);
   1748     return (PyObject *)unicode;
   1749 
   1750   onError:
   1751     Py_XDECREF(errorHandler);
   1752     Py_XDECREF(exc);
   1753     Py_DECREF(unicode);
   1754     return NULL;
   1755 }
   1756 
   1757 
   1758 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
   1759                                Py_ssize_t size,
   1760                                int base64SetO,
   1761                                int base64WhiteSpace,
   1762                                const char *errors)
   1763 {
            /* Encode the `size` Py_UNICODE units starting at `s` as UTF-7 and
               return the result as a string object.

               s, size           input buffer and its length in code units.
               base64SetO,
               base64WhiteSpace  passed (negated) to ENCODE_DIRECT(), which is
                                 defined outside this function; presumably they
                                 request base-64 encoding of the "Set O" and
                                 whitespace characters -- confirm at the macro.
               errors            not referenced by this function.

               Returns the encoded string, the result of PyErr_NoMemory() when
               8 * size is found to have overflowed, or NULL when creating or
               resizing the string fails. */
   1764     PyObject *v;
   1765     /* It might be possible to tighten this worst case */
   1766     Py_ssize_t allocated = 8 * size;
   1767     int inShift = 0;
   1768     Py_ssize_t i = 0;
   1769     unsigned int base64bits = 0;
   1770     unsigned long base64buffer = 0;
   1771     char * out;
   1772     char * start;
   1773 
            /* Dividing back detects that 8 * size did not fit in Py_ssize_t. */
   1774     if (allocated / 8 != size)
   1775         return PyErr_NoMemory();
   1776 
   1777     if (size == 0)
   1778         return PyString_FromStringAndSize(NULL, 0);
   1779 
   1780     v = PyString_FromStringAndSize(NULL, allocated);
   1781     if (v == NULL)
   1782         return NULL;
   1783 
            /* `start` remembers the buffer origin; `out` is the write cursor. */
   1784     start = out = PyString_AS_STRING(v);
   1785     for (;i < size; ++i) {
   1786         Py_UNICODE ch = s[i];
   1787 
   1788         if (inShift) {
   1789             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
   1790                 /* shifting out */
   1791                 if (base64bits) { /* output remaining bits */
                            /* Fewer than 6 bits are pending here (the emit loops
                               below drain every full group), so they are
                               left-aligned into one final base-64 character. */
   1792                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
   1793                     base64buffer = 0;
   1794                     base64bits = 0;
   1795                 }
   1796                 inShift = 0;
   1797                 /* Characters not in the BASE64 set implicitly unshift the sequence
   1798                    so no '-' is required, except if the character is itself a '-' */
   1799                 if (IS_BASE64(ch) || ch == '-') {
   1800                     *out++ = '-';
   1801                 }
   1802                 *out++ = (char) ch;
   1803             }
   1804             else {
                        /* ch is not directly encodable: keep the shift sequence
                           open and append ch to it. */
   1805                 goto encode_char;
   1806             }
   1807         }
   1808         else { /* not in a shift sequence */
   1809             if (ch == '+') {
                        /* a literal '+' is written as the two characters "+-" */
   1810                 *out++ = '+';
   1811                         *out++ = '-';
   1812             }
   1813             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
   1814                 *out++ = (char) ch;
   1815             }
   1816             else {
                        /* open a shift sequence with '+' and encode ch inside it */
   1817                 *out++ = '+';
   1818                 inShift = 1;
   1819                 goto encode_char;
   1820             }
   1821         }
   1822         continue;
            /* Reached only by goto from the two branches above: push the 16-bit
               unit(s) for ch into the bit buffer and emit one output character
               for every complete group of 6 bits. */
   1823 encode_char:
   1824 #ifdef Py_UNICODE_WIDE
   1825         if (ch >= 0x10000) {
   1826             /* code first surrogate */
   1827             base64bits += 16;
   1828             base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
   1829             while (base64bits >= 6) {
   1830                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
   1831                 base64bits -= 6;
   1832             }
   1833             /* prepare second surrogate */
   1834             ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
   1835         }
   1836 #endif
   1837         base64bits += 16;
   1838         base64buffer = (base64buffer << 16) | ch;
   1839         while (base64bits >= 6) {
   1840             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
   1841             base64bits -= 6;
   1842         }
   1843     }
            /* End of input: flush any pending bits, then close a shift sequence
               that is still open with '-'. */
   1844     if (base64bits)
   1845         *out++= TO_BASE64(base64buffer << (6-base64bits) );
   1846     if (inShift)
   1847         *out++ = '-';
   1848 
            /* Shrink the over-allocated string to the bytes actually written. */
   1849     if (_PyString_Resize(&v, out - start))
   1850         return NULL;
   1851     return v;
   1852 }
   1853 
   1854 #undef IS_BASE64
   1855 #undef FROM_BASE64
   1856 #undef TO_BASE64
   1857 #undef DECODE_DIRECT
   1858 #undef ENCODE_DIRECT
   1859 
   1860 /* --- UTF-8 Codec -------------------------------------------------------- */
   1861 
   1862 static
   1863 char utf8_code_length[256] = {
   1864     /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   1865        illegal prefix.  See RFC 3629 for details */
            /* The table is indexed by the byte value, 16 entries per row:
               00-7F -> 1, 80-C1 -> 0, C2-DF -> 2, E0-EF -> 3, F0-F4 -> 4,
               F5-FF -> 0. */
   1866     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
   1867     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
   1868     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
   1869     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
   1870     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
   1871     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
   1872     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
   1873     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
   1874     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
   1875     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
   1876     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
   1877     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
   1878     0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
   1879     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
   1880     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
   1881     4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
   1882 };
   1883 
   1884 PyObject *PyUnicode_DecodeUTF8(const char *s,
   1885                                Py_ssize_t size,
   1886                                const char *errors)
   1887 {
            /* Decode `size` bytes of UTF-8 at `s`.  Forwards to
               PyUnicode_DecodeUTF8Stateful() with consumed == NULL, so a
               sequence cut off by the end of the input is reported as an
               error rather than left unconsumed.  Returns whatever the
               stateful decoder returns. */
   1888     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
   1889 }
   1890 
   1891 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
   1892                                        Py_ssize_t size,
   1893                                        const char *errors,
   1894                                        Py_ssize_t *consumed)
   1895 {
            /* Decode `size` bytes of UTF-8 at `s` into a Unicode object.

               errors    forwarded to unicode_decode_call_errorhandler() for
                         each malformed sequence.
               consumed  optional out.  When non-NULL, a multi-byte sequence
                         cut off by the end of the input stops decoding without
                         an error and *consumed receives the number of bytes
                         processed; when NULL such a sequence is reported as
                         "unexpected end of data".

               Returns the decoded object, or NULL when allocation fails, the
               error handler call returns nonzero, or the final resize fails. */
   1896     const char *starts = s;
   1897     int n;
   1898     int k;
   1899     Py_ssize_t startinpos;
   1900     Py_ssize_t endinpos;
   1901     Py_ssize_t outpos;
   1902     const char *e;
   1903     PyUnicodeObject *unicode;
   1904     Py_UNICODE *p;
   1905     const char *errmsg = "";
   1906     PyObject *errorHandler = NULL;
   1907     PyObject *exc = NULL;
   1908 
   1909     /* Note: size will always be longer than the resulting Unicode
   1910        character count */
   1911     unicode = _PyUnicode_New(size);
   1912     if (!unicode)
   1913         return NULL;
   1914     if (size == 0) {
   1915         if (consumed)
   1916             *consumed = 0;
   1917         return (PyObject *)unicode;
   1918     }
   1919 
   1920     /* Unpack UTF-8 encoded data */
   1921     p = unicode->str;
   1922     e = s + size;
   1923 
   1924     while (s < e) {
   1925         Py_UCS4 ch = (unsigned char)*s;
   1926 
                /* Fast path: ASCII bytes are copied through unchanged. */
   1927         if (ch < 0x80) {
   1928             *p++ = (Py_UNICODE)ch;
   1929             s++;
   1930             continue;
   1931         }
   1932 
                /* Sequence length announced by the lead byte; 0 marks a byte
                   that cannot start a sequence. */
   1933         n = utf8_code_length[ch];
   1934 
                /* The announced sequence would run past the end of the input.
                   With n == 0 this test is never true because s < e. */
   1935         if (s + n > e) {
   1936             if (consumed)
                        /* stateful mode: leave the incomplete sequence for the
                           caller; *consumed is set after the loop */
   1937                 break;
   1938             else {
   1939                 errmsg = "unexpected end of data";
   1940                 startinpos = s-starts;
   1941                 endinpos = startinpos+1;
                        /* extend the reported range over the continuation bytes
                           that follow the lead byte */
   1942                 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
   1943                     endinpos++;
   1944                 goto utf8Error;
   1945             }
   1946         }
   1947 
   1948         switch (n) {
   1949 
   1950         case 0:
                    /* table entry 0: 80-C1 or F5-FF */
   1951             errmsg = "invalid start byte";
   1952             startinpos = s-starts;
   1953             endinpos = startinpos+1;
   1954             goto utf8Error;
   1955 
   1956         case 1:
                    /* ch >= 0x80 here and the table maps only 00-7F to 1, so this
                       case is not expected to be reached */
   1957             errmsg = "internal error";
   1958             startinpos = s-starts;
   1959             endinpos = startinpos+1;
   1960             goto utf8Error;
   1961 
   1962         case 2:
   1963             if ((s[1] & 0xc0) != 0x80) {
   1964                 errmsg = "invalid continuation byte";
   1965                 startinpos = s-starts;
   1966                 endinpos = startinpos + 1;
   1967                 goto utf8Error;
   1968             }
   1969             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
   1970             assert ((ch > 0x007F) && (ch <= 0x07FF));
   1971             *p++ = (Py_UNICODE)ch;
   1972             break;
   1973 
   1974         case 3:
   1975             /* XXX: surrogates shouldn't be valid UTF-8!
   1976                see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
   1977                (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
   1978                Uncomment the 2 lines below to make them invalid,
   1979                codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
                    /* Besides the two continuation bytes, E0 followed by a byte
                       below A0 is rejected (it would encode a value <= 0x07FF). */
   1980             if ((s[1] & 0xc0) != 0x80 ||
   1981                 (s[2] & 0xc0) != 0x80 ||
   1982                 ((unsigned char)s[0] == 0xE0 &&
   1983                  (unsigned char)s[1] < 0xA0)/* ||
   1984                 ((unsigned char)s[0] == 0xED &&
   1985                  (unsigned char)s[1] > 0x9F)*/) {
   1986                 errmsg = "invalid continuation byte";
   1987                 startinpos = s-starts;
   1988                 endinpos = startinpos + 1;
   1989 
   1990                 /* if s[1] first two bits are 1 and 0, then the invalid
   1991                    continuation byte is s[2], so increment endinpos by 1,
   1992                    if not, s[1] is invalid and endinpos doesn't need to
   1993                    be incremented. */
   1994                 if ((s[1] & 0xC0) == 0x80)
   1995                     endinpos++;
   1996                 goto utf8Error;
   1997             }
   1998             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
   1999             assert ((ch > 0x07FF) && (ch <= 0xFFFF));
   2000             *p++ = (Py_UNICODE)ch;
   2001             break;
   2002 
   2003         case 4:
                    /* Besides the three continuation bytes, F0 followed by a
                       byte below 90 (value <= 0xFFFF) and F4 followed by a byte
                       above 8F (value > 0x10FFFF) are rejected. */
   2004             if ((s[1] & 0xc0) != 0x80 ||
   2005                 (s[2] & 0xc0) != 0x80 ||
   2006                 (s[3] & 0xc0) != 0x80 ||
   2007                 ((unsigned char)s[0] == 0xF0 &&
   2008                  (unsigned char)s[1] < 0x90) ||
   2009                 ((unsigned char)s[0] == 0xF4 &&
   2010                  (unsigned char)s[1] > 0x8F)) {
   2011                 errmsg = "invalid continuation byte";
   2012                 startinpos = s-starts;
   2013                 endinpos = startinpos + 1;
                        /* cover each leading byte that does look like a
                           continuation byte, stopping at the first that does not */
   2014                 if ((s[1] & 0xC0) == 0x80) {
   2015                     endinpos++;
   2016                     if ((s[2] & 0xC0) == 0x80)
   2017                         endinpos++;
   2018                 }
   2019                 goto utf8Error;
   2020             }
   2021             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
   2022                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
   2023             assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
   2024 
   2025 #ifdef Py_UNICODE_WIDE
   2026             *p++ = (Py_UNICODE)ch;
   2027 #else
   2028             /*  compute and append the two surrogates: */
   2029 
   2030             /*  translate from 10000..10FFFF to 0..FFFF */
   2031             ch -= 0x10000;
   2032 
   2033             /*  high surrogate = top 10 bits added to D800 */
   2034             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
   2035 
   2036             /*  low surrogate = bottom 10 bits added to DC00 */
   2037             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
   2038 #endif
   2039             break;
   2040         }
                /* Successful decode: step over the whole sequence and bypass the
                   error path below. */
   2041         s += n;
   2042         continue;
   2043 
                /* Error path, entered only by goto.  s, unicode and p are passed
                   by address, so the call may update them; the loop then resumes
                   from the new s.  NOTE(review): the handler is defined outside
                   this view -- confirm how it repositions s. */
   2044       utf8Error:
   2045         outpos = p-PyUnicode_AS_UNICODE(unicode);
   2046         if (unicode_decode_call_errorhandler(
   2047                 errors, &errorHandler,
   2048                 "utf8", errmsg,
   2049                 starts, size, &startinpos, &endinpos, &exc, &s,
   2050                 &unicode, &outpos, &p))
   2051             goto onError;
   2052     }
   2053     if (consumed)
   2054         *consumed = s-starts;
   2055 
   2056     /* Adjust length */
   2057     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
   2058         goto onError;
   2059 
   2060     Py_XDECREF(errorHandler);
   2061     Py_XDECREF(exc);
   2062     return (PyObject *)unicode;
   2063 
            /* Failure: release the helper references and the partial result. */
   2064   onError:
   2065     Py_XDECREF(errorHandler);
   2066     Py_XDECREF(exc);
   2067     Py_DECREF(unicode);
   2068     return NULL;
   2069 }
   2070 
   2071 /* Allocation strategy:  if the string is short, convert into a stack buffer
   2072    and allocate exactly as much space needed at the end.  Else allocate the
   2073    maximum possible needed (4 result bytes per Unicode character), and return
   2074    the excess memory at the end.

           PyUnicode_EncodeUTF8 encodes the `size` Py_UNICODE units at `s` as
           UTF-8 and returns the result as a string object.  A high surrogate
           immediately followed by a low surrogate is combined into a single
           4-byte sequence; any other surrogate is written as a 3-byte sequence.
           `errors` is not referenced by this function.

           Returns the encoded string, the result of PyErr_NoMemory() when
           size * 4 is found to have overflowed, or NULL when creating or
           resizing the string fails.
   2075 */
   2076 PyObject *
   2077 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
   2078                      Py_ssize_t size,
   2079                      const char *errors)
   2080 {
   2081 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
   2082 
   2083     Py_ssize_t i;           /* index into s of next input code unit */
   2084     PyObject *v;        /* result string object */
   2085     char *p;            /* next free byte in output buffer */
   2086     Py_ssize_t nallocated;  /* number of result bytes allocated */
   2087     Py_ssize_t nneeded;        /* number of result bytes needed */
   2088     char stackbuf[MAX_SHORT_UNICHARS * 4];
   2089 
   2090     assert(s != NULL);
   2091     assert(size >= 0);
   2092 
   2093     if (size <= MAX_SHORT_UNICHARS) {
   2094         /* Write into the stack buffer; nallocated can't overflow.
   2095          * At the end, we'll allocate exactly as much heap space as it
   2096          * turns out we need.
   2097          */
   2098         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
   2099         v = NULL;   /* will allocate after we're done */
   2100         p = stackbuf;
   2101     }
   2102     else {
   2103         /* Overallocate on the heap, and give the excess back at the end. */
   2104         nallocated = size * 4;
   2105         if (nallocated / 4 != size)  /* overflow! */
   2106             return PyErr_NoMemory();
   2107         v = PyString_FromStringAndSize(NULL, nallocated);
   2108         if (v == NULL)
   2109             return NULL;
   2110         p = PyString_AS_STRING(v);
   2111     }
   2112 
            /* i is advanced inside the body: by one per unit read, and by one
               more when a surrogate pair is combined. */
   2113     for (i = 0; i < size;) {
   2114         Py_UCS4 ch = s[i++];
   2115 
   2116         if (ch < 0x80)
   2117             /* Encode ASCII */
   2118             *p++ = (char) ch;
   2119 
   2120         else if (ch < 0x0800) {
   2121             /* Encode U+0080..U+07FF as two bytes */
   2122             *p++ = (char)(0xc0 | (ch >> 6));
   2123             *p++ = (char)(0x80 | (ch & 0x3f));
   2124         }
   2125         else {
   2126             /* Encode UCS2 Unicode ordinals */
   2127             if (ch < 0x10000) {
   2128                 /* Special case: check for high surrogate */
                        /* i already points past ch, so s[i] is the unit that
                           follows it; `i != size` guards that read. */
   2129                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
   2130                     Py_UCS4 ch2 = s[i];
   2131                     /* Check for low surrogate and combine the two to
   2132                        form a UCS4 value */
   2133                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
   2134                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
   2135                         i++;
   2136                         goto encodeUCS4;
   2137                     }
   2138                     /* Fall through: handles isolated high surrogates */
   2139                 }
   2140                 *p++ = (char)(0xe0 | (ch >> 12));
   2141                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
   2142                 *p++ = (char)(0x80 | (ch & 0x3f));
                        /* skip the 4-byte path below */
   2143                 continue;
   2144             }
                    /* Reached by falling in when ch >= 0x10000, or by goto with a
                       combined surrogate pair. */
   2145           encodeUCS4:
   2146             /* Encode UCS4 Unicode ordinals */
   2147             *p++ = (char)(0xf0 | (ch >> 18));
   2148             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
   2149             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
   2150             *p++ = (char)(0x80 | (ch & 0x3f));
   2151         }
   2152     }
   2153 
   2154     if (v == NULL) {
   2155         /* This was stack allocated. */
   2156         nneeded = p - stackbuf;
   2157         assert(nneeded <= nallocated);
   2158         v = PyString_FromStringAndSize(stackbuf, nneeded);
   2159     }
   2160     else {
   2161         /* Cut back to size actually needed. */
   2162         nneeded = p - PyString_AS_STRING(v);
   2163         assert(nneeded <= nallocated);
   2164         if (_PyString_Resize(&v, nneeded))
   2165             return NULL;
   2166     }
   2167     return v;
   2168 
   2169 #undef MAX_SHORT_UNICHARS
   2170 }
   2171 
   2172 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
   2173 {
            /* Return the UTF-8 encoding of the Unicode object `unicode`, as
               produced by PyUnicode_EncodeUTF8() with errors == NULL.  If
               `unicode` fails PyUnicode_Check(), PyErr_BadArgument() is called
               and NULL is returned. */
   2174     if (!PyUnicode_Check(unicode)) {
   2175         PyErr_BadArgument();
   2176         return NULL;
   2177     }
   2178     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
   2179                                 PyUnicode_GET_SIZE(unicode),
   2180                                 NULL);
   2181 }
   2182 
   2183 /* --- UTF-32 Codec ------------------------------------------------------- */
   2184 
   2185 PyObject *
   2186 PyUnicode_DecodeUTF32(const char *s,
   2187                       Py_ssize_t size,
   2188                       const char *errors,
   2189                       int *byteorder)
   2190 {
            /* Decode `size` bytes of UTF-32 at `s`.  Forwards to
               PyUnicode_DecodeUTF32Stateful() with consumed == NULL, so a
               trailing group of fewer than 4 bytes is reported as an error
               rather than left unconsumed.  `byteorder` is passed through
               unchanged.  Returns whatever the stateful decoder returns. */
   2191     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
   2192 }
   2193 
   2194 PyObject *
   2195 PyUnicode_DecodeUTF32Stateful(const char *s,
   2196                               Py_ssize_t size,
   2197                               const char *errors,
   2198                               int *byteorder,
   2199                               Py_ssize_t *consumed)
   2200 {
            /* Decode `size` bytes of UTF-32 at `s` into a Unicode object.

               errors     forwarded to unicode_decode_call_errorhandler() when
                          invalid input is found.
               byteorder  optional in/out.  On entry *byteorder selects the byte
                          order: -1 little endian, 1 big endian, 0 native order
                          unless a leading BOM is present, in which case the BOM
                          is skipped and decides the order.  After a successful
                          decode of non-empty input it receives the order used.
               consumed   optional out.  When non-NULL, a trailing group of
                          fewer than 4 bytes is left undecoded instead of being
                          reported as "truncated data", and *consumed receives
                          the number of bytes processed (0 for empty input).

               Returns the decoded object, or NULL when allocation fails, the
               error handler call returns nonzero, or the final resize fails. */
   2201     const char *starts = s;
   2202     Py_ssize_t startinpos;
   2203     Py_ssize_t endinpos;
   2204     Py_ssize_t outpos;
   2205     PyUnicodeObject *unicode;
   2206     Py_UNICODE *p;
   2207 #ifndef Py_UNICODE_WIDE
   2208     int pairs = 0;
   2209     const unsigned char *qq;
   2210 #else
   2211     const int pairs = 0;
   2212 #endif
   2213     const unsigned char *q, *e;
   2214     int bo = 0;       /* assume native ordering by default */
   2215     const char *errmsg = "";
   2216     /* Offsets from q for retrieving bytes in the right order. */
   2217 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
   2218     int iorder[] = {0, 1, 2, 3};
   2219 #else
   2220     int iorder[] = {3, 2, 1, 0};
   2221 #endif
   2222     PyObject *errorHandler = NULL;
   2223     PyObject *exc = NULL;
   2224 
   2225     q = (const unsigned char *)s;
   2226     e = q + size;
   2227 
   2228     if (byteorder)
   2229         bo = *byteorder;
   2230 
   2231     /* Check for BOM marks (U+FEFF) in the input and adjust current
   2232        byte order setting accordingly. In native mode, the leading BOM
   2233        mark is skipped, in all other modes, it is copied to the output
   2234        stream as-is (giving a ZWNBSP character). */
   2235     if (bo == 0) {
   2236         if (size >= 4) {
                    /* Widen the top byte before shifting: an unsigned char
                       promotes to int, and shifting a value >= 0x80 left by 24
                       bits overflows a signed int. */
   2237             const Py_UCS4 bom = ((Py_UCS4)q[iorder[3]] << 24) |
                        (q[iorder[2]] << 16) |
   2238                 (q[iorder[1]] << 8) | q[iorder[0]];
   2239 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
   2240             if (bom == 0x0000FEFF) {
   2241                 q += 4;
   2242                 bo = -1;
   2243             }
   2244             else if (bom == 0xFFFE0000) {
   2245                 q += 4;
   2246                 bo = 1;
   2247             }
   2248 #else
   2249             if (bom == 0x0000FEFF) {
   2250                 q += 4;
   2251                 bo = 1;
   2252             }
   2253             else if (bom == 0xFFFE0000) {
   2254                 q += 4;
   2255                 bo = -1;
   2256             }
   2257 #endif
   2258         }
   2259     }
   2260 
   2261     if (bo == -1) {
   2262         /* force LE */
   2263         iorder[0] = 0;
   2264         iorder[1] = 1;
   2265         iorder[2] = 2;
   2266         iorder[3] = 3;
   2267     }
   2268     else if (bo == 1) {
   2269         /* force BE */
   2270         iorder[0] = 3;
   2271         iorder[1] = 2;
   2272         iorder[2] = 1;
   2273         iorder[3] = 0;
   2274     }
   2275 
   2276     /* On narrow builds we split characters outside the BMP into two
   2277        codepoints => count how much extra space we need. */
            /* Only complete 4-byte groups are inspected, so a truncated tail is
               never read past e. */
   2278 #ifndef Py_UNICODE_WIDE
   2279     for (qq = q; e - qq >= 4; qq += 4)
   2280         if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
   2281             pairs++;
   2282 #endif
   2283 
   2284     /* This might be one too many, because of a BOM */
   2285     unicode = _PyUnicode_New((size+3)/4+pairs);
   2286     if (!unicode)
   2287         return NULL;
   2288     if (size == 0) {
                /* Empty input: report that nothing was consumed, matching
                   PyUnicode_DecodeUTF8Stateful(). */
                if (consumed)
                    *consumed = 0;
   2289         return (PyObject *)unicode;
            }
   2290 
   2291     /* Unpack UTF-32 encoded data */
   2292     p = unicode->str;
   2293 
   2294     while (q < e) {
   2295         Py_UCS4 ch;
   2296         /* remaining bytes at the end? (size should be divisible by 4) */
   2297         if (e-q<4) {
   2298             if (consumed)
   2299                 break;
   2300             errmsg = "truncated data";
   2301             startinpos = ((const char *)q)-starts;
   2302             endinpos = ((const char *)e)-starts;
   2303             goto utf32Error;
   2304             /* The remaining input chars are ignored if the callback
   2305                chooses to skip the input */
   2306         }
                /* Same widening as for the BOM: keep the << 24 unsigned. */
   2307         ch = ((Py_UCS4)q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
   2308             (q[iorder[1]] << 8) | q[iorder[0]];
   2309 
   2310         if (ch >= 0x110000)
   2311         {
   2312             errmsg = "codepoint not in range(0x110000)";
   2313             startinpos = ((const char *)q)-starts;
   2314             endinpos = startinpos+4;
   2315             goto utf32Error;
   2316         }
   2317 #ifndef Py_UNICODE_WIDE
   2318         if (ch >= 0x10000)
   2319         {
                    /* narrow build: store as a surrogate pair */
   2320             *p++ = 0xD800 | ((ch-0x10000) >> 10);
   2321             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
   2322         }
   2323         else
   2324 #endif
   2325             *p++ = ch;
   2326         q += 4;
   2327         continue;
                /* Error path, entered only by goto.  q, unicode and p are passed
                   by address, so the call may update them; the loop then resumes
                   from the new q. */
   2328       utf32Error:
   2329         outpos = p-PyUnicode_AS_UNICODE(unicode);
   2330         if (unicode_decode_call_errorhandler(
   2331                 errors, &errorHandler,
   2332                 "utf32", errmsg,
   2333                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
   2334                 &unicode, &outpos, &p))
   2335             goto onError;
   2336     }
   2337 
   2338     if (byteorder)
   2339         *byteorder = bo;
   2340 
   2341     if (consumed)
   2342         *consumed = (const char *)q-starts;
   2343 
   2344     /* Adjust length */
   2345     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
   2346         goto onError;
   2347 
   2348     Py_XDECREF(errorHandler);
   2349     Py_XDECREF(exc);
   2350     return (PyObject *)unicode;
   2351 
            /* Failure: release the partial result and the helper references. */
   2352   onError:
   2353     Py_DECREF(unicode);
   2354     Py_XDECREF(errorHandler);
   2355     Py_XDECREF(exc);
   2356     return NULL;
   2357 }
   2358 
   2359 PyObject *
   2360 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
   2361                       Py_ssize_t size,
   2362                       const char *errors,
   2363                       int byteorder)
   2364 {
            /* Encode the `size` Py_UNICODE units at `s` as UTF-32 and return the
               result as a string object.

               errors     not referenced by this function.
               byteorder  -1 writes little endian, 1 writes big endian; 0 writes
                          native order preceded by a BOM (U+FEFF).  Any other
                          value leaves the native order and writes no BOM.

               Returns the encoded string, the result of PyErr_NoMemory() when
               the byte count is found to have overflowed, or NULL when the
               string cannot be created. */
   2365     PyObject *v;
   2366     unsigned char *p;
   2367     Py_ssize_t nsize, bytesize;
   2368 #ifndef Py_UNICODE_WIDE
   2369     Py_ssize_t i, pairs;
   2370 #else
   2371     const int pairs = 0;
   2372 #endif
   2373     /* Offsets from p for storing the four bytes in the right order. */
   2374 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
   2375     int iorder[] = {0, 1, 2, 3};
   2376 #else
   2377     int iorder[] = {3, 2, 1, 0};
   2378 #endif
   2379 
            /* Store CH as four bytes at p using the current iorder mapping and
               advance p.  CH is evaluated four times. */
   2380 #define STORECHAR(CH)                           \
   2381     do {                                        \
   2382         p[iorder[3]] = ((CH) >> 24) & 0xff;     \
   2383         p[iorder[2]] = ((CH) >> 16) & 0xff;     \
   2384         p[iorder[1]] = ((CH) >> 8) & 0xff;      \
   2385         p[iorder[0]] = (CH) & 0xff;             \
   2386         p += 4;                                 \
   2387     } while(0)
   2388 
   2389     /* In narrow builds we can output surrogate pairs as one codepoint,
   2390        so we need less space. */
   2391 #ifndef Py_UNICODE_WIDE
   2392     for (i = pairs = 0; i < size-1; i++)
   2393         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
   2394             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
   2395             pairs++;
   2396 #endif
            /* One output unit per input unit, minus one per surrogate pair,
               plus one for the BOM when byteorder == 0. */
   2397     nsize = (size - pairs + (byteorder == 0));
   2398     bytesize = nsize * 4;
            /* Dividing back detects that nsize * 4 did not fit in Py_ssize_t. */
   2399     if (bytesize / 4 != nsize)
   2400         return PyErr_NoMemory();
   2401     v = PyString_FromStringAndSize(NULL, bytesize);
   2402     if (v == NULL)
   2403         return NULL;
   2404 
   2405     p = (unsigned char *)PyString_AS_STRING(v);
            /* The BOM is written before iorder is adjusted below; with
               byteorder == 0 neither adjustment applies, so it goes out in
               native order like the data. */
   2406     if (byteorder == 0)
   2407         STORECHAR(0xFEFF);
   2408     if (size == 0)
   2409         return v;
   2410 
   2411     if (byteorder == -1) {
   2412         /* force LE */
   2413         iorder[0] = 0;
   2414         iorder[1] = 1;
   2415         iorder[2] = 2;
   2416         iorder[3] = 3;
   2417     }
   2418     else if (byteorder == 1) {
   2419         /* force BE */
   2420         iorder[0] = 3;
   2421         iorder[1] = 2;
   2422         iorder[2] = 1;
   2423         iorder[3] = 0;
   2424     }
   2425 
            /* `size` counts down the units still to be read; s is the read
               cursor. */
   2426     while (size-- > 0) {
   2427         Py_UCS4 ch = *s++;
   2428 #ifndef Py_UNICODE_WIDE
                /* narrow build: fold a high/low surrogate pair into one code
                   point; `size > 0` guards the look-ahead at *s */
   2429         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
   2430             Py_UCS4 ch2 = *s;
   2431             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
   2432                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
   2433                 s++;
   2434                 size--;
   2435             }
   2436         }
   2437 #endif
   2438         STORECHAR(ch);
   2439     }
   2440     return v;
   2441 #undef STORECHAR
   2442 }
   2443 
   2444 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
   2445 {
            /* Return the UTF-32 encoding of the Unicode object `unicode`, as
               produced by PyUnicode_EncodeUTF32() with errors == NULL and
               byteorder == 0 (native order with a leading BOM).  If `unicode`
               fails PyUnicode_Check(), PyErr_BadArgument() is called and NULL
               is returned. */
   2446     if (!PyUnicode_Check(unicode)) {
   2447         PyErr_BadArgument();
   2448         return NULL;
   2449     }
   2450     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
   2451                                  PyUnicode_GET_SIZE(unicode),
   2452                                  NULL,
   2453                                  0);
   2454 }
   2455 
   2456 /* --- UTF-16 Codec ------------------------------------------------------- */
   2457 
   2458 PyObject *
   2459 PyUnicode_DecodeUTF16(const char *s,
   2460                       Py_ssize_t size,
   2461                       const char *errors,
   2462                       int *byteorder)
   2463 {
            /* Decode `size` bytes of UTF-16 at `s`.  Forwards every argument to
               PyUnicode_DecodeUTF16Stateful() with consumed == NULL and returns
               its result. */
   2464     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
   2465 }
   2466 
   2467 PyObject *
   2468 PyUnicode_DecodeUTF16Stateful(const char *s,
   2469                               Py_ssize_t size,
   2470                               const char *errors,
   2471                               int *byteorder,
   2472                               Py_ssize_t *consumed)
   2473 {
   2474     const char *starts = s;
   2475     Py_ssize_t startinpos;
   2476     Py_ssize_t endinpos;
   2477     Py_ssize_t outpos;
   2478     PyUnicodeObject *unicode;
   2479     Py_UNICODE *p;
   2480     const unsigned char *q, *e;
   2481     int bo = 0;       /* assume native ordering by default */
   2482     const char *errmsg = "";
   2483     /* Offsets from q for retrieving byte pairs in the right order. */
   2484 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
   2485     int ihi = 1, ilo = 0;
   2486 #else
   2487     int ihi = 0, ilo = 1;
   2488 #endif
   2489     PyObject *errorHandler = NULL;
   2490     PyObject *exc = NULL;
   2491 
   2492     /* Note: size will always be longer than the resulting Unicode
   2493        character count */
   2494     unicode = _PyUnicode_New(size);
   2495     if (!unicode)
   2496         return NULL;
   2497     if (size == 0)
   2498         return (PyObject *)unicode;
   2499 
   2500     /* Unpack UTF-16 encoded data */
   2501     p = unicode->str;
   2502     q = (unsigned char *)s;
   2503     e = q + size;
   2504 
   2505     if (byteorder)
   2506         bo = *byteorder;
   2507 
   2508     /* Check for BOM marks (U+FEFF) in the input and adjust current
   2509        byte order setting accordingly. In native mode, the leading BOM
   2510        mark is skipped, in all other modes, it is copied to the output
   2511        stream as-is (giving a ZWNBSP character). */
   2512     if (bo == 0) {
   2513         if (size >= 2) {
   2514             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
   2515 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
   2516             if (bom == 0xFEFF) {
   2517                 q += 2;
   2518                 bo = -1;
   2519             }
   2520             else if (bom == 0xFFFE) {
   2521                 q += 2;
   2522                 bo = 1;
   2523             }
   2524 #else
   2525             if (bom == 0xFEFF) {
   2526                 q += 2;
   2527                 bo = 1;
   2528             }
   2529             else if (bom == 0xFFFE) {
   2530                 q += 2;
   2531                 bo = -1;
   2532             }
   2533 #endif
   2534         }
   2535     }
   2536 
   2537     if (bo == -1) {
   2538         /* force LE */
   2539         ihi = 1;
   2540         ilo = 0;
   2541     }
   2542     else if (bo == 1) {
   2543         /* force BE */
   2544         ihi = 0;
   2545         ilo = 1;
   2546     }
   2547 
   2548     while (q < e) {
   2549         Py_UNICODE ch;
   2550         /* remaining bytes at the end? (size should be even) */
   2551         if (e-q<2) {
   2552             if (consumed)
   2553                 break;
   2554             errmsg = "truncated data";
   2555             startinpos = ((const char *)q)-starts;
   2556             endinpos = ((const char *)e)-starts;
   2557             goto utf16Error;
   2558             /* The remaining input chars are ignored if the callback
   2559                chooses to skip the input */
   2560         }
   2561         ch = (q[ihi] << 8) | q[ilo];
   2562 
   2563         q += 2;
   2564 
   2565         if (ch < 0xD800 || ch > 0xDFFF) {
   2566             *p++ = ch;
   2567             continue;
   2568         }
   2569 
   2570         /* UTF-16 code pair: */
   2571         if (q >= e) {
   2572             errmsg = "unexpected end of data";
   2573             startinpos = (((const char *)q)-2)-starts;
   2574             endinpos = ((const char *)e)-starts;
   2575             goto utf16Error;
   2576         }
   2577         if (0xD800 <= ch && ch <= 0xDBFF) {
   2578             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
   2579             q += 2;
   2580             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
   2581 #ifndef Py_UNICODE_WIDE
   2582                 *p++ = ch;
   2583                 *p++ = ch2;
   2584 #else
   2585                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
   2586 #endif
   2587                 continue;
   2588             }
   2589             else {
   2590                 errmsg = "illegal UTF-16 surrogate";
   2591                 startinpos = (((const char *)q)-4)-starts;
   2592                 endinpos = startinpos+2;
   2593                 goto utf16Error;
   2594             }
   2595 
   2596         }
   2597         errmsg = "illegal encoding";
   2598         startinpos = (((const char *)q)-2)-starts;
   2599         endinpos = startinpos+2;
   2600         /* Fall through to report the error */
   2601 
   2602       utf16Error:
   2603         outpos = p-PyUnicode_AS_UNICODE(unicode);
   2604         if (unicode_decode_call_errorhandler(
   2605                 errors, &errorHandler,
   2606                 "utf16", errmsg,
   2607                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
   2608                 &unicode, &outpos, &p))
   2609             goto onError;
   2610     }
   2611 
   2612     if (byteorder)
   2613         *byteorder = bo;
   2614 
   2615     if (consumed)
   2616         *consumed = (const char *)q-starts;
   2617 
   2618     /* Adjust length */
   2619     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
   2620         goto onError;
   2621 
   2622     Py_XDECREF(errorHandler);
   2623     Py_XDECREF(exc);
   2624     return (PyObject *)unicode;
   2625 
   2626   onError:
   2627     Py_DECREF(unicode);
   2628     Py_XDECREF(errorHandler);
   2629     Py_XDECREF(exc);
   2630     return NULL;
   2631 }
   2632 
   2633 PyObject *
   2634 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
   2635                       Py_ssize_t size,
   2636                       const char *errors,
   2637                       int byteorder)
   2638 {
   2639     PyObject *v;
   2640     unsigned char *p;
   2641     Py_ssize_t nsize, bytesize;
   2642 #ifdef Py_UNICODE_WIDE
   2643     Py_ssize_t i, pairs;
   2644 #else
   2645     const int pairs = 0;
   2646 #endif
   2647     /* Offsets from p for storing byte pairs in the right order. */
   2648 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
   2649     int ihi = 1, ilo = 0;
   2650 #else
   2651     int ihi = 0, ilo = 1;
   2652 #endif
   2653 
   2654 #define STORECHAR(CH)                           \
   2655     do {                                        \
   2656         p[ihi] = ((CH) >> 8) & 0xff;            \
   2657         p[ilo] = (CH) & 0xff;                   \
   2658         p += 2;                                 \
   2659     } while(0)
   2660 
   2661 #ifdef Py_UNICODE_WIDE
   2662     for (i = pairs = 0; i < size; i++)
   2663         if (s[i] >= 0x10000)
   2664             pairs++;
   2665 #endif
   2666     /* 2 * (size + pairs + (byteorder == 0)) */
   2667     if (size > PY_SSIZE_T_MAX ||
   2668         size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
   2669         return PyErr_NoMemory();
   2670     nsize = size + pairs + (byteorder == 0);
   2671     bytesize = nsize * 2;
   2672     if (bytesize / 2 != nsize)
   2673         return PyErr_NoMemory();
   2674     v = PyString_FromStringAndSize(NULL, bytesize);
   2675     if (v == NULL)
   2676         return NULL;
   2677 
   2678     p = (unsigned char *)PyString_AS_STRING(v);
   2679     if (byteorder == 0)
   2680         STORECHAR(0xFEFF);
   2681     if (size == 0)
   2682         return v;
   2683 
   2684     if (byteorder == -1) {
   2685         /* force LE */
   2686         ihi = 1;
   2687         ilo = 0;
   2688     }
   2689     else if (byteorder == 1) {
   2690         /* force BE */
   2691         ihi = 0;
   2692         ilo = 1;
   2693     }
   2694 
   2695     while (size-- > 0) {
   2696         Py_UNICODE ch = *s++;
   2697         Py_UNICODE ch2 = 0;
   2698 #ifdef Py_UNICODE_WIDE
   2699         if (ch >= 0x10000) {
   2700             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
   2701             ch  = 0xD800 | ((ch-0x10000) >> 10);
   2702         }
   2703 #endif
   2704         STORECHAR(ch);
   2705         if (ch2)
   2706             STORECHAR(ch2);
   2707     }
   2708     return v;
   2709 #undef STORECHAR
   2710 }
   2711 
   2712 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
   2713 {
   2714     if (!PyUnicode_Check(unicode)) {
   2715         PyErr_BadArgument();
   2716         return NULL;
   2717     }
   2718     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
   2719                                  PyUnicode_GET_SIZE(unicode),
   2720                                  NULL,
   2721                                  0);
   2722 }
   2723 
   2724 /* --- Unicode Escape Codec ----------------------------------------------- */
   2725 
   2726 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
   2727 
   2728 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
   2729                                         Py_ssize_t size,
   2730                                         const char *errors)
   2731 {
   2732     const char *starts = s;
   2733     Py_ssize_t startinpos;
   2734     Py_ssize_t endinpos;
   2735     Py_ssize_t outpos;
   2736     int i;
   2737     PyUnicodeObject *v;
   2738     Py_UNICODE *p;
   2739     const char *end;
   2740     char* message;
   2741     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
   2742     PyObject *errorHandler = NULL;
   2743     PyObject *exc = NULL;
   2744 
   2745     /* Escaped strings will always be longer than the resulting
   2746        Unicode string, so we start with size here and then reduce the
   2747        length after conversion to the true value.
   2748        (but if the error callback returns a long replacement string
   2749        we'll have to allocate more space) */
   2750     v = _PyUnicode_New(size);
   2751     if (v == NULL)
   2752         goto onError;
   2753     if (size == 0)
   2754         return (PyObject *)v;
   2755 
   2756     p = PyUnicode_AS_UNICODE(v);
   2757     end = s + size;
   2758 
   2759     while (s < end) {
   2760         unsigned char c;
   2761         Py_UNICODE x;
   2762         int digits;
   2763 
   2764         /* Non-escape characters are interpreted as Unicode ordinals */
   2765         if (*s != '\\') {
   2766             *p++ = (unsigned char) *s++;
   2767             continue;
   2768         }
   2769 
   2770         startinpos = s-starts;
   2771         /* \ - Escapes */
   2772         s++;
   2773         c = *s++;
   2774         if (s > end)
   2775             c = '\0'; /* Invalid after \ */
   2776         switch (c) {
   2777 
   2778             /* \x escapes */
   2779         case '\n': break;
   2780         case '\\': *p++ = '\\'; break;
   2781         case '\'': *p++ = '\''; break;
   2782         case '\"': *p++ = '\"'; break;
   2783         case 'b': *p++ = '\b'; break;
   2784         case 'f': *p++ = '\014'; break; /* FF */
   2785         case 't': *p++ = '\t'; break;
   2786         case 'n': *p++ = '\n'; break;
   2787         case 'r': *p++ = '\r'; break;
   2788         case 'v': *p++ = '\013'; break; /* VT */
   2789         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
   2790 
   2791             /* \OOO (octal) escapes */
   2792         case '0': case '1': case '2': case '3':
   2793         case '4': case '5': case '6': case '7':
   2794             x = s[-1] - '0';
   2795             if (s < end && '0' <= *s && *s <= '7') {
   2796                 x = (x<<3) + *s++ - '0';
   2797                 if (s < end && '0' <= *s && *s <= '7')
   2798                     x = (x<<3) + *s++ - '0';
   2799             }
   2800             *p++ = x;
   2801             break;
   2802 
   2803             /* hex escapes */
   2804             /* \xXX */
   2805         case 'x':
   2806             digits = 2;
   2807             message = "truncated \\xXX escape";
   2808             goto hexescape;
   2809 
   2810             /* \uXXXX */
   2811         case 'u':
   2812             digits = 4;
   2813             message = "truncated \\uXXXX escape";
   2814             goto hexescape;
   2815 
   2816             /* \UXXXXXXXX */
   2817         case 'U':
   2818             digits = 8;
   2819             message = "truncated \\UXXXXXXXX escape";
   2820         hexescape:
   2821             chr = 0;
   2822             outpos = p-PyUnicode_AS_UNICODE(v);
   2823             if (s+digits>end) {
   2824                 endinpos = size;
   2825                 if (unicode_decode_call_errorhandler(
   2826                         errors, &errorHandler,
   2827                         "unicodeescape", "end of string in escape sequence",
   2828                         starts, size, &startinpos, &endinpos, &exc, &s,
   2829                         &v, &outpos, &p))
   2830                     goto onError;
   2831                 goto nextByte;
   2832             }
   2833             for (i = 0; i < digits; ++i) {
   2834                 c = (unsigned char) s[i];
   2835                 if (!isxdigit(c)) {
   2836                     endinpos = (s+i+1)-starts;
   2837                     if (unicode_decode_call_errorhandler(
   2838                             errors, &errorHandler,
   2839                             "unicodeescape", message,
   2840                             starts, size, &startinpos, &endinpos, &exc, &s,
   2841                             &v, &outpos, &p))
   2842                         goto onError;
   2843                     goto nextByte;
   2844                 }
   2845                 chr = (chr<<4) & ~0xF;
   2846                 if (c >= '0' && c <= '9')
   2847                     chr += c - '0';
   2848                 else if (c >= 'a' && c <= 'f')
   2849                     chr += 10 + c - 'a';
   2850                 else
   2851                     chr += 10 + c - 'A';
   2852             }
   2853             s += i;
   2854             if (chr == 0xffffffff && PyErr_Occurred())
   2855                 /* _decoding_error will have already written into the
   2856                    target buffer. */
   2857                 break;
   2858         store:
   2859             /* when we get here, chr is a 32-bit unicode character */
   2860             if (chr <= 0xffff)
   2861                 /* UCS-2 character */
   2862                 *p++ = (Py_UNICODE) chr;
   2863             else if (chr <= 0x10ffff) {
   2864                 /* UCS-4 character. Either store directly, or as
   2865                    surrogate pair. */
   2866 #ifdef Py_UNICODE_WIDE
   2867                 *p++ = chr;
   2868 #else
   2869                 chr -= 0x10000L;
   2870                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
   2871                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
   2872 #endif
   2873             } else {
   2874                 endinpos = s-starts;
   2875                 outpos = p-PyUnicode_AS_UNICODE(v);
   2876                 if (unicode_decode_call_errorhandler(
   2877                         errors, &errorHandler,
   2878                         "unicodeescape", "illegal Unicode character",
   2879                         starts, size, &startinpos, &endinpos, &exc, &s,
   2880                         &v, &outpos, &p))
   2881                     goto onError;
   2882             }
   2883             break;
   2884 
   2885             /* \N{name} */
   2886         case 'N':
   2887             message = "malformed \\N character escape";
   2888             if (ucnhash_CAPI == NULL) {
   2889                 /* load the unicode data module */
   2890                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
   2891                 if (ucnhash_CAPI == NULL)
   2892                     goto ucnhashError;
   2893             }
   2894             if (*s == '{') {
   2895                 const char *start = s+1;
   2896                 /* look for the closing brace */
   2897                 while (*s != '}' && s < end)
   2898                     s++;
   2899                 if (s > start && s < end && *s == '}') {
   2900                     /* found a name.  look it up in the unicode database */
   2901                     message = "unknown Unicode character name";
   2902                     s++;
   2903                     if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
   2904                         goto store;
   2905                 }
   2906             }
   2907             endinpos = s-starts;
   2908             outpos = p-PyUnicode_AS_UNICODE(v);
   2909             if (unicode_decode_call_errorhandler(
   2910                     errors, &errorHandler,
   2911                     "unicodeescape", message,
   2912                     starts, size, &startinpos, &endinpos, &exc, &s,
   2913                     &v, &outpos, &p))
   2914                 goto onError;
   2915             break;
   2916 
   2917         default:
   2918             if (s > end) {
   2919                 message = "\\ at end of string";
   2920                 s--;
   2921                 endinpos = s-starts;
   2922                 outpos = p-PyUnicode_AS_UNICODE(v);
   2923                 if (unicode_decode_call_errorhandler(
   2924                         errors, &errorHandler,
   2925                         "unicodeescape", message,
   2926                         starts, size, &startinpos, &endinpos, &exc, &s,
   2927                         &v, &outpos, &p))
   2928                     goto onError;
   2929             }
   2930             else {
   2931                 *p++ = '\\';
   2932                 *p++ = (unsigned char)s[-1];
   2933             }
   2934             break;
   2935         }
   2936       nextByte:
   2937         ;
   2938     }
   2939     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
   2940         goto onError;
   2941     Py_XDECREF(errorHandler);
   2942     Py_XDECREF(exc);
   2943     return (PyObject *)v;
   2944 
   2945   ucnhashError:
   2946     PyErr_SetString(
   2947         PyExc_UnicodeError,
   2948         "\\N escapes not supported (can't load unicodedata module)"
   2949         );
   2950     Py_XDECREF(v);
   2951     Py_XDECREF(errorHandler);
   2952     Py_XDECREF(exc);
   2953     return NULL;
   2954 
   2955   onError:
   2956     Py_XDECREF(v);
   2957     Py_XDECREF(errorHandler);
   2958     Py_XDECREF(exc);
   2959     return NULL;
   2960 }
   2961 
   2962 /* Return a Unicode-Escape string version of the Unicode object.
   2963 
   2964    If quotes is true, the string is enclosed in u"" or u'' quotes as
   2965    appropriate.
   2966 
   2967 */
   2968 
   2969 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
   2970                                              Py_ssize_t size,
   2971                                              Py_UNICODE ch)
   2972 {
   2973     /* like wcschr, but doesn't stop at NULL characters */
   2974 
   2975     while (size-- > 0) {
   2976         if (*s == ch)
   2977             return s;
   2978         s++;
   2979     }
   2980 
   2981     return NULL;
   2982 }
   2983 
   2984 static
   2985 PyObject *unicodeescape_string(const Py_UNICODE *s,
   2986                                Py_ssize_t size,
   2987                                int quotes)
   2988 {
   2989     PyObject *repr;
   2990     char *p;
   2991 
   2992     static const char *hexdigit = "0123456789abcdef";
   2993 #ifdef Py_UNICODE_WIDE
   2994     const Py_ssize_t expandsize = 10;
   2995 #else
   2996     const Py_ssize_t expandsize = 6;
   2997 #endif
   2998 
   2999     /* XXX(nnorwitz): rather than over-allocating, it would be
   3000        better to choose a different scheme.  Perhaps scan the
   3001        first N-chars of the string and allocate based on that size.
   3002     */
   3003     /* Initial allocation is based on the longest-possible unichr
   3004        escape.
   3005 
   3006        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
   3007        unichr, so in this case it's the longest unichr escape. In
   3008        narrow (UTF-16) builds this is five chars per source unichr
   3009        since there are two unichrs in the surrogate pair, so in narrow
   3010        (UTF-16) builds it's not the longest unichr escape.
   3011 
   3012        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
   3013        so in the narrow (UTF-16) build case it's the longest unichr
   3014        escape.
   3015     */
   3016 
   3017     if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
   3018         return PyErr_NoMemory();
   3019 
   3020     repr = PyString_FromStringAndSize(NULL,
   3021                                       2
   3022                                       + expandsize*size
   3023                                       + 1);
   3024     if (repr == NULL)
   3025         return NULL;
   3026 
   3027     p = PyString_AS_STRING(repr);
   3028 
   3029     if (quotes) {
   3030         *p++ = 'u';
   3031         *p++ = (findchar(s, size, '\'') &&
   3032                 !findchar(s, size, '"')) ? '"' : '\'';
   3033     }
   3034     while (size-- > 0) {
   3035         Py_UNICODE ch = *s++;
   3036 
   3037         /* Escape quotes and backslashes */
   3038         if ((quotes &&
   3039              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
   3040             *p++ = '\\';
   3041             *p++ = (char) ch;
   3042             continue;
   3043         }
   3044 
   3045 #ifdef Py_UNICODE_WIDE
   3046         /* Map 21-bit characters to '\U00xxxxxx' */
   3047         else if (ch >= 0x10000) {
   3048             *p++ = '\\';
   3049             *p++ = 'U';
   3050             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
   3051             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
   3052             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
   3053             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
   3054             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
   3055             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
   3056             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
   3057             *p++ = hexdigit[ch & 0x0000000F];
   3058             continue;
   3059         }
   3060 #else
   3061         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
   3062         else if (ch >= 0xD800 && ch < 0xDC00) {
   3063             Py_UNICODE ch2;
   3064             Py_UCS4 ucs;
   3065 
   3066             ch2 = *s++;
   3067             size--;
   3068             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
   3069                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
   3070                 *p++ = '\\';
   3071                 *p++ = 'U';
   3072                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
   3073                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
   3074                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
   3075                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
   3076                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
   3077                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
   3078                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
   3079                 *p++ = hexdigit[ucs & 0x0000000F];
   3080                 continue;
   3081             }
   3082             /* Fall through: isolated surrogates are copied as-is */
   3083             s--;
   3084             size++;
   3085         }
   3086 #endif
   3087 
   3088         /* Map 16-bit characters to '\uxxxx' */
   3089         if (ch >= 256) {
   3090             *p++ = '\\';
   3091             *p++ = 'u';
   3092             *p++ = hexdigit[(ch >> 12) & 0x000F];
   3093             *p++ = hexdigit[(ch >> 8) & 0x000F];
   3094             *p++ = hexdigit[(ch >> 4) & 0x000F];
   3095             *p++ = hexdigit[ch & 0x000F];
   3096         }
   3097 
   3098         /* Map special whitespace to '\t', \n', '\r' */
   3099         else if (ch == '\t') {
   3100             *p++ = '\\';
   3101             *p++ = 't';
   3102         }
   3103         else if (ch == '\n') {
   3104             *p++ = '\\';
   3105             *p++ = 'n';
   3106         }
   3107         else if (ch == '\r') {
   3108             *p++ = '\\';
   3109             *p++ = 'r';
   3110         }
   3111 
   3112         /* Map non-printable US ASCII to '\xhh' */
   3113         else if (ch < ' ' || ch >= 0x7F) {
   3114             *p++ = '\\';
   3115             *p++ = 'x';
   3116             *p++ = hexdigit[(ch >> 4) & 0x000F];
   3117             *p++ = hexdigit[ch & 0x000F];
   3118         }
   3119 
   3120         /* Copy everything else as-is */
   3121         else
   3122             *p++ = (char) ch;
   3123     }
   3124     if (quotes)
   3125         *p++ = PyString_AS_STRING(repr)[1];
   3126 
   3127     *p = '\0';
   3128     if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
   3129         return NULL;
   3130     return repr;
   3131 }
   3132 
   3133 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
   3134                                         Py_ssize_t size)
   3135 {
   3136     return unicodeescape_string(s, size, 0);
   3137 }
   3138 
   3139 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
   3140 {
   3141     if (!PyUnicode_Check(unicode)) {
   3142         PyErr_BadArgument();
   3143         return NULL;
   3144     }
   3145     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
   3146                                          PyUnicode_GET_SIZE(unicode));
   3147 }
   3148 
   3149 /* --- Raw Unicode Escape Codec ------------------------------------------- */
   3150 
   3151 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
   3152                                            Py_ssize_t size,
   3153                                            const char *errors)
   3154 {
   3155     const char *starts = s;
   3156     Py_ssize_t startinpos;
   3157     Py_ssize_t endinpos;
   3158     Py_ssize_t outpos;
   3159     PyUnicodeObject *v;
   3160     Py_UNICODE *p;
   3161     const char *end;
   3162     const char *bs;
   3163     PyObject *errorHandler = NULL;
   3164     PyObject *exc = NULL;
   3165 
   3166     /* Escaped strings will always be longer than the resulting
   3167        Unicode string, so we start with size here and then reduce the
   3168        length after conversion to the true value. (But decoding error
   3169        handler might have to resize the string) */
   3170     v = _PyUnicode_New(size);
   3171     if (v == NULL)
   3172         goto onError;
   3173     if (size == 0)
   3174         return (PyObject *)v;
   3175     p = PyUnicode_AS_UNICODE(v);
   3176     end = s + size;
   3177     while (s < end) {
   3178         unsigned char c;
   3179         Py_UCS4 x;
   3180         int i;
   3181         int count;
   3182 
   3183         /* Non-escape characters are interpreted as Unicode ordinals */
   3184         if (*s != '\\') {
   3185             *p++ = (unsigned char)*s++;
   3186             continue;
   3187         }
   3188         startinpos = s-starts;
   3189 
   3190         /* \u-escapes are only interpreted iff the number of leading
   3191            backslashes if odd */
   3192         bs = s;
   3193         for (;s < end;) {
   3194             if (*s != '\\')
   3195                 break;
   3196             *p++ = (unsigned char)*s++;
   3197         }
   3198         if (((s - bs) & 1) == 0 ||
   3199             s >= end ||
   3200             (*s != 'u' && *s != 'U')) {
   3201             continue;
   3202         }
   3203         p--;
   3204         count = *s=='u' ? 4 : 8;
   3205         s++;
   3206 
   3207         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
   3208         outpos = p-PyUnicode_AS_UNICODE(v);
   3209         for (x = 0, i = 0; i < count; ++i, ++s) {
   3210             c = (unsigned char)*s;
   3211             if (!isxdigit(c)) {
   3212                 endinpos = s-starts;
   3213                 if (unicode_decode_call_errorhandler(
   3214                         errors, &errorHandler,
   3215                         "rawunicodeescape", "truncated \\uXXXX",
   3216                         starts, size, &startinpos, &endinpos, &exc, &s,
   3217                         &v, &outpos, &p))
   3218                     goto onError;
   3219                 goto nextByte;
   3220             }
   3221             x = (x<<4) & ~0xF;
   3222             if (c >= '0' && c <= '9')
   3223                 x += c - '0';
   3224             else if (c >= 'a' && c <= 'f')
   3225                 x += 10 + c - 'a';
   3226             else
   3227                 x += 10 + c - 'A';
   3228         }
   3229         if (x <= 0xffff)
   3230             /* UCS-2 character */
   3231             *p++ = (Py_UNICODE) x;
   3232         else if (x <= 0x10ffff) {
   3233             /* UCS-4 character. Either store directly, or as
   3234                surrogate pair. */
   3235 #ifdef Py_UNICODE_WIDE
   3236             *p++ = (Py_UNICODE) x;
   3237 #else
   3238             x -= 0x10000L;
   3239             *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
   3240             *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
   3241 #endif
   3242         } else {
   3243             endinpos = s-starts;
   3244             outpos = p-PyUnicode_AS_UNICODE(v);
   3245             if (unicode_decode_call_errorhandler(
   3246                     errors, &errorHandler,
   3247                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
   3248                     starts, size, &startinpos, &endinpos, &exc, &s,
   3249                     &v, &outpos, &p))
   3250                 goto onError;
   3251         }
   3252       nextByte:
   3253         ;
   3254     }
   3255     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
   3256         goto onError;
   3257     Py_XDECREF(errorHandler);
   3258     Py_XDECREF(exc);
   3259     return (PyObject *)v;
   3260 
   3261   onError:
   3262     Py_XDECREF(v);
   3263     Py_XDECREF(errorHandler);
   3264     Py_XDECREF(exc);
   3265     return NULL;
   3266 }
   3267 
   3268 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
   3269                                            Py_ssize_t size)
   3270 {
   3271     PyObject *repr;
   3272     char *p;
   3273     char *q;
   3274 
   3275     static const char *hexdigit = "0123456789abcdef";
   3276 #ifdef Py_UNICODE_WIDE
   3277     const Py_ssize_t expandsize = 10;
   3278 #else
   3279     const Py_ssize_t expandsize = 6;
   3280 #endif
   3281 
   3282     if (size > PY_SSIZE_T_MAX / expandsize)
   3283         return PyErr_NoMemory();
   3284 
   3285     repr = PyString_FromStringAndSize(NULL, expandsize * size);
   3286     if (repr == NULL)
   3287         return NULL;
   3288     if (size == 0)
   3289         return repr;
   3290 
   3291     p = q = PyString_AS_STRING(repr);
   3292     while (size-- > 0) {
   3293         Py_UNICODE ch = *s++;
   3294 #ifdef Py_UNICODE_WIDE
   3295         /* Map 32-bit characters to '\Uxxxxxxxx' */
   3296         if (ch >= 0x10000) {
   3297             *p++ = '\\';
   3298             *p++ = 'U';
   3299             *p++ = hexdigit[(ch >> 28) & 0xf];
   3300             *p++ = hexdigit[(ch >> 24) & 0xf];
   3301             *p++ = hexdigit[(ch >> 20) & 0xf];
   3302             *p++ = hexdigit[(ch >> 16) & 0xf];
   3303             *p++ = hexdigit[(ch >> 12) & 0xf];
   3304             *p++ = hexdigit[(ch >> 8) & 0xf];
   3305             *p++ = hexdigit[(ch >> 4) & 0xf];
   3306             *p++ = hexdigit[ch & 15];
   3307         }
   3308         else
   3309 #else
   3310             /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
   3311             if (ch >= 0xD800 && ch < 0xDC00) {
   3312                 Py_UNICODE ch2;
   3313                 Py_UCS4 ucs;
   3314 
   3315                 ch2 = *s++;
   3316                 size--;
   3317                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
   3318                     ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
   3319                     *p++ = '\\';
   3320                     *p++ = 'U';
   3321                     *p++ = hexdigit[(ucs >> 28) & 0xf];
   3322                     *p++ = hexdigit[(ucs >> 24) & 0xf];
   3323                     *p++ = hexdigit[(ucs >> 20) & 0xf];
   3324                     *p++ = hexdigit[(ucs >> 16) & 0xf];
   3325                     *p++ = hexdigit[(ucs >> 12) & 0xf];
   3326                     *p++ = hexdigit[(ucs >> 8) & 0xf];
   3327                     *p++ = hexdigit[(ucs >> 4) & 0xf];
   3328                     *p++ = hexdigit[ucs & 0xf];
   3329                     continue;
   3330                 }
   3331                 /* Fall through: isolated surrogates are copied as-is */
   3332                 s--;
   3333                 size++;
   3334             }
   3335 #endif
   3336         /* Map 16-bit characters to '\uxxxx' */
   3337         if (ch >= 256) {
   3338             *p++ = '\\';
   3339             *p++ = 'u';
   3340             *p++ = hexdigit[(ch >> 12) & 0xf];
   3341             *p++ = hexdigit[(ch >> 8) & 0xf];
   3342             *p++ = hexdigit[(ch >> 4) & 0xf];
   3343             *p++ = hexdigit[ch & 15];
   3344         }
   3345         /* Copy everything else as-is */
   3346         else
   3347             *p++ = (char) ch;
   3348     }
   3349     *p = '\0';
   3350     if (_PyString_Resize(&repr, p - q))
   3351         return NULL;
   3352     return repr;
   3353 }
   3354 
   3355 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
   3356 {
   3357     if (!PyUnicode_Check(unicode)) {
   3358         PyErr_BadArgument();
   3359         return NULL;
   3360     }
   3361     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
   3362                                             PyUnicode_GET_SIZE(unicode));
   3363 }
   3364 
   3365 /* --- Unicode Internal Codec ------------------------------------------- */
   3366 
   3367 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
   3368                                            Py_ssize_t size,
   3369                                            const char *errors)
   3370 {
   3371     const char *starts = s;
   3372     Py_ssize_t startinpos;
   3373     Py_ssize_t endinpos;
   3374     Py_ssize_t outpos;
   3375     PyUnicodeObject *v;
   3376     Py_UNICODE *p;
   3377     const char *end;
   3378     const char *reason;
   3379     PyObject *errorHandler = NULL;
   3380     PyObject *exc = NULL;
   3381 
   3382 #ifdef Py_UNICODE_WIDE
   3383     Py_UNICODE unimax = PyUnicode_GetMax();
   3384 #endif
   3385 
   3386     /* XXX overflow detection missing */
   3387     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
   3388     if (v == NULL)
   3389         goto onError;
   3390     if (PyUnicode_GetSize((PyObject *)v) == 0)
   3391         return (PyObject *)v;
   3392     p = PyUnicode_AS_UNICODE(v);
   3393     end = s + size;
   3394 
   3395     while (s < end) {
   3396         memcpy(p, s, sizeof(Py_UNICODE));
   3397         /* We have to sanity check the raw data, otherwise doom looms for
   3398            some malformed UCS-4 data. */
   3399         if (
   3400 #ifdef Py_UNICODE_WIDE
   3401             *p > unimax || *p < 0 ||
   3402 #endif
   3403             end-s < Py_UNICODE_SIZE
   3404             )
   3405         {
   3406             startinpos = s - starts;
   3407             if (end-s < Py_UNICODE_SIZE) {
   3408                 endinpos = end-starts;
   3409                 reason = "truncated input";
   3410             }
   3411             else {
   3412                 endinpos = s - starts + Py_UNICODE_SIZE;
   3413                 reason = "illegal code point (> 0x10FFFF)";
   3414             }
   3415             outpos = p - PyUnicode_AS_UNICODE(v);
   3416             if (unicode_decode_call_errorhandler(
   3417                     errors, &errorHandler,
   3418                     "unicode_internal", reason,
   3419                     starts, size, &startinpos, &endinpos, &exc, &s,
   3420                     &v, &outpos, &p)) {
   3421                 goto onError;
   3422             }
   3423         }
   3424         else {
   3425             p++;
   3426             s += Py_UNICODE_SIZE;
   3427         }
   3428     }
   3429 
   3430     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
   3431         goto onError;
   3432     Py_XDECREF(errorHandler);
   3433     Py_XDECREF(exc);
   3434     return (PyObject *)v;
   3435 
   3436   onError:
   3437     Py_XDECREF(v);
   3438     Py_XDECREF(errorHandler);
   3439     Py_XDECREF(exc);
   3440     return NULL;
   3441 }
   3442 
   3443 /* --- Latin-1 Codec ------------------------------------------------------ */
   3444 
   3445 PyObject *PyUnicode_DecodeLatin1(const char *s,
   3446                                  Py_ssize_t size,
   3447                                  const char *errors)
   3448 {
   3449     PyUnicodeObject *v;
   3450     Py_UNICODE *p;
   3451 
   3452     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
   3453     if (size == 1) {
   3454         Py_UNICODE r = *(unsigned char*)s;
   3455         return PyUnicode_FromUnicode(&r, 1);
   3456     }
   3457 
   3458     v = _PyUnicode_New(size);
   3459     if (v == NULL)
   3460         goto onError;
   3461     if (size == 0)
   3462         return (PyObject *)v;
   3463     p = PyUnicode_AS_UNICODE(v);
   3464     while (size-- > 0)
   3465         *p++ = (unsigned char)*s++;
   3466     return (PyObject *)v;
   3467 
   3468   onError:
   3469     Py_XDECREF(v);
   3470     return NULL;
   3471 }
   3472 
   3473 /* create or adjust a UnicodeEncodeError */
   3474 static void make_encode_exception(PyObject **exceptionObject,
   3475                                   const char *encoding,
   3476                                   const Py_UNICODE *unicode, Py_ssize_t size,
   3477                                   Py_ssize_t startpos, Py_ssize_t endpos,
   3478                                   const char *reason)
   3479 {
   3480     if (*exceptionObject == NULL) {
   3481         *exceptionObject = PyUnicodeEncodeError_Create(
   3482             encoding, unicode, size, startpos, endpos, reason);
   3483     }
   3484     else {
   3485         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
   3486             goto onError;
   3487         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
   3488             goto onError;
   3489         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
   3490             goto onError;
   3491         return;
   3492       onError:
   3493         Py_DECREF(*exceptionObject);
   3494         *exceptionObject = NULL;
   3495     }
   3496 }
   3497 
   3498 /* raises a UnicodeEncodeError */
   3499 static void raise_encode_exception(PyObject **exceptionObject,
   3500                                    const char *encoding,
   3501                                    const Py_UNICODE *unicode, Py_ssize_t size,
   3502                                    Py_ssize_t startpos, Py_ssize_t endpos,
   3503                                    const char *reason)
   3504 {
   3505     make_encode_exception(exceptionObject,
   3506                           encoding, unicode, size, startpos, endpos, reason);
   3507     if (*exceptionObject != NULL)
   3508         PyCodec_StrictErrors(*exceptionObject);
   3509 }
   3510 
   3511 /* error handling callback helper:
   3512    build arguments, call the callback and check the arguments,
   3513    put the result into newpos and return the replacement string, which
   3514    has to be freed by the caller */
   3515 static PyObject *unicode_encode_call_errorhandler(const char *errors,
   3516                                                   PyObject **errorHandler,
   3517                                                   const char *encoding, const char *reason,
   3518                                                   const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
   3519                                                   Py_ssize_t startpos, Py_ssize_t endpos,
   3520                                                   Py_ssize_t *newpos)
   3521 {
   3522     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
   3523 
   3524     PyObject *restuple;
   3525     PyObject *resunicode;
   3526 
   3527     if (*errorHandler == NULL) {
   3528         *errorHandler = PyCodec_LookupError(errors);
   3529         if (*errorHandler == NULL)
   3530             return NULL;
   3531     }
   3532 
   3533     make_encode_exception(exceptionObject,
   3534                           encoding, unicode, size, startpos, endpos, reason);
   3535     if (*exceptionObject == NULL)
   3536         return NULL;
   3537 
   3538     restuple = PyObject_CallFunctionObjArgs(
   3539         *errorHandler, *exceptionObject, NULL);
   3540     if (restuple == NULL)
   3541         return NULL;
   3542     if (!PyTuple_Check(restuple)) {
   3543         PyErr_SetString(PyExc_TypeError, &argparse[4]);
   3544         Py_DECREF(restuple);
   3545         return NULL;
   3546     }
   3547     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
   3548                           &resunicode, newpos)) {
   3549         Py_DECREF(restuple);
   3550         return NULL;
   3551     }
   3552     if (*newpos<0)
   3553         *newpos = size+*newpos;
   3554     if (*newpos<0 || *newpos>size) {
   3555         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
   3556         Py_DECREF(restuple);
   3557         return NULL;
   3558     }
   3559     Py_INCREF(resunicode);
   3560     Py_DECREF(restuple);
   3561     return resunicode;
   3562 }
   3563 
   3564 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
   3565                                      Py_ssize_t size,
   3566                                      const char *errors,
   3567                                      int limit)
   3568 {
   3569     /* output object */
   3570     PyObject *res;
   3571     /* pointers to the beginning and end+1 of input */
   3572     const Py_UNICODE *startp = p;
   3573     const Py_UNICODE *endp = p + size;
   3574     /* pointer to the beginning of the unencodable characters */
   3575     /* const Py_UNICODE *badp = NULL; */
   3576     /* pointer into the output */
   3577     char *str;
   3578     /* current output position */
   3579     Py_ssize_t respos = 0;
   3580     Py_ssize_t ressize;
   3581     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
   3582     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
   3583     PyObject *errorHandler = NULL;
   3584     PyObject *exc = NULL;
   3585     /* the following variable is used for caching string comparisons
   3586      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
   3587     int known_errorHandler = -1;
   3588 
   3589     /* allocate enough for a simple encoding without
   3590        replacements, if we need more, we'll resize */
   3591     res = PyString_FromStringAndSize(NULL, size);
   3592     if (res == NULL)
   3593         goto onError;
   3594     if (size == 0)
   3595         return res;
   3596     str = PyString_AS_STRING(res);
   3597     ressize = size;
   3598 
   3599     while (p<endp) {
   3600         Py_UNICODE c = *p;
   3601 
   3602         /* can we encode this? */
   3603         if (c<limit) {
   3604             /* no overflow check, because we know that the space is enough */
   3605             *str++ = (char)c;
   3606             ++p;
   3607         }
   3608         else {
   3609             Py_ssize_t unicodepos = p-startp;
   3610             Py_ssize_t requiredsize;
   3611             PyObject *repunicode;
   3612             Py_ssize_t repsize;
   3613             Py_ssize_t newpos;
   3614             Py_ssize_t respos;
   3615             Py_UNICODE *uni2;
   3616             /* startpos for collecting unencodable chars */
   3617             const Py_UNICODE *collstart = p;
   3618             const Py_UNICODE *collend = p;
   3619             /* find all unecodable characters */
   3620             while ((collend < endp) && ((*collend)>=limit))
   3621                 ++collend;
   3622             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
   3623             if (known_errorHandler==-1) {
   3624                 if ((errors==NULL) || (!strcmp(errors, "strict")))
   3625                     known_errorHandler = 1;
   3626                 else if (!strcmp(errors, "replace"))
   3627                     known_errorHandler = 2;
   3628                 else if (!strcmp(errors, "ignore"))
   3629                     known_errorHandler = 3;
   3630                 else if (!strcmp(errors, "xmlcharrefreplace"))
   3631                     known_errorHandler = 4;
   3632                 else
   3633                     known_errorHandler = 0;
   3634             }
   3635             switch (known_errorHandler) {
   3636             case 1: /* strict */
   3637                 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
   3638                 goto onError;
   3639             case 2: /* replace */
   3640                 while (collstart++<collend)
   3641                     *str++ = '?'; /* fall through */
   3642             case 3: /* ignore */
   3643                 p = collend;
   3644                 break;
   3645             case 4: /* xmlcharrefreplace */
   3646                 respos = str-PyString_AS_STRING(res);
   3647                 /* determine replacement size (temporarily (mis)uses p) */
   3648                 for (p = collstart, repsize = 0; p < collend; ++p) {
   3649                     if (*p<10)
   3650                         repsize += 2+1+1;
   3651                     else if (*p<100)
   3652                         repsize += 2+2+1;
   3653                     else if (*p<1000)
   3654                         repsize += 2+3+1;
   3655                     else if (*p<10000)
   3656                         repsize += 2+4+1;
   3657 #ifndef Py_UNICODE_WIDE
   3658                     else
   3659                         repsize += 2+5+1;
   3660 #else
   3661                     else if (*p<100000)
   3662                         repsize += 2+5+1;
   3663                     else if (*p<1000000)
   3664                         repsize += 2+6+1;
   3665                     else
   3666                         repsize += 2+7+1;
   3667 #endif
   3668                 }
   3669                 requiredsize = respos+repsize+(endp-collend);
   3670                 if (requiredsize > ressize) {
   3671                     if (requiredsize<2*ressize)
   3672                         requiredsize = 2*ressize;
   3673                     if (_PyString_Resize(&res, requiredsize))
   3674                         goto onError;
   3675                     str = PyString_AS_STRING(res) + respos;
   3676                     ressize = requiredsize;
   3677                 }
   3678                 /* generate replacement (temporarily (mis)uses p) */
   3679                 for (p = collstart; p < collend; ++p) {
   3680                     str += sprintf(str, "&#%d;", (int)*p);
   3681                 }
   3682                 p = collend;
   3683                 break;
   3684             default:
   3685                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
   3686                                                               encoding, reason, startp, size, &exc,
   3687                                                               collstart-startp, collend-startp, &newpos);
   3688                 if (repunicode == NULL)
   3689                     goto onError;
   3690                 /* need more space? (at least enough for what we have+the
   3691                    replacement+the rest of the string, so we won't have to
   3692                    check space for encodable characters) */
   3693                 respos = str-PyString_AS_STRING(res);
   3694                 repsize = PyUnicode_GET_SIZE(repunicode);
   3695                 requiredsize = respos+repsize+(endp-collend);
   3696                 if (requiredsize > ressize) {
   3697                     if (requiredsize<2*ressize)
   3698                         requiredsize = 2*ressize;
   3699                     if (_PyString_Resize(&res, requiredsize)) {
   3700                         Py_DECREF(repunicode);
   3701                         goto onError;
   3702                     }
   3703                     str = PyString_AS_STRING(res) + respos;
   3704                     ressize = requiredsize;
   3705                 }
   3706                 /* check if there is anything unencodable in the replacement
   3707                    and copy it to the output */
   3708                 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
   3709                     c = *uni2;
   3710                     if (c >= limit) {
   3711                         raise_encode_exception(&exc, encoding, startp, size,
   3712                                                unicodepos, unicodepos+1, reason);
   3713                         Py_DECREF(repunicode);
   3714                         goto onError;
   3715                     }
   3716                     *str = (char)c;
   3717                 }
   3718                 p = startp + newpos;
   3719                 Py_DECREF(repunicode);
   3720             }
   3721         }
   3722     }
   3723     /* Resize if we allocated to much */
   3724     respos = str-PyString_AS_STRING(res);
   3725     if (respos<ressize)
   3726         /* If this falls res will be NULL */
   3727         _PyString_Resize(&res, respos);
   3728     Py_XDECREF(errorHandler);
   3729     Py_XDECREF(exc);
   3730     return res;
   3731 
   3732   onError:
   3733     Py_XDECREF(res);
   3734     Py_XDECREF(errorHandler);
   3735     Py_XDECREF(exc);
   3736     return NULL;
   3737 }
   3738 
   3739 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
   3740                                  Py_ssize_t size,
   3741                                  const char *errors)
   3742 {
   3743     return unicode_encode_ucs1(p, size, errors, 256);
   3744 }
   3745 
   3746 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
   3747 {
   3748     if (!PyUnicode_Check(unicode)) {
   3749         PyErr_BadArgument();
   3750         return NULL;
   3751     }
   3752     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
   3753                                   PyUnicode_GET_SIZE(unicode),
   3754                                   NULL);
   3755 }
   3756 
   3757 /* --- 7-bit ASCII Codec -------------------------------------------------- */
   3758 
   3759 PyObject *PyUnicode_DecodeASCII(const char *s,
   3760                                 Py_ssize_t size,
   3761                                 const char *errors)
   3762 {
   3763     const char *starts = s;
   3764     PyUnicodeObject *v;
   3765     Py_UNICODE *p;
   3766     Py_ssize_t startinpos;
   3767     Py_ssize_t endinpos;
   3768     Py_ssize_t outpos;
   3769     const char *e;
   3770     PyObject *errorHandler = NULL;
   3771     PyObject *exc = NULL;
   3772 
   3773     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
   3774     if (size == 1 && *(unsigned char*)s < 128) {
   3775         Py_UNICODE r = *(unsigned char*)s;
   3776         return PyUnicode_FromUnicode(&r, 1);
   3777     }
   3778 
   3779     v = _PyUnicode_New(size);
   3780     if (v == NULL)
   3781         goto onError;
   3782     if (size == 0)
   3783         return (PyObject *)v;
   3784     p = PyUnicode_AS_UNICODE(v);
   3785     e = s + size;
   3786     while (s < e) {
   3787         register unsigned char c = (unsigned char)*s;
   3788         if (c < 128) {
   3789             *p++ = c;
   3790             ++s;
   3791         }
   3792         else {
   3793             startinpos = s-starts;
   3794             endinpos = startinpos + 1;
   3795             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
   3796             if (unicode_decode_call_errorhandler(
   3797                     errors, &errorHandler,
   3798                     "ascii", "ordinal not in range(128)",
   3799                     starts, size, &startinpos, &endinpos, &exc, &s,
   3800                     &v, &outpos, &p))
   3801                 goto onError;
   3802         }
   3803     }
   3804     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
   3805         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
   3806             goto onError;
   3807     Py_XDECREF(errorHandler);
   3808     Py_XDECREF(exc);
   3809     return (PyObject *)v;
   3810 
   3811   onError:
   3812     Py_XDECREF(v);
   3813     Py_XDECREF(errorHandler);
   3814     Py_XDECREF(exc);
   3815     return NULL;
   3816 }
   3817 
   3818 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
   3819                                 Py_ssize_t size,
   3820                                 const char *errors)
   3821 {
   3822     return unicode_encode_ucs1(p, size, errors, 128);
   3823 }
   3824 
   3825 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
   3826 {
   3827     if (!PyUnicode_Check(unicode)) {
   3828         PyErr_BadArgument();
   3829         return NULL;
   3830     }
   3831     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
   3832                                  PyUnicode_GET_SIZE(unicode),
   3833                                  NULL);
   3834 }
   3835 
   3836 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
   3837 
   3838 /* --- MBCS codecs for Windows -------------------------------------------- */
   3839 
   3840 #if SIZEOF_INT < SIZEOF_SIZE_T
   3841 #define NEED_RETRY
   3842 #endif
   3843 
   3844 /* XXX This code is limited to "true" double-byte encodings, as
   3845    a) it assumes an incomplete character consists of a single byte, and
   3846    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
   3847    encodings, see IsDBCSLeadByteEx documentation. */
   3848 
   3849 static int is_dbcs_lead_byte(const char *s, int offset)
   3850 {
   3851     const char *curr = s + offset;
   3852 
   3853     if (IsDBCSLeadByte(*curr)) {
   3854         const char *prev = CharPrev(s, curr);
   3855         return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
   3856     }
   3857     return 0;
   3858 }
   3859 
   3860 /*
   3861  * Decode MBCS string into unicode object. If 'final' is set, converts
   3862  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
   3863  */
   3864 static int decode_mbcs(PyUnicodeObject **v,
   3865                        const char *s, /* MBCS string */
   3866                        int size, /* sizeof MBCS string */
   3867                        int final)
   3868 {
   3869     Py_UNICODE *p;
   3870     Py_ssize_t n = 0;
   3871     int usize = 0;
   3872 
   3873     assert(size >= 0);
   3874 
   3875     /* Skip trailing lead-byte unless 'final' is set */
   3876     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
   3877         --size;
   3878 
   3879     /* First get the size of the result */
   3880     if (size > 0) {
   3881         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
   3882         if (usize == 0) {
   3883             PyErr_SetFromWindowsErrWithFilename(0, NULL);
   3884             return -1;
   3885         }
   3886     }
   3887 
   3888     if (*v == NULL) {
   3889         /* Create unicode object */
   3890         *v = _PyUnicode_New(usize);
   3891         if (*v == NULL)
   3892             return -1;
   3893     }
   3894     else {
   3895         /* Extend unicode object */
   3896         n = PyUnicode_GET_SIZE(*v);
   3897         if (_PyUnicode_Resize(v, n + usize) < 0)
   3898             return -1;
   3899     }
   3900 
   3901     /* Do the conversion */
   3902     if (size > 0) {
   3903         p = PyUnicode_AS_UNICODE(*v) + n;
   3904         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
   3905             PyErr_SetFromWindowsErrWithFilename(0, NULL);
   3906             return -1;
   3907         }
   3908     }
   3909 
   3910     return size;
   3911 }
   3912 
   3913 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
   3914                                        Py_ssize_t size,
   3915                                        const char *errors,
   3916                                        Py_ssize_t *consumed)
   3917 {
   3918     PyUnicodeObject *v = NULL;
   3919     int done;
   3920 
   3921     if (consumed)
   3922         *consumed = 0;
   3923 
   3924 #ifdef NEED_RETRY
   3925   retry:
   3926     if (size > INT_MAX)
   3927         done = decode_mbcs(&v, s, INT_MAX, 0);
   3928     else
   3929 #endif
   3930         done = decode_mbcs(&v, s, (int)size, !consumed);
   3931 
   3932     if (done < 0) {
   3933         Py_XDECREF(v);
   3934         return NULL;
   3935     }
   3936 
   3937     if (consumed)
   3938         *consumed += done;
   3939 
   3940 #ifdef NEED_RETRY
   3941     if (size > INT_MAX) {
   3942         s += done;
   3943         size -= done;
   3944         goto retry;
   3945     }
   3946 #endif
   3947 
   3948     return (PyObject *)v;
   3949 }
   3950 
   3951 PyObject *PyUnicode_DecodeMBCS(const char *s,
   3952                                Py_ssize_t size,
   3953                                const char *errors)
   3954 {
   3955     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
   3956 }
   3957 
   3958 /*
   3959  * Convert unicode into string object (MBCS).
   3960  * Returns 0 if succeed, -1 otherwise.
   3961  */
   3962 static int encode_mbcs(PyObject **repr,
   3963                        const Py_UNICODE *p, /* unicode */
   3964                        int size) /* size of unicode */
   3965 {
   3966     int mbcssize = 0;
   3967     Py_ssize_t n = 0;
   3968 
   3969     assert(size >= 0);
   3970 
   3971     /* First get the size of the result */
   3972     if (size > 0) {
   3973         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
   3974         if (mbcssize == 0) {
   3975             PyErr_SetFromWindowsErrWithFilename(0, NULL);
   3976             return -1;
   3977         }
   3978     }
   3979 
   3980     if (*repr == NULL) {
   3981         /* Create string object */
   3982         *repr = PyString_FromStringAndSize(NULL, mbcssize);
   3983         if (*repr == NULL)
   3984             return -1;
   3985     }
   3986     else {
   3987         /* Extend string object */
   3988         n = PyString_Size(*repr);
   3989         if (_PyString_Resize(repr, n + mbcssize) < 0)
   3990             return -1;
   3991     }
   3992 
   3993     /* Do the conversion */
   3994     if (size > 0) {
   3995         char *s = PyString_AS_STRING(*repr) + n;
   3996         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
   3997             PyErr_SetFromWindowsErrWithFilename(0, NULL);
   3998             return -1;
   3999         }
   4000     }
   4001 
   4002     return 0;
   4003 }
   4004 
   4005 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
   4006                                Py_ssize_t size,
   4007                                const char *errors)
   4008 {
   4009     PyObject *repr = NULL;
   4010     int ret;
   4011 
   4012 #ifdef NEED_RETRY
   4013   retry:
   4014     if (size > INT_MAX)
   4015         ret = encode_mbcs(&repr, p, INT_MAX);
   4016     else
   4017 #endif
   4018         ret = encode_mbcs(&repr, p, (int)size);
   4019 
   4020     if (ret < 0) {
   4021         Py_XDECREF(repr);
   4022         return NULL;
   4023     }
   4024 
   4025 #ifdef NEED_RETRY
   4026     if (size > INT_MAX) {
   4027         p += INT_MAX;
   4028         size -= INT_MAX;
   4029         goto retry;
   4030     }
   4031 #endif
   4032 
   4033     return repr;
   4034 }
   4035 
   4036 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
   4037 {
   4038     if (!PyUnicode_Check(unicode)) {
   4039         PyErr_BadArgument();
   4040         return NULL;
   4041     }
   4042     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
   4043                                 PyUnicode_GET_SIZE(unicode),
   4044                                 NULL);
   4045 }
   4046 
   4047 #undef NEED_RETRY
   4048 
   4049 #endif /* MS_WINDOWS */
   4050 
   4051 /* --- Character Mapping Codec -------------------------------------------- */
   4052 
   4053 PyObject *PyUnicode_DecodeCharmap(const char *s,
   4054                                   Py_ssize_t size,
   4055                                   PyObject *mapping,
   4056                                   const char *errors)
   4057 {
   4058     const char *starts = s;
   4059     Py_ssize_t startinpos;
   4060     Py_ssize_t endinpos;
   4061     Py_ssize_t outpos;
   4062     const char *e;
   4063     PyUnicodeObject *v;
   4064     Py_UNICODE *p;
   4065     Py_ssize_t extrachars = 0;
   4066     PyObject *errorHandler = NULL;
   4067     PyObject *exc = NULL;
   4068     Py_UNICODE *mapstring = NULL;
   4069     Py_ssize_t maplen = 0;
   4070 
   4071     /* Default to Latin-1 */
   4072     if (mapping == NULL)
   4073         return PyUnicode_DecodeLatin1(s, size, errors);
   4074 
   4075     v = _PyUnicode_New(size);
   4076     if (v == NULL)
   4077         goto onError;
   4078     if (size == 0)
   4079         return (PyObject *)v;
   4080     p = PyUnicode_AS_UNICODE(v);
   4081     e = s + size;
   4082     if (PyUnicode_CheckExact(mapping)) {
   4083         mapstring = PyUnicode_AS_UNICODE(mapping);
   4084         maplen = PyUnicode_GET_SIZE(mapping);
   4085         while (s < e) {
   4086             unsigned char ch = *s;
   4087             Py_UNICODE x = 0xfffe; /* illegal value */
   4088 
   4089             if (ch < maplen)
   4090                 x = mapstring[ch];
   4091 
   4092             if (x == 0xfffe) {
   4093                 /* undefined mapping */
   4094                 outpos = p-PyUnicode_AS_UNICODE(v);
   4095                 startinpos = s-starts;
   4096                 endinpos = startinpos+1;
   4097                 if (unicode_decode_call_errorhandler(
   4098                         errors, &errorHandler,
   4099                         "charmap", "character maps to <undefined>",
   4100                         starts, size, &startinpos, &endinpos, &exc, &s,
   4101                         &v, &outpos, &p)) {
   4102                     goto onError;
   4103                 }
   4104                 continue;
   4105             }
   4106             *p++ = x;
   4107             ++s;
   4108         }
   4109     }
   4110     else {
   4111         while (s < e) {
   4112             unsigned char ch = *s;
   4113             PyObject *w, *x;
   4114 
   4115             /* Get mapping (char ordinal -> integer, Unicode char or None) */
   4116             w = PyInt_FromLong((long)ch);
   4117             if (w == NULL)
   4118                 goto onError;
   4119             x = PyObject_GetItem(mapping, w);
   4120             Py_DECREF(w);
   4121             if (x == NULL) {
   4122                 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
   4123                     /* No mapping found means: mapping is undefined. */
   4124                     PyErr_Clear();
   4125                     x = Py_None;
   4126                     Py_INCREF(x);
   4127                 } else
   4128                     goto onError;
   4129             }
   4130 
   4131             /* Apply mapping */
   4132             if (PyInt_Check(x)) {
   4133                 long value = PyInt_AS_LONG(x);
   4134                 if (value < 0 || value > 65535) {
   4135                     PyErr_SetString(PyExc_TypeError,
   4136                                     "character mapping must be in range(65536)");
   4137                     Py_DECREF(x);
   4138                     goto onError;
   4139                 }
   4140                 *p++ = (Py_UNICODE)value;
   4141             }
   4142             else if (x == Py_None) {
   4143                 /* undefined mapping */
   4144                 outpos = p-PyUnicode_AS_UNICODE(v);
   4145                 startinpos = s-starts;
   4146                 endinpos = startinpos+1;
   4147                 if (unicode_decode_call_errorhandler(
   4148                         errors, &errorHandler,
   4149                         "charmap", "character maps to <undefined>",
   4150                         starts, size, &startinpos, &endinpos, &exc, &s,
   4151                         &v, &outpos, &p)) {
   4152                     Py_DECREF(x);
   4153                     goto onError;
   4154                 }
   4155                 Py_DECREF(x);
   4156                 continue;
   4157             }
   4158             else if (PyUnicode_Check(x)) {
   4159                 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
   4160 
   4161                 if (targetsize == 1)
   4162                     /* 1-1 mapping */
   4163                     *p++ = *PyUnicode_AS_UNICODE(x);
   4164 
   4165                 else if (targetsize > 1) {
   4166                     /* 1-n mapping */
   4167                     if (targetsize > extrachars) {
   4168                         /* resize first */
   4169                         Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
   4170                         Py_ssize_t needed = (targetsize - extrachars) + \
   4171                             (targetsize << 2);
   4172                         extrachars += needed;
   4173                         /* XXX overflow detection missing */
   4174                         if (_PyUnicode_Resize(&v,
   4175                                               PyUnicode_GET_SIZE(v) + needed) < 0) {
   4176                             Py_DECREF(x);
   4177                             goto onError;
   4178                         }
   4179                         p = PyUnicode_AS_UNICODE(v) + oldpos;
   4180                     }
   4181                     Py_UNICODE_COPY(p,
   4182                                     PyUnicode_AS_UNICODE(x),
   4183                                     targetsize);
   4184                     p += targetsize;
   4185                     extrachars -= targetsize;
   4186                 }
   4187                 /* 1-0 mapping: skip the character */
   4188             }
   4189             else {
   4190                 /* wrong return value */
   4191                 PyErr_SetString(PyExc_TypeError,
   4192                                 "character mapping must return integer, None or unicode");
   4193                 Py_DECREF(x);
   4194                 goto onError;
   4195             }
   4196             Py_DECREF(x);
   4197             ++s;
   4198         }
   4199     }
   4200     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
   4201         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
   4202             goto onError;
   4203     Py_XDECREF(errorHandler);
   4204     Py_XDECREF(exc);
   4205     return (PyObject *)v;
   4206 
   4207   onError:
   4208     Py_XDECREF(errorHandler);
   4209     Py_XDECREF(exc);
   4210     Py_XDECREF(v);
   4211     return NULL;
   4212 }
   4213 
   4214 /* Charmap encoding: the lookup table */
   4215 
   4216 struct encoding_map{
   4217     PyObject_HEAD
   4218     unsigned char level1[32];
   4219     int count2, count3;
   4220     unsigned char level23[1];
   4221 };
   4222 
   4223 static PyObject*
   4224 encoding_map_size(PyObject *obj, PyObject* args)
   4225 {
   4226     struct encoding_map *map = (struct encoding_map*)obj;
   4227     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
   4228                           128*map->count3);
   4229 }
   4230 
   4231 static PyMethodDef encoding_map_methods[] = {
   4232     {"size", encoding_map_size, METH_NOARGS,
   4233      PyDoc_STR("Return the size (in bytes) of this object") },
   4234     { 0 }
   4235 };
   4236 
   4237 static void
   4238 encoding_map_dealloc(PyObject* o)
   4239 {
   4240     PyObject_FREE(o);
   4241 }
   4242 
   4243 static PyTypeObject EncodingMapType = {
   4244     PyVarObject_HEAD_INIT(NULL, 0)
   4245     "EncodingMap",          /*tp_name*/
   4246     sizeof(struct encoding_map),   /*tp_basicsize*/
   4247     0,                      /*tp_itemsize*/
   4248     /* methods */
   4249     encoding_map_dealloc,   /*tp_dealloc*/
   4250     0,                      /*tp_print*/
   4251     0,                      /*tp_getattr*/
   4252     0,                      /*tp_setattr*/
   4253     0,                      /*tp_compare*/
   4254     0,                      /*tp_repr*/
   4255     0,                      /*tp_as_number*/
   4256     0,                      /*tp_as_sequence*/
   4257     0,                      /*tp_as_mapping*/
   4258     0,                      /*tp_hash*/
   4259     0,                      /*tp_call*/
   4260     0,                      /*tp_str*/
   4261     0,                      /*tp_getattro*/
   4262     0,                      /*tp_setattro*/
   4263     0,                      /*tp_as_buffer*/
   4264     Py_TPFLAGS_DEFAULT,     /*tp_flags*/
   4265     0,                      /*tp_doc*/
   4266     0,                      /*tp_traverse*/
   4267     0,                      /*tp_clear*/
   4268     0,                      /*tp_richcompare*/
   4269     0,                      /*tp_weaklistoffset*/
   4270     0,                      /*tp_iter*/
   4271     0,                      /*tp_iternext*/
   4272     encoding_map_methods,   /*tp_methods*/
   4273     0,                      /*tp_members*/
   4274     0,                      /*tp_getset*/
   4275     0,                      /*tp_base*/
   4276     0,                      /*tp_dict*/
   4277     0,                      /*tp_descr_get*/
   4278     0,                      /*tp_descr_set*/
   4279     0,                      /*tp_dictoffset*/
   4280     0,                      /*tp_init*/
   4281     0,                      /*tp_alloc*/
   4282     0,                      /*tp_new*/
   4283     0,                      /*tp_free*/
   4284     0,                      /*tp_is_gc*/
   4285 };
   4286 
   4287 PyObject*
   4288 PyUnicode_BuildEncodingMap(PyObject* string)
   4289 {
   4290     Py_UNICODE *decode;
   4291     PyObject *result;
   4292     struct encoding_map *mresult;
   4293     int i;
   4294     int need_dict = 0;
   4295     unsigned char level1[32];
   4296     unsigned char level2[512];
   4297     unsigned char *mlevel1, *mlevel2, *mlevel3;
   4298     int count2 = 0, count3 = 0;
   4299 
   4300     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
   4301         PyErr_BadArgument();
   4302         return NULL;
   4303     }
   4304     decode = PyUnicode_AS_UNICODE(string);
   4305     memset(level1, 0xFF, sizeof level1);
   4306     memset(level2, 0xFF, sizeof level2);
   4307 
   4308     /* If there isn't a one-to-one mapping of NULL to \0,
   4309        or if there are non-BMP characters, we need to use
   4310        a mapping dictionary. */
   4311     if (decode[0] != 0)
   4312         need_dict = 1;
   4313     for (i = 1; i < 256; i++) {
   4314         int l1, l2;
   4315         if (decode[i] == 0
   4316 #ifdef Py_UNICODE_WIDE
   4317             || decode[i] > 0xFFFF
   4318 #endif
   4319             ) {
   4320             need_dict = 1;
   4321             break;
   4322         }
   4323         if (decode[i] == 0xFFFE)
   4324             /* unmapped character */
   4325             continue;
   4326         l1 = decode[i] >> 11;
   4327         l2 = decode[i] >> 7;
   4328         if (level1[l1] == 0xFF)
   4329             level1[l1] = count2++;
   4330         if (level2[l2] == 0xFF)
   4331             level2[l2] = count3++;
   4332     }
   4333 
   4334     if (count2 >= 0xFF || count3 >= 0xFF)
   4335         need_dict = 1;
   4336 
   4337     if (need_dict) {
   4338         PyObject *result = PyDict_New();
   4339         PyObject *key, *value;
   4340         if (!result)
   4341             return NULL;
   4342         for (i = 0; i < 256; i++) {
   4343             value = NULL;
   4344             key = PyInt_FromLong(decode[i]);
   4345             value = PyInt_FromLong(i);
   4346             if (!key || !value)
   4347                 goto failed1;
   4348             if (PyDict_SetItem(result, key, value) == -1)
   4349                 goto failed1;
   4350             Py_DECREF(key);
   4351             Py_DECREF(value);
   4352         }
   4353         return result;
   4354       failed1:
   4355         Py_XDECREF(key);
   4356         Py_XDECREF(value);
   4357         Py_DECREF(result);
   4358         return NULL;
   4359     }
   4360 
   4361     /* Create a three-level trie */
   4362     result = PyObject_MALLOC(sizeof(struct encoding_map) +
   4363                              16*count2 + 128*count3 - 1);
   4364     if (!result)
   4365         return PyErr_NoMemory();
   4366     PyObject_Init(result, &EncodingMapType);
   4367     mresult = (struct encoding_map*)result;
   4368     mresult->count2 = count2;
   4369     mresult->count3 = count3;
   4370     mlevel1 = mresult->level1;
   4371     mlevel2 = mresult->level23;
   4372     mlevel3 = mresult->level23 + 16*count2;
   4373     memcpy(mlevel1, level1, 32);
   4374     memset(mlevel2, 0xFF, 16*count2);
   4375     memset(mlevel3, 0, 128*count3);
   4376     count3 = 0;
   4377     for (i = 1; i < 256; i++) {
   4378         int o1, o2, o3, i2, i3;
   4379         if (decode[i] == 0xFFFE)
   4380             /* unmapped character */
   4381             continue;
   4382         o1 = decode[i]>>11;
   4383         o2 = (decode[i]>>7) & 0xF;
   4384         i2 = 16*mlevel1[o1] + o2;
   4385         if (mlevel2[i2] == 0xFF)
   4386             mlevel2[i2] = count3++;
   4387         o3 = decode[i] & 0x7F;
   4388         i3 = 128*mlevel2[i2] + o3;
   4389         mlevel3[i3] = i;
   4390     }
   4391     return result;
   4392 }
   4393 
   4394 static int
   4395 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
   4396 {
   4397     struct encoding_map *map = (struct encoding_map*)mapping;
   4398     int l1 = c>>11;
   4399     int l2 = (c>>7) & 0xF;
   4400     int l3 = c & 0x7F;
   4401     int i;
   4402 
   4403 #ifdef Py_UNICODE_WIDE
   4404     if (c > 0xFFFF) {
   4405         return -1;
   4406     }
   4407 #endif
   4408     if (c == 0)
   4409         return 0;
   4410     /* level 1*/
   4411     i = map->level1[l1];
   4412     if (i == 0xFF) {
   4413         return -1;
   4414     }
   4415     /* level 2*/
   4416     i = map->level23[16*i+l2];
   4417     if (i == 0xFF) {
   4418         return -1;
   4419     }
   4420     /* level 3 */
   4421     i = map->level23[16*map->count2 + 128*i + l3];
   4422     if (i == 0) {
   4423         return -1;
   4424     }
   4425     return i;
   4426 }
   4427 
   4428 /* Lookup the character ch in the mapping. If the character
   4429    can't be found, Py_None is returned (or NULL, if another
   4430    error occurred). */
   4431 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
   4432 {
   4433     PyObject *w = PyInt_FromLong((long)c);
   4434     PyObject *x;
   4435 
   4436     if (w == NULL)
   4437         return NULL;
   4438     x = PyObject_GetItem(mapping, w);
   4439     Py_DECREF(w);
   4440     if (x == NULL) {
   4441         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
   4442             /* No mapping found means: mapping is undefined. */
   4443             PyErr_Clear();
   4444             x = Py_None;
   4445             Py_INCREF(x);
   4446             return x;
   4447         } else
   4448             return NULL;
   4449     }
   4450     else if (x == Py_None)
   4451         return x;
   4452     else if (PyInt_Check(x)) {
   4453         long value = PyInt_AS_LONG(x);
   4454         if (value < 0 || value > 255) {
   4455             PyErr_SetString(PyExc_TypeError,
   4456                             "character mapping must be in range(256)");
   4457             Py_DECREF(x);
   4458             return NULL;
   4459         }
   4460         return x;
   4461     }
   4462     else if (PyString_Check(x))
   4463         return x;
   4464     else {
   4465         /* wrong return value */
   4466         PyErr_SetString(PyExc_TypeError,
   4467                         "character mapping must return integer, None or str");
   4468         Py_DECREF(x);
   4469         return NULL;
   4470     }
   4471 }
   4472 
   4473 static int
   4474 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
   4475 {
   4476     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
   4477     /* exponentially overallocate to minimize reallocations */
   4478     if (requiredsize < 2*outsize)
   4479         requiredsize = 2*outsize;
   4480     if (_PyString_Resize(outobj, requiredsize)) {
   4481         return 0;
   4482     }
   4483     return 1;
   4484 }
   4485 
   4486 typedef enum charmapencode_result {
   4487     enc_SUCCESS, enc_FAILED, enc_EXCEPTION
   4488 }charmapencode_result;
   4489 /* lookup the character, put the result in the output string and adjust
   4490    various state variables. Reallocate the output string if not enough
   4491    space is available. Return a new reference to the object that
   4492    was put in the output buffer, or Py_None, if the mapping was undefined
   4493    (in which case no character was written) or NULL, if a
   4494    reallocation error occurred. The caller must decref the result */
   4495 static
   4496 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
   4497                                           PyObject **outobj, Py_ssize_t *outpos)
   4498 {
   4499     PyObject *rep;
   4500     char *outstart;
   4501     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
   4502 
   4503     if (Py_TYPE(mapping) == &EncodingMapType) {
   4504         int res = encoding_map_lookup(c, mapping);
   4505         Py_ssize_t requiredsize = *outpos+1;
   4506         if (res == -1)
   4507             return enc_FAILED;
   4508         if (outsize<requiredsize)
   4509             if (!charmapencode_resize(outobj, outpos, requiredsize))
   4510                 return enc_EXCEPTION;
   4511         outstart = PyString_AS_STRING(*outobj);
   4512         outstart[(*outpos)++] = (char)res;
   4513         return enc_SUCCESS;
   4514     }
   4515 
   4516     rep = charmapencode_lookup(c, mapping);
   4517     if (rep==NULL)
   4518         return enc_EXCEPTION;
   4519     else if (rep==Py_None) {
   4520         Py_DECREF(rep);
   4521         return enc_FAILED;
   4522     } else {
   4523         if (PyInt_Check(rep)) {
   4524             Py_ssize_t requiredsize = *outpos+1;
   4525             if (outsize<requiredsize)
   4526                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
   4527                     Py_DECREF(rep);
   4528                     return enc_EXCEPTION;
   4529                 }
   4530             outstart = PyString_AS_STRING(*outobj);
   4531             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
   4532         }
   4533         else {
   4534             const char *repchars = PyString_AS_STRING(rep);
   4535             Py_ssize_t repsize = PyString_GET_SIZE(rep);
   4536             Py_ssize_t requiredsize = *outpos+repsize;
   4537             if (outsize<requiredsize)
   4538                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
   4539                     Py_DECREF(rep);
   4540                     return enc_EXCEPTION;
   4541                 }
   4542             outstart = PyString_AS_STRING(*outobj);
   4543             memcpy(outstart + *outpos, repchars, repsize);
   4544             *outpos += repsize;
   4545         }
   4546     }
   4547     Py_DECREF(rep);
   4548     return enc_SUCCESS;
   4549 }
   4550 
   4551 /* handle an error in PyUnicode_EncodeCharmap
   4552    Return 0 on success, -1 on error */
   4553 static
   4554 int charmap_encoding_error(
   4555     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
   4556     PyObject **exceptionObject,
   4557     int *known_errorHandler, PyObject **errorHandler, const char *errors,
   4558     PyObject **res, Py_ssize_t *respos)
   4559 {
   4560     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
   4561     Py_ssize_t repsize;
   4562     Py_ssize_t newpos;
   4563     Py_UNICODE *uni2;
   4564     /* startpos for collecting unencodable chars */
   4565     Py_ssize_t collstartpos = *inpos;
   4566     Py_ssize_t collendpos = *inpos+1;
   4567     Py_ssize_t collpos;
   4568     char *encoding = "charmap";
   4569     char *reason = "character maps to <undefined>";
   4570     charmapencode_result x;
   4571 
   4572     /* find all unencodable characters */
   4573     while (collendpos < size) {
   4574         PyObject *rep;
   4575         if (Py_TYPE(mapping) == &EncodingMapType) {
   4576             int res = encoding_map_lookup(p[collendpos], mapping);
   4577             if (res != -1)
   4578                 break;
   4579             ++collendpos;
   4580             continue;
   4581         }
   4582 
   4583         rep = charmapencode_lookup(p[collendpos], mapping);
   4584         if (rep==NULL)
   4585             return -1;
   4586         else if (rep!=Py_None) {
   4587             Py_DECREF(rep);
   4588             break;
   4589         }
   4590         Py_DECREF(rep);
   4591         ++collendpos;
   4592     }
   4593     /* cache callback name lookup
   4594      * (if not done yet, i.e. it's the first error) */
   4595     if (*known_errorHandler==-1) {
   4596         if ((errors==NULL) || (!strcmp(errors, "strict")))
   4597             *known_errorHandler = 1;
   4598         else if (!strcmp(errors, "replace"))
   4599             *known_errorHandler = 2;
   4600         else if (!strcmp(errors, "ignore"))
   4601             *known_errorHandler = 3;
   4602         else if (!strcmp(errors, "xmlcharrefreplace"))
   4603             *known_errorHandler = 4;
   4604         else
   4605             *known_errorHandler = 0;
   4606     }
   4607     switch (*known_errorHandler) {
   4608     case 1: /* strict */
   4609         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
   4610         return -1;
   4611     case 2: /* replace */
   4612         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
   4613             x = charmapencode_output('?', mapping, res, respos);
   4614             if (x==enc_EXCEPTION) {
   4615                 return -1;
   4616             }
   4617             else if (x==enc_FAILED) {
   4618                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
   4619                 return -1;
   4620             }
   4621         }
   4622         /* fall through */
   4623     case 3: /* ignore */
   4624         *inpos = collendpos;
   4625         break;
   4626     case 4: /* xmlcharrefreplace */
   4627         /* generate replacement (temporarily (mis)uses p) */
   4628         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
   4629             char buffer[2+29+1+1];
   4630             char *cp;
   4631             sprintf(buffer, "&#%d;", (int)p[collpos]);
   4632             for (cp = buffer; *cp; ++cp) {
   4633                 x = charmapencode_output(*cp, mapping, res, respos);
   4634                 if (x==enc_EXCEPTION)
   4635                     return -1;
   4636                 else if (x==enc_FAILED) {
   4637                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
   4638                     return -1;
   4639                 }
   4640             }
   4641         }
   4642         *inpos = collendpos;
   4643         break;
   4644     default:
   4645         repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
   4646                                                       encoding, reason, p, size, exceptionObject,
   4647                                                       collstartpos, collendpos, &newpos);
   4648         if (repunicode == NULL)
   4649             return -1;
   4650         /* generate replacement  */
   4651         repsize = PyUnicode_GET_SIZE(repunicode);
   4652         for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
   4653             x = charmapencode_output(*uni2, mapping, res, respos);
   4654             if (x==enc_EXCEPTION) {
   4655                 return -1;
   4656             }
   4657             else if (x==enc_FAILED) {
   4658                 Py_DECREF(repunicode);
   4659                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
   4660                 return -1;
   4661             }
   4662         }
   4663         *inpos = newpos;
   4664         Py_DECREF(repunicode);
   4665     }
   4666     return 0;
   4667 }
   4668 
   4669 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
   4670                                   Py_ssize_t size,
   4671                                   PyObject *mapping,
   4672                                   const char *errors)
   4673 {
   4674     /* output object */
   4675     PyObject *res = NULL;
   4676     /* current input position */
   4677     Py_ssize_t inpos = 0;
   4678     /* current output position */
   4679     Py_ssize_t respos = 0;
   4680     PyObject *errorHandler = NULL;
   4681     PyObject *exc = NULL;
   4682     /* the following variable is used for caching string comparisons
   4683      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
   4684      * 3=ignore, 4=xmlcharrefreplace */
   4685     int known_errorHandler = -1;
   4686 
   4687     /* Default to Latin-1 */
   4688     if (mapping == NULL)
   4689         return PyUnicode_EncodeLatin1(p, size, errors);
   4690 
   4691     /* allocate enough for a simple encoding without
   4692        replacements, if we need more, we'll resize */
   4693     res = PyString_FromStringAndSize(NULL, size);
   4694     if (res == NULL)
   4695         goto onError;
   4696     if (size == 0)
   4697         return res;
   4698 
   4699     while (inpos<size) {
   4700         /* try to encode it */
   4701         charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
   4702         if (x==enc_EXCEPTION) /* error */
   4703             goto onError;
   4704         if (x==enc_FAILED) { /* unencodable character */
   4705             if (charmap_encoding_error(p, size, &inpos, mapping,
   4706                                        &exc,
   4707                                        &known_errorHandler, &errorHandler, errors,
   4708                                        &res, &respos)) {
   4709                 goto onError;
   4710             }
   4711         }
   4712         else
   4713             /* done with this character => adjust input position */
   4714             ++inpos;
   4715     }
   4716 
   4717     /* Resize if we allocated to much */
   4718     if (respos<PyString_GET_SIZE(res)) {
   4719         if (_PyString_Resize(&res, respos))
   4720             goto onError;
   4721     }
   4722     Py_XDECREF(exc);
   4723     Py_XDECREF(errorHandler);
   4724     return res;
   4725 
   4726   onError:
   4727     Py_XDECREF(res);
   4728     Py_XDECREF(exc);
   4729     Py_XDECREF(errorHandler);
   4730     return NULL;
   4731 }
   4732 
   4733 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
   4734                                     PyObject *mapping)
   4735 {
   4736     if (!PyUnicode_Check(unicode) || mapping == NULL) {
   4737         PyErr_BadArgument();
   4738         return NULL;
   4739     }
   4740     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
   4741                                    PyUnicode_GET_SIZE(unicode),
   4742                                    mapping,
   4743                                    NULL);
   4744 }
   4745 
   4746 /* create or adjust a UnicodeTranslateError */
   4747 static void make_translate_exception(PyObject **exceptionObject,
   4748                                      const Py_UNICODE *unicode, Py_ssize_t size,
   4749                                      Py_ssize_t startpos, Py_ssize_t endpos,
   4750                                      const char *reason)
   4751 {
   4752     if (*exceptionObject == NULL) {
   4753         *exceptionObject = PyUnicodeTranslateError_Create(
   4754             unicode, size, startpos, endpos, reason);
   4755     }
   4756     else {
   4757         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
   4758             goto onError;
   4759         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
   4760             goto onError;
   4761         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
   4762             goto onError;
   4763         return;
   4764       onError:
   4765         Py_DECREF(*exceptionObject);
   4766         *exceptionObject = NULL;
   4767     }
   4768 }
   4769 
   4770 /* raises a UnicodeTranslateError */
   4771 static void raise_translate_exception(PyObject **exceptionObject,
   4772                                       const Py_UNICODE *unicode, Py_ssize_t size,
   4773                                       Py_ssize_t startpos, Py_ssize_t endpos,
   4774                                       const char *reason)
   4775 {
   4776     make_translate_exception(exceptionObject,
   4777                              unicode, size, startpos, endpos, reason);
   4778     if (*exceptionObject != NULL)
   4779         PyCodec_StrictErrors(*exceptionObject);
   4780 }
   4781 
   4782 /* error handling callback helper:
   4783    build arguments, call the callback and check the arguments,
   4784    put the result into newpos and return the replacement string, which
   4785    has to be freed by the caller */
   4786 static PyObject *unicode_translate_call_errorhandler(const char *errors,
   4787                                                      PyObject **errorHandler,
   4788                                                      const char *reason,
   4789                                                      const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
   4790                                                      Py_ssize_t startpos, Py_ssize_t endpos,
   4791                                                      Py_ssize_t *newpos)
   4792 {
   4793     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
   4794 
   4795     Py_ssize_t i_newpos;
   4796     PyObject *restuple;
   4797     PyObject *resunicode;
   4798 
   4799     if (*errorHandler == NULL) {
   4800         *errorHandler = PyCodec_LookupError(errors);
   4801         if (*errorHandler == NULL)
   4802             return NULL;
   4803     }
   4804 
   4805     make_translate_exception(exceptionObject,
   4806                              unicode, size, startpos, endpos, reason);
   4807     if (*exceptionObject == NULL)
   4808         return NULL;
   4809 
   4810     restuple = PyObject_CallFunctionObjArgs(
   4811         *errorHandler, *exceptionObject, NULL);
   4812     if (restuple == NULL)
   4813         return NULL;
   4814     if (!PyTuple_Check(restuple)) {
   4815         PyErr_SetString(PyExc_TypeError, &argparse[4]);
   4816         Py_DECREF(restuple);
   4817         return NULL;
   4818     }
   4819     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
   4820                           &resunicode, &i_newpos)) {
   4821         Py_DECREF(restuple);
   4822         return NULL;
   4823     }
   4824     if (i_newpos<0)
   4825         *newpos = size+i_newpos;
   4826     else
   4827         *newpos = i_newpos;
   4828     if (*newpos<0 || *newpos>size) {
   4829         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
   4830         Py_DECREF(restuple);
   4831         return NULL;
   4832     }
   4833     Py_INCREF(resunicode);
   4834     Py_DECREF(restuple);
   4835     return resunicode;
   4836 }
   4837 
   4838 /* Lookup the character ch in the mapping and put the result in result,
   4839    which must be decrefed by the caller.
   4840    Return 0 on success, -1 on error */
   4841 static
   4842 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
   4843 {
   4844     PyObject *w = PyInt_FromLong((long)c);
   4845     PyObject *x;
   4846 
   4847     if (w == NULL)
   4848         return -1;
   4849     x = PyObject_GetItem(mapping, w);
   4850     Py_DECREF(w);
   4851     if (x == NULL) {
   4852         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
   4853             /* No mapping found means: use 1:1 mapping. */
   4854             PyErr_Clear();
   4855             *result = NULL;
   4856             return 0;
   4857         } else
   4858             return -1;
   4859     }
   4860     else if (x == Py_None) {
   4861         *result = x;
   4862         return 0;
   4863     }
   4864     else if (PyInt_Check(x)) {
   4865         long value = PyInt_AS_LONG(x);
   4866         long max = PyUnicode_GetMax();
   4867         if (value < 0 || value > max) {
   4868             PyErr_Format(PyExc_TypeError,
   4869                          "character mapping must be in range(0x%lx)", max+1);
   4870             Py_DECREF(x);
   4871             return -1;
   4872         }
   4873         *result = x;
   4874         return 0;
   4875     }
   4876     else if (PyUnicode_Check(x)) {
   4877         *result = x;
   4878         return 0;
   4879     }
   4880     else {
   4881         /* wrong return value */
   4882         PyErr_SetString(PyExc_TypeError,
   4883                         "character mapping must return integer, None or unicode");
   4884         Py_DECREF(x);
   4885         return -1;
   4886     }
   4887 }
   4888 /* ensure that *outobj is at least requiredsize characters long,
   4889    if not reallocate and adjust various state variables.
   4890    Return 0 on success, -1 on error */
   4891 static
   4892 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
   4893                                Py_ssize_t requiredsize)
   4894 {
   4895     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
   4896     if (requiredsize > oldsize) {
   4897         /* remember old output position */
   4898         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
   4899         /* exponentially overallocate to minimize reallocations */
   4900         if (requiredsize < 2 * oldsize)
   4901             requiredsize = 2 * oldsize;
   4902         if (PyUnicode_Resize(outobj, requiredsize) < 0)
   4903             return -1;
   4904         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
   4905     }
   4906     return 0;
   4907 }
   4908 /* lookup the character, put the result in the output string and adjust
   4909    various state variables. Return a new reference to the object that
   4910    was put in the output buffer in *result, or Py_None, if the mapping was
   4911    undefined (in which case no character was written).
   4912    The called must decref result.
   4913    Return 0 on success, -1 on error. */
   4914 static
   4915 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
   4916                             Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
   4917                             PyObject **res)
   4918 {
   4919     if (charmaptranslate_lookup(*curinp, mapping, res))
   4920         return -1;
   4921     if (*res==NULL) {
   4922         /* not found => default to 1:1 mapping */
   4923         *(*outp)++ = *curinp;
   4924     }
   4925     else if (*res==Py_None)
   4926         ;
   4927     else if (PyInt_Check(*res)) {
   4928         /* no overflow check, because we know that the space is enough */
   4929         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
   4930     }
   4931     else if (PyUnicode_Check(*res)) {
   4932         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
   4933         if (repsize==1) {
   4934             /* no overflow check, because we know that the space is enough */
   4935             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
   4936         }
   4937         else if (repsize!=0) {
   4938             /* more than one character */
   4939             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
   4940                 (insize - (curinp-startinp)) +
   4941                 repsize - 1;
   4942             if (charmaptranslate_makespace(outobj, outp, requiredsize))
   4943                 return -1;
   4944             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
   4945             *outp += repsize;
   4946         }
   4947     }
   4948     else
   4949         return -1;
   4950     return 0;
   4951 }
   4952 
   4953 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
   4954                                      Py_ssize_t size,
   4955                                      PyObject *mapping,
   4956                                      const char *errors)
   4957 {
   4958     /* output object */
   4959     PyObject *res = NULL;
   4960     /* pointers to the beginning and end+1 of input */
   4961     const Py_UNICODE *startp = p;
   4962     const Py_UNICODE *endp = p + size;
   4963     /* pointer into the output */
   4964     Py_UNICODE *str;
   4965     /* current output position */
   4966     Py_ssize_t respos = 0;
   4967     char *reason = "character maps to <undefined>";
   4968     PyObject *errorHandler = NULL;
   4969     PyObject *exc = NULL;
   4970     /* the following variable is used for caching string comparisons
   4971      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
   4972      * 3=ignore, 4=xmlcharrefreplace */
   4973     int known_errorHandler = -1;
   4974 
   4975     if (mapping == NULL) {
   4976         PyErr_BadArgument();
   4977         return NULL;
   4978     }
   4979 
   4980     /* allocate enough for a simple 1:1 translation without
   4981        replacements, if we need more, we'll resize */
   4982     res = PyUnicode_FromUnicode(NULL, size);
   4983     if (res == NULL)
   4984         goto onError;
   4985     if (size == 0)
   4986         return res;
   4987     str = PyUnicode_AS_UNICODE(res);
   4988 
   4989     while (p<endp) {
   4990         /* try to encode it */
   4991         PyObject *x = NULL;
   4992         if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
   4993             Py_XDECREF(x);
   4994             goto onError;
   4995         }
   4996         Py_XDECREF(x);
   4997         if (x!=Py_None) /* it worked => adjust input pointer */
   4998             ++p;
   4999         else { /* untranslatable character */
   5000             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
   5001             Py_ssize_t repsize;
   5002             Py_ssize_t newpos;
   5003             Py_UNICODE *uni2;
   5004             /* startpos for collecting untranslatable chars */
   5005             const Py_UNICODE *collstart = p;
   5006             const Py_UNICODE *collend = p+1;
   5007             const Py_UNICODE *coll;
   5008 
   5009             /* find all untranslatable characters */
   5010             while (collend < endp) {
   5011                 if (charmaptranslate_lookup(*collend, mapping, &x))
   5012                     goto onError;
   5013                 Py_XDECREF(x);
   5014                 if (x!=Py_None)
   5015                     break;
   5016                 ++collend;
   5017             }
   5018             /* cache callback name lookup
   5019              * (if not done yet, i.e. it's the first error) */
   5020             if (known_errorHandler==-1) {
   5021                 if ((errors==NULL) || (!strcmp(errors, "strict")))
   5022                     known_errorHandler = 1;
   5023                 else if (!strcmp(errors, "replace"))
   5024                     known_errorHandler = 2;
   5025                 else if (!strcmp(errors, "ignore"))
   5026                     known_errorHandler = 3;
   5027                 else if (!strcmp(errors, "xmlcharrefreplace"))
   5028                     known_errorHandler = 4;
   5029                 else
   5030                     known_errorHandler = 0;
   5031             }
   5032             switch (known_errorHandler) {
   5033             case 1: /* strict */
   5034                 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
   5035                 goto onError;
   5036             case 2: /* replace */
   5037                 /* No need to check for space, this is a 1:1 replacement */
   5038                 for (coll = collstart; coll<collend; ++coll)
   5039                     *str++ = '?';
   5040                 /* fall through */
   5041             case 3: /* ignore */
   5042                 p = collend;
   5043                 break;
   5044             case 4: /* xmlcharrefreplace */
   5045                 /* generate replacement (temporarily (mis)uses p) */
   5046                 for (p = collstart; p < collend; ++p) {
   5047                     char buffer[2+29+1+1];
   5048                     char *cp;
   5049                     sprintf(buffer, "&#%d;", (int)*p);
   5050                     if (charmaptranslate_makespace(&res, &str,
   5051                                                    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
   5052                         goto onError;
   5053                     for (cp = buffer; *cp; ++cp)
   5054                         *str++ = *cp;
   5055                 }
   5056                 p = collend;
   5057                 break;
   5058             default:
   5059                 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
   5060                                                                  reason, startp, size, &exc,
   5061                                                                  collstart-startp, collend-startp, &newpos);
   5062                 if (repunicode == NULL)
   5063                     goto onError;
   5064                 /* generate replacement  */
   5065                 repsize = PyUnicode_GET_SIZE(repunicode);
   5066                 if (charmaptranslate_makespace(&res, &str,
   5067                                                (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
   5068                     Py_DECREF(repunicode);
   5069                     goto onError;
   5070                 }
   5071                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
   5072                     *str++ = *uni2;
   5073                 p = startp + newpos;
   5074                 Py_DECREF(repunicode);
   5075             }
   5076         }
   5077     }
   5078     /* Resize if we allocated to much */
   5079     respos = str-PyUnicode_AS_UNICODE(res);
   5080     if (respos<PyUnicode_GET_SIZE(res)) {
   5081         if (PyUnicode_Resize(&res, respos) < 0)
   5082             goto onError;
   5083     }
   5084     Py_XDECREF(exc);
   5085     Py_XDECREF(errorHandler);
   5086     return res;
   5087 
   5088   onError:
   5089     Py_XDECREF(res);
   5090     Py_XDECREF(exc);
   5091     Py_XDECREF(errorHandler);
   5092     return NULL;
   5093 }
   5094 
   5095 PyObject *PyUnicode_Translate(PyObject *str,
   5096                               PyObject *mapping,
   5097                               const char *errors)
   5098 {
   5099     PyObject *result;
   5100 
   5101     str = PyUnicode_FromObject(str);
   5102     if (str == NULL)
   5103         goto onError;
   5104     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
   5105                                         PyUnicode_GET_SIZE(str),
   5106                                         mapping,
   5107                                         errors);
   5108     Py_DECREF(str);
   5109     return result;
   5110 
   5111   onError:
   5112     Py_XDECREF(str);
   5113     return NULL;
   5114 }
   5115 
   5116 /* --- Decimal Encoder ---------------------------------------------------- */
   5117 
   5118 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
   5119                             Py_ssize_t length,
   5120                             char *output,
   5121                             const char *errors)
   5122 {
   5123     Py_UNICODE *p, *end;
   5124     PyObject *errorHandler = NULL;
   5125     PyObject *exc = NULL;
   5126     const char *encoding = "decimal";
   5127     const char *reason = "invalid decimal Unicode string";
   5128     /* the following variable is used for caching string comparisons
   5129      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
   5130     int known_errorHandler = -1;
   5131 
   5132     if (output == NULL) {
   5133         PyErr_BadArgument();
   5134         return -1;
   5135     }
   5136 
   5137     p = s;
   5138     end = s + length;
   5139     while (p < end) {
   5140         register Py_UNICODE ch = *p;
   5141         int decimal;
   5142         PyObject *repunicode;
   5143         Py_ssize_t repsize;
   5144         Py_ssize_t newpos;
   5145         Py_UNICODE *uni2;
   5146         Py_UNICODE *collstart;
   5147         Py_UNICODE *collend;
   5148 
   5149         if (Py_UNICODE_ISSPACE(ch)) {
   5150             *output++ = ' ';
   5151             ++p;
   5152             continue;
   5153         }
   5154         decimal = Py_UNICODE_TODECIMAL(ch);
   5155         if (decimal >= 0) {
   5156             *output++ = '0' + decimal;
   5157             ++p;
   5158             continue;
   5159         }
   5160         if (0 < ch && ch < 256) {
   5161             *output++ = (char)ch;
   5162             ++p;
   5163             continue;
   5164         }
   5165         /* All other characters are considered unencodable */
   5166         collstart = p;
   5167         collend = p+1;
   5168         while (collend < end) {
   5169             if ((0 < *collend && *collend < 256) ||
   5170                 !Py_UNICODE_ISSPACE(*collend) ||
   5171                 Py_UNICODE_TODECIMAL(*collend))
   5172                 break;
   5173         }
   5174         /* cache callback name lookup
   5175          * (if not done yet, i.e. it's the first error) */
   5176         if (known_errorHandler==-1) {
   5177             if ((errors==NULL) || (!strcmp(errors, "strict")))
   5178                 known_errorHandler = 1;
   5179             else if (!strcmp(errors, "replace"))
   5180                 known_errorHandler = 2;
   5181             else if (!strcmp(errors, "ignore"))
   5182                 known_errorHandler = 3;
   5183             else if (!strcmp(errors, "xmlcharrefreplace"))
   5184                 known_errorHandler = 4;
   5185             else
   5186                 known_errorHandler = 0;
   5187         }
   5188         switch (known_errorHandler) {
   5189         case 1: /* strict */
   5190             raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
   5191             goto onError;
   5192         case 2: /* replace */
   5193             for (p = collstart; p < collend; ++p)
   5194                 *output++ = '?';
   5195             /* fall through */
   5196         case 3: /* ignore */
   5197             p = collend;
   5198             break;
   5199         case 4: /* xmlcharrefreplace */
   5200             /* generate replacement (temporarily (mis)uses p) */
   5201             for (p = collstart; p < collend; ++p)
   5202                 output += sprintf(output, "&#%d;", (int)*p);
   5203             p = collend;
   5204             break;
   5205         default:
   5206             repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
   5207                                                           encoding, reason, s, length, &exc,
   5208                                                           collstart-s, collend-s, &newpos);
   5209             if (repunicode == NULL)
   5210                 goto onError;
   5211             /* generate replacement  */
   5212             repsize = PyUnicode_GET_SIZE(repunicode);
   5213             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
   5214                 Py_UNICODE ch = *uni2;
   5215                 if (Py_UNICODE_ISSPACE(ch))
   5216                     *output++ = ' ';
   5217                 else {
   5218                     decimal = Py_UNICODE_TODECIMAL(ch);
   5219                     if (decimal >= 0)
   5220                         *output++ = '0' + decimal;
   5221                     else if (0 < ch && ch < 256)
   5222                         *output++ = (char)ch;
   5223                     else {
   5224                         Py_DECREF(repunicode);
   5225                         raise_encode_exception(&exc, encoding,
   5226                                                s, length, collstart-s, collend-s, reason);
   5227                         goto onError;
   5228                     }
   5229                 }
   5230             }
   5231             p = s + newpos;
   5232             Py_DECREF(repunicode);
   5233         }
   5234     }
   5235     /* 0-terminate the output string */
   5236     *output++ = '\0';
   5237     Py_XDECREF(exc);
   5238     Py_XDECREF(errorHandler);
   5239     return 0;
   5240 
   5241   onError:
   5242     Py_XDECREF(exc);
   5243     Py_XDECREF(errorHandler);
   5244     return -1;
   5245 }
   5246 
   5247 /* --- Helpers ------------------------------------------------------------ */
   5248 
   5249 #include "stringlib/unicodedefs.h"
   5250 #include "stringlib/fastsearch.h"
   5251 
   5252 #include "stringlib/count.h"
   5253 #include "stringlib/find.h"
   5254 #include "stringlib/partition.h"
   5255 #include "stringlib/split.h"
   5256 
   5257 /* helper macro to fixup start/end slice values */
   5258 #define ADJUST_INDICES(start, end, len)         \
   5259     if (end > len)                              \
   5260         end = len;                              \
   5261     else if (end < 0) {                         \
   5262         end += len;                             \
   5263         if (end < 0)                            \
   5264             end = 0;                            \
   5265     }                                           \
   5266     if (start < 0) {                            \
   5267         start += len;                           \
   5268         if (start < 0)                          \
   5269             start = 0;                          \
   5270     }
   5271 
   5272 Py_ssize_t PyUnicode_Count(PyObject *str,
   5273                            PyObject *substr,
   5274                            Py_ssize_t start,
   5275                            Py_ssize_t end)
   5276 {
   5277     Py_ssize_t result;
   5278     PyUnicodeObject* str_obj;
   5279     PyUnicodeObject* sub_obj;
   5280 
   5281     str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
   5282     if (!str_obj)
   5283         return -1;
   5284     sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
   5285     if (!sub_obj) {
   5286         Py_DECREF(str_obj);
   5287         return -1;
   5288     }
   5289 
   5290     ADJUST_INDICES(start, end, str_obj->length);
   5291     result = stringlib_count(
   5292         str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
   5293         PY_SSIZE_T_MAX
   5294         );
   5295 
   5296     Py_DECREF(sub_obj);
   5297     Py_DECREF(str_obj);
   5298 
   5299     return result;
   5300 }
   5301 
   5302 Py_ssize_t PyUnicode_Find(PyObject *str,
   5303                           PyObject *sub,
   5304                           Py_ssize_t start,
   5305                           Py_ssize_t end,
   5306                           int direction)
   5307 {
   5308     Py_ssize_t result;
   5309 
   5310     str = PyUnicode_FromObject(str);
   5311     if (!str)
   5312         return -2;
   5313     sub = PyUnicode_FromObject(sub);
   5314     if (!sub) {
   5315         Py_DECREF(str);
   5316         return -2;
   5317     }
   5318 
   5319     if (direction > 0)
   5320         result = stringlib_find_slice(
   5321             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
   5322             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
   5323             start, end
   5324             );
   5325     else
   5326         result = stringlib_rfind_slice(
   5327             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
   5328             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
   5329             start, end
   5330             );
   5331 
   5332     Py_DECREF(str);
   5333     Py_DECREF(sub);
   5334 
   5335     return result;
   5336 }
   5337 
   5338 static
   5339 int tailmatch(PyUnicodeObject *self,
   5340               PyUnicodeObject *substring,
   5341               Py_ssize_t start,
   5342               Py_ssize_t end,
   5343               int direction)
   5344 {
   5345     if (substring->length == 0)
   5346         return 1;
   5347 
   5348     ADJUST_INDICES(start, end, self->length);
   5349     end -= substring->length;
   5350     if (end < start)
   5351         return 0;
   5352 
   5353     if (direction > 0) {
   5354         if (Py_UNICODE_MATCH(self, end, substring))
   5355             return 1;
   5356     } else {
   5357         if (Py_UNICODE_MATCH(self, start, substring))
   5358             return 1;
   5359     }
   5360 
   5361     return 0;
   5362 }
   5363 
   5364 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
   5365                                PyObject *substr,
   5366                                Py_ssize_t start,
   5367                                Py_ssize_t end,
   5368                                int direction)
   5369 {
   5370     Py_ssize_t result;
   5371 
   5372     str = PyUnicode_FromObject(str);
   5373     if (str == NULL)
   5374         return -1;
   5375     substr = PyUnicode_FromObject(substr);
   5376     if (substr == NULL) {
   5377         Py_DECREF(str);
   5378         return -1;
   5379     }
   5380 
   5381     result = tailmatch((PyUnicodeObject *)str,
   5382                        (PyUnicodeObject *)substr,
   5383                        start, end, direction);
   5384     Py_DECREF(str);
   5385     Py_DECREF(substr);
   5386     return result;
   5387 }
   5388 
   5389 /* Apply fixfct filter to the Unicode object self and return a
   5390    reference to the modified object */
   5391 
   5392 static
   5393 PyObject *fixup(PyUnicodeObject *self,
   5394                 int (*fixfct)(PyUnicodeObject *s))
   5395 {
   5396 
   5397     PyUnicodeObject *u;
   5398 
   5399     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
   5400     if (u == NULL)
   5401         return NULL;
   5402 
   5403     Py_UNICODE_COPY(u->str, self->str, self->length);
   5404 
   5405     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
   5406         /* fixfct should return TRUE if it modified the buffer. If
   5407            FALSE, return a reference to the original buffer instead
   5408            (to save space, not time) */
   5409         Py_INCREF(self);
   5410         Py_DECREF(u);
   5411         return (PyObject*) self;
   5412     }
   5413     return (PyObject*) u;
   5414 }
   5415 
   5416 static
   5417 int fixupper(PyUnicodeObject *self)
   5418 {
   5419     Py_ssize_t len = self->length;
   5420     Py_UNICODE *s = self->str;
   5421     int status = 0;
   5422 
   5423     while (len-- > 0) {
   5424         register Py_UNICODE ch;
   5425 
   5426         ch = Py_UNICODE_TOUPPER(*s);
   5427         if (ch != *s) {
   5428             status = 1;
   5429             *s = ch;
   5430         }
   5431         s++;
   5432     }
   5433 
   5434     return status;
   5435 }
   5436 
   5437 static
   5438 int fixlower(PyUnicodeObject *self)
   5439 {
   5440     Py_ssize_t len = self->length;
   5441     Py_UNICODE *s = self->str;
   5442     int status = 0;
   5443 
   5444     while (len-- > 0) {
   5445         register Py_UNICODE ch;
   5446 
   5447         ch = Py_UNICODE_TOLOWER(*s);
   5448         if (ch != *s) {
   5449             status = 1;
   5450             *s = ch;
   5451         }
   5452         s++;
   5453     }
   5454 
   5455     return status;
   5456 }
   5457 
   5458 static
   5459 int fixswapcase(PyUnicodeObject *self)
   5460 {
   5461     Py_ssize_t len = self->length;
   5462     Py_UNICODE *s = self->str;
   5463     int status = 0;
   5464 
   5465     while (len-- > 0) {
   5466         if (Py_UNICODE_ISUPPER(*s)) {
   5467             *s = Py_UNICODE_TOLOWER(*s);
   5468             status = 1;
   5469         } else if (Py_UNICODE_ISLOWER(*s)) {
   5470             *s = Py_UNICODE_TOUPPER(*s);
   5471             status = 1;
   5472         }
   5473         s++;
   5474     }
   5475 
   5476     return status;
   5477 }
   5478 
   5479 static
   5480 int fixcapitalize(PyUnicodeObject *self)
   5481 {
   5482     Py_ssize_t len = self->length;
   5483     Py_UNICODE *s = self->str;
   5484     int status = 0;
   5485 
   5486     if (len == 0)
   5487         return 0;
   5488     if (Py_UNICODE_ISLOWER(*s)) {
   5489         *s = Py_UNICODE_TOUPPER(*s);
   5490         status = 1;
   5491     }
   5492     s++;
   5493     while (--len > 0) {
   5494         if (Py_UNICODE_ISUPPER(*s)) {
   5495             *s = Py_UNICODE_TOLOWER(*s);
   5496             status = 1;
   5497         }
   5498         s++;
   5499     }
   5500     return status;
   5501 }
   5502 
   5503 static
   5504 int fixtitle(PyUnicodeObject *self)
   5505 {
   5506     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   5507     register Py_UNICODE *e;
   5508     int previous_is_cased;
   5509 
   5510     /* Shortcut for single character strings */
   5511     if (PyUnicode_GET_SIZE(self) == 1) {
   5512         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
   5513         if (*p != ch) {
   5514             *p = ch;
   5515             return 1;
   5516         }
   5517         else
   5518             return 0;
   5519     }
   5520 
   5521     e = p + PyUnicode_GET_SIZE(self);
   5522     previous_is_cased = 0;
   5523     for (; p < e; p++) {
   5524         register const Py_UNICODE ch = *p;
   5525 
   5526         if (previous_is_cased)
   5527             *p = Py_UNICODE_TOLOWER(ch);
   5528         else
   5529             *p = Py_UNICODE_TOTITLE(ch);
   5530 
   5531         if (Py_UNICODE_ISLOWER(ch) ||
   5532             Py_UNICODE_ISUPPER(ch) ||
   5533             Py_UNICODE_ISTITLE(ch))
   5534             previous_is_cased = 1;
   5535         else
   5536             previous_is_cased = 0;
   5537     }
   5538     return 1;
   5539 }
   5540 
   5541 PyObject *
   5542 PyUnicode_Join(PyObject *separator, PyObject *seq)
   5543 {
   5544     PyObject *internal_separator = NULL;
   5545     const Py_UNICODE blank = ' ';
   5546     const Py_UNICODE *sep = &blank;
   5547     Py_ssize_t seplen = 1;
   5548     PyUnicodeObject *res = NULL; /* the result */
   5549     Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
   5550     Py_ssize_t res_used;         /* # used bytes */
   5551     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
   5552     PyObject *fseq;          /* PySequence_Fast(seq) */
   5553     Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
   5554     PyObject *item;
   5555     Py_ssize_t i;
   5556 
   5557     fseq = PySequence_Fast(seq, "");
   5558     if (fseq == NULL) {
   5559         return NULL;
   5560     }
   5561 
   5562     /* Grrrr.  A codec may be invoked to convert str objects to
   5563      * Unicode, and so it's possible to call back into Python code
   5564      * during PyUnicode_FromObject(), and so it's possible for a sick
   5565      * codec to change the size of fseq (if seq is a list).  Therefore
   5566      * we have to keep refetching the size -- can't assume seqlen
   5567      * is invariant.
   5568      */
   5569     seqlen = PySequence_Fast_GET_SIZE(fseq);
   5570     /* If empty sequence, return u"". */
   5571     if (seqlen == 0) {
   5572         res = _PyUnicode_New(0);  /* empty sequence; return u"" */
   5573         goto Done;
   5574     }
   5575     /* If singleton sequence with an exact Unicode, return that. */
   5576     if (seqlen == 1) {
   5577         item = PySequence_Fast_GET_ITEM(fseq, 0);
   5578         if (PyUnicode_CheckExact(item)) {
   5579             Py_INCREF(item);
   5580             res = (PyUnicodeObject *)item;
   5581             goto Done;
   5582         }
   5583     }
   5584 
   5585     /* At least two items to join, or one that isn't exact Unicode. */
   5586     if (seqlen > 1) {
   5587         /* Set up sep and seplen -- they're needed. */
   5588         if (separator == NULL) {
   5589             sep = &blank;
   5590             seplen = 1;
   5591         }
   5592         else {
   5593             internal_separator = PyUnicode_FromObject(separator);
   5594             if (internal_separator == NULL)
   5595                 goto onError;
   5596             sep = PyUnicode_AS_UNICODE(internal_separator);
   5597             seplen = PyUnicode_GET_SIZE(internal_separator);
   5598             /* In case PyUnicode_FromObject() mutated seq. */
   5599             seqlen = PySequence_Fast_GET_SIZE(fseq);
   5600         }
   5601     }
   5602 
   5603     /* Get space. */
   5604     res = _PyUnicode_New(res_alloc);
   5605     if (res == NULL)
   5606         goto onError;
   5607     res_p = PyUnicode_AS_UNICODE(res);
   5608     res_used = 0;
   5609 
   5610     for (i = 0; i < seqlen; ++i) {
   5611         Py_ssize_t itemlen;
   5612         Py_ssize_t new_res_used;
   5613 
   5614         item = PySequence_Fast_GET_ITEM(fseq, i);
   5615         /* Convert item to Unicode. */
   5616         if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
   5617             PyErr_Format(PyExc_TypeError,
   5618                          "sequence item %zd: expected string or Unicode,"
   5619                          " %.80s found",
   5620                          i, Py_TYPE(item)->tp_name);
   5621             goto onError;
   5622         }
   5623         item = PyUnicode_FromObject(item);
   5624         if (item == NULL)
   5625             goto onError;
   5626         /* We own a reference to item from here on. */
   5627 
   5628         /* In case PyUnicode_FromObject() mutated seq. */
   5629         seqlen = PySequence_Fast_GET_SIZE(fseq);
   5630 
   5631         /* Make sure we have enough space for the separator and the item. */
   5632         itemlen = PyUnicode_GET_SIZE(item);
   5633         new_res_used = res_used + itemlen;
   5634         if (new_res_used < 0)
   5635             goto Overflow;
   5636         if (i < seqlen - 1) {
   5637             new_res_used += seplen;
   5638             if (new_res_used < 0)
   5639                 goto Overflow;
   5640         }
   5641         if (new_res_used > res_alloc) {
   5642             /* double allocated size until it's big enough */
   5643             do {
   5644                 res_alloc += res_alloc;
   5645                 if (res_alloc <= 0)
   5646                     goto Overflow;
   5647             } while (new_res_used > res_alloc);
   5648             if (_PyUnicode_Resize(&res, res_alloc) < 0) {
   5649                 Py_DECREF(item);
   5650                 goto onError;
   5651             }
   5652             res_p = PyUnicode_AS_UNICODE(res) + res_used;
   5653         }
   5654 
   5655         /* Copy item, and maybe the separator. */
   5656         Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
   5657         res_p += itemlen;
   5658         if (i < seqlen - 1) {
   5659             Py_UNICODE_COPY(res_p, sep, seplen);
   5660             res_p += seplen;
   5661         }
   5662         Py_DECREF(item);
   5663         res_used = new_res_used;
   5664     }
   5665 
   5666     /* Shrink res to match the used area; this probably can't fail,
   5667      * but it's cheap to check.
   5668      */
   5669     if (_PyUnicode_Resize(&res, res_used) < 0)
   5670         goto onError;
   5671 
   5672   Done:
   5673     Py_XDECREF(internal_separator);
   5674     Py_DECREF(fseq);
   5675     return (PyObject *)res;
   5676 
   5677   Overflow:
   5678     PyErr_SetString(PyExc_OverflowError,
   5679                     "join() result is too long for a Python string");
   5680     Py_DECREF(item);
   5681     /* fall through */
   5682 
   5683   onError:
   5684     Py_XDECREF(internal_separator);
   5685     Py_DECREF(fseq);
   5686     Py_XDECREF(res);
   5687     return NULL;
   5688 }
   5689 
   5690 static
   5691 PyUnicodeObject *pad(PyUnicodeObject *self,
   5692                      Py_ssize_t left,
   5693                      Py_ssize_t right,
   5694                      Py_UNICODE fill)
   5695 {
   5696     PyUnicodeObject *u;
   5697 
   5698     if (left < 0)
   5699         left = 0;
   5700     if (right < 0)
   5701         right = 0;
   5702 
   5703     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
   5704         Py_INCREF(self);
   5705         return self;
   5706     }
   5707 
   5708     if (left > PY_SSIZE_T_MAX - self->length ||
   5709         right > PY_SSIZE_T_MAX - (left + self->length)) {
   5710         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
   5711         return NULL;
   5712     }
   5713     u = _PyUnicode_New(left + self->length + right);
   5714     if (u) {
   5715         if (left)
   5716             Py_UNICODE_FILL(u->str, fill, left);
   5717         Py_UNICODE_COPY(u->str + left, self->str, self->length);
   5718         if (right)
   5719             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
   5720     }
   5721 
   5722     return u;
   5723 }
   5724 
   5725 PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
   5726 {
   5727     PyObject *list;
   5728 
   5729     string = PyUnicode_FromObject(string);
   5730     if (string == NULL)
   5731         return NULL;
   5732 
   5733     list = stringlib_splitlines(
   5734         (PyObject*) string, PyUnicode_AS_UNICODE(string),
   5735         PyUnicode_GET_SIZE(string), keepends);
   5736 
   5737     Py_DECREF(string);
   5738     return list;
   5739 }
   5740 
   5741 static
   5742 PyObject *split(PyUnicodeObject *self,
   5743                 PyUnicodeObject *substring,
   5744                 Py_ssize_t maxcount)
   5745 {
   5746     if (maxcount < 0)
   5747         maxcount = PY_SSIZE_T_MAX;
   5748 
   5749     if (substring == NULL)
   5750         return stringlib_split_whitespace(
   5751             (PyObject*) self,  self->str, self->length, maxcount
   5752             );
   5753 
   5754     return stringlib_split(
   5755         (PyObject*) self,  self->str, self->length,
   5756         substring->str, substring->length,
   5757         maxcount
   5758         );
   5759 }
   5760 
   5761 static
   5762 PyObject *rsplit(PyUnicodeObject *self,
   5763                  PyUnicodeObject *substring,
   5764                  Py_ssize_t maxcount)
   5765 {
   5766     if (maxcount < 0)
   5767         maxcount = PY_SSIZE_T_MAX;
   5768 
   5769     if (substring == NULL)
   5770         return stringlib_rsplit_whitespace(
   5771             (PyObject*) self,  self->str, self->length, maxcount
   5772             );
   5773 
   5774     return stringlib_rsplit(
   5775         (PyObject*) self,  self->str, self->length,
   5776         substring->str, substring->length,
   5777         maxcount
   5778         );
   5779 }
   5780 
   5781 static
   5782 PyObject *replace(PyUnicodeObject *self,
   5783                   PyUnicodeObject *str1,
   5784                   PyUnicodeObject *str2,
   5785                   Py_ssize_t maxcount)
   5786 {
   5787     PyUnicodeObject *u;
   5788 
   5789     if (maxcount < 0)
   5790         maxcount = PY_SSIZE_T_MAX;
   5791     else if (maxcount == 0 || self->length == 0)
   5792         goto nothing;
   5793 
   5794     if (str1->length == str2->length) {
   5795         Py_ssize_t i;
   5796         /* same length */
   5797         if (str1->length == 0)
   5798             goto nothing;
   5799         if (str1->length == 1) {
   5800             /* replace characters */
   5801             Py_UNICODE u1, u2;
   5802             if (!findchar(self->str, self->length, str1->str[0]))
   5803                 goto nothing;
   5804             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
   5805             if (!u)
   5806                 return NULL;
   5807             Py_UNICODE_COPY(u->str, self->str, self->length);
   5808             u1 = str1->str[0];
   5809             u2 = str2->str[0];
   5810             for (i = 0; i < u->length; i++)
   5811                 if (u->str[i] == u1) {
   5812                     if (--maxcount < 0)
   5813                         break;
   5814                     u->str[i] = u2;
   5815                 }
   5816         } else {
   5817             i = stringlib_find(
   5818                 self->str, self->length, str1->str, str1->length, 0
   5819                 );
   5820             if (i < 0)
   5821                 goto nothing;
   5822             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
   5823             if (!u)
   5824                 return NULL;
   5825             Py_UNICODE_COPY(u->str, self->str, self->length);
   5826 
   5827             /* change everything in-place, starting with this one */
   5828             Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
   5829             i += str1->length;
   5830 
   5831             while ( --maxcount > 0) {
   5832                 i = stringlib_find(self->str+i, self->length-i,
   5833                                    str1->str, str1->length,
   5834                                    i);
   5835                 if (i == -1)
   5836                     break;
   5837                 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
   5838                 i += str1->length;
   5839             }
   5840         }
   5841     } else {
   5842 
   5843         Py_ssize_t n, i, j;
   5844         Py_ssize_t product, new_size, delta;
   5845         Py_UNICODE *p;
   5846 
   5847         /* replace strings */
   5848         n = stringlib_count(self->str, self->length, str1->str, str1->length,
   5849                             maxcount);
   5850         if (n == 0)
   5851             goto nothing;
   5852         /* new_size = self->length + n * (str2->length - str1->length)); */
   5853         delta = (str2->length - str1->length);
   5854         if (delta == 0) {
   5855             new_size = self->length;
   5856         } else {
   5857             product = n * (str2->length - str1->length);
   5858             if ((product / (str2->length - str1->length)) != n) {
   5859                 PyErr_SetString(PyExc_OverflowError,
   5860                                 "replace string is too long");
   5861                 return NULL;
   5862             }
   5863             new_size = self->length + product;
   5864             if (new_size < 0) {
   5865                 PyErr_SetString(PyExc_OverflowError,
   5866                                 "replace string is too long");
   5867                 return NULL;
   5868             }
   5869         }
   5870         u = _PyUnicode_New(new_size);
   5871         if (!u)
   5872             return NULL;
   5873         i = 0;
   5874         p = u->str;
   5875         if (str1->length > 0) {
   5876             while (n-- > 0) {
   5877                 /* look for next match */
   5878                 j = stringlib_find(self->str+i, self->length-i,
   5879                                    str1->str, str1->length,
   5880                                    i);
   5881                 if (j == -1)
   5882                     break;
   5883                 else if (j > i) {
   5884                     /* copy unchanged part [i:j] */
   5885                     Py_UNICODE_COPY(p, self->str+i, j-i);
   5886                     p += j - i;
   5887                 }
   5888                 /* copy substitution string */
   5889                 if (str2->length > 0) {
   5890                     Py_UNICODE_COPY(p, str2->str, str2->length);
   5891                     p += str2->length;
   5892                 }
   5893                 i = j + str1->length;
   5894             }
   5895             if (i < self->length)
   5896                 /* copy tail [i:] */
   5897                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
   5898         } else {
   5899             /* interleave */
   5900             while (n > 0) {
   5901                 Py_UNICODE_COPY(p, str2->str, str2->length);
   5902                 p += str2->length;
   5903                 if (--n <= 0)
   5904                     break;
   5905                 *p++ = self->str[i++];
   5906             }
   5907             Py_UNICODE_COPY(p, self->str+i, self->length-i);
   5908         }
   5909     }
   5910     return (PyObject *) u;
   5911 
   5912   nothing:
   5913     /* nothing to replace; return original string (when possible) */
   5914     if (PyUnicode_CheckExact(self)) {
   5915         Py_INCREF(self);
   5916         return (PyObject *) self;
   5917     }
   5918     return PyUnicode_FromUnicode(self->str, self->length);
   5919 }
   5920 
   5921 /* --- Unicode Object Methods --------------------------------------------- */
   5922 
   5923 PyDoc_STRVAR(title__doc__,
   5924              "S.title() -> unicode\n\
   5925 \n\
   5926 Return a titlecased version of S, i.e. words start with title case\n\
   5927 characters, all remaining cased characters have lower case.");
   5928 
   5929 static PyObject*
   5930 unicode_title(PyUnicodeObject *self)
   5931 {
   5932     return fixup(self, fixtitle);
   5933 }
   5934 
   5935 PyDoc_STRVAR(capitalize__doc__,
   5936              "S.capitalize() -> unicode\n\
   5937 \n\
   5938 Return a capitalized version of S, i.e. make the first character\n\
   5939 have upper case and the rest lower case.");
   5940 
   5941 static PyObject*
   5942 unicode_capitalize(PyUnicodeObject *self)
   5943 {
   5944     return fixup(self, fixcapitalize);
   5945 }
   5946 
/* NOTE: everything up to the matching #endif is compiled out. */
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split `self` on whitespace, capitalize every word via
   fixup(..., fixcapitalize), and join the words again with
   PyUnicode_Join(NULL, list).  Returns NULL if the split or any
   capitalization fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        /* item is NULL here, so the shared exit below returns NULL */
        if (item == NULL)
            goto onError;
        /* drop the old word, then store the capitalized one in its slot */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

    /* Shared exit for success and failure: the list is always released
       and `item` holds either the joined result or NULL. */
  onError:
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
   5984 
   5985 /* Argument converter.  Coerces to a single unicode character */
   5986 
   5987 static int
   5988 convert_uc(PyObject *obj, void *addr)
   5989 {
   5990     Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
   5991     PyObject *uniobj;
   5992     Py_UNICODE *unistr;
   5993 
   5994     uniobj = PyUnicode_FromObject(obj);
   5995     if (uniobj == NULL) {
   5996         PyErr_SetString(PyExc_TypeError,
   5997                         "The fill character cannot be converted to Unicode");
   5998         return 0;
   5999     }
   6000     if (PyUnicode_GET_SIZE(uniobj) != 1) {
   6001         PyErr_SetString(PyExc_TypeError,
   6002                         "The fill character must be exactly one character long");
   6003         Py_DECREF(uniobj);
   6004         return 0;
   6005     }
   6006     unistr = PyUnicode_AS_UNICODE(uniobj);
   6007     *fillcharloc = unistr[0];
   6008     Py_DECREF(uniobj);
   6009     return 1;
   6010 }
   6011 
   6012 PyDoc_STRVAR(center__doc__,
   6013              "S.center(width[, fillchar]) -> unicode\n\
   6014 \n\
   6015 Return S centered in a Unicode string of length width. Padding is\n\
   6016 done using the specified fill character (default is a space)");
   6017 
   6018 static PyObject *
   6019 unicode_center(PyUnicodeObject *self, PyObject *args)
   6020 {
   6021     Py_ssize_t marg, left;
   6022     Py_ssize_t width;
   6023     Py_UNICODE fillchar = ' ';
   6024 
   6025     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
   6026         return NULL;
   6027 
   6028     if (self->length >= width && PyUnicode_CheckExact(self)) {
   6029         Py_INCREF(self);
   6030         return (PyObject*) self;
   6031     }
   6032 
   6033     marg = width - self->length;
   6034     left = marg / 2 + (marg & width & 1);
   6035 
   6036     return (PyObject*) pad(self, left, marg - left, fillchar);
   6037 }
   6038 
   6039 #if 0
   6040 
   6041 /* This code should go into some future Unicode collation support
   6042    module. The basic comparison should compare ordinals on a naive
   6043    basis (this is what Java does and thus Jython too). */
   6044 
   6045 /* speedy UTF-16 code point order comparison */
   6046 /* gleaned from: */
   6047 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
   6048 
   6049 static short utf16Fixup[32] =
   6050 {
   6051     0, 0, 0, 0, 0, 0, 0, 0,
   6052     0, 0, 0, 0, 0, 0, 0, 0,
   6053     0, 0, 0, 0, 0, 0, 0, 0,
   6054     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
   6055 };
   6056 
   6057 static int
   6058 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
   6059 {
   6060     Py_ssize_t len1, len2;
   6061 
   6062     Py_UNICODE *s1 = str1->str;
   6063     Py_UNICODE *s2 = str2->str;
   6064 
   6065     len1 = str1->length;
   6066     len2 = str2->length;
   6067 
   6068     while (len1 > 0 && len2 > 0) {
   6069         Py_UNICODE c1, c2;
   6070 
   6071         c1 = *s1++;
   6072         c2 = *s2++;
   6073 
   6074         if (c1 > (1<<11) * 26)
   6075             c1 += utf16Fixup[c1>>11];
   6076         if (c2 > (1<<11) * 26)
   6077             c2 += utf16Fixup[c2>>11];
   6078         /* now c1 and c2 are in UTF-32-compatible order */
   6079 
   6080         if (c1 != c2)
   6081             return (c1 < c2) ? -1 : 1;
   6082 
   6083         len1--; len2--;
   6084     }
   6085 
   6086     return (len1 < len2) ? -1 : (len1 != len2);
   6087 }
   6088 
   6089 #else
   6090 
   6091 static int
   6092 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
   6093 {
   6094     register Py_ssize_t len1, len2;
   6095 
   6096     Py_UNICODE *s1 = str1->str;
   6097     Py_UNICODE *s2 = str2->str;
   6098 
   6099     len1 = str1->length;
   6100     len2 = str2->length;
   6101 
   6102     while (len1 > 0 && len2 > 0) {
   6103         Py_UNICODE c1, c2;
   6104 
   6105         c1 = *s1++;
   6106         c2 = *s2++;
   6107 
   6108         if (c1 != c2)
   6109             return (c1 < c2) ? -1 : 1;
   6110 
   6111         len1--; len2--;
   6112     }
   6113 
   6114     return (len1 < len2) ? -1 : (len1 != len2);
   6115 }
   6116 
   6117 #endif
   6118 
   6119 int PyUnicode_Compare(PyObject *left,
   6120                       PyObject *right)
   6121 {
   6122     PyUnicodeObject *u = NULL, *v = NULL;
   6123     int result;
   6124 
   6125     /* Coerce the two arguments */
   6126     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
   6127     if (u == NULL)
   6128         goto onError;
   6129     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
   6130     if (v == NULL)
   6131         goto onError;
   6132 
   6133     /* Shortcut for empty or interned objects */
   6134     if (v == u) {
   6135         Py_DECREF(u);
   6136         Py_DECREF(v);
   6137         return 0;
   6138     }
   6139 
   6140     result = unicode_compare(u, v);
   6141 
   6142     Py_DECREF(u);
   6143     Py_DECREF(v);
   6144     return result;
   6145 
   6146   onError:
   6147     Py_XDECREF(u);
   6148     Py_XDECREF(v);
   6149     return -1;
   6150 }
   6151 
   6152 PyObject *PyUnicode_RichCompare(PyObject *left,
   6153                                 PyObject *right,
   6154                                 int op)
   6155 {
   6156     int result;
   6157 
   6158     result = PyUnicode_Compare(left, right);
   6159     if (result == -1 && PyErr_Occurred())
   6160         goto onError;
   6161 
   6162     /* Convert the return value to a Boolean */
   6163     switch (op) {
   6164     case Py_EQ:
   6165         result = (result == 0);
   6166         break;
   6167     case Py_NE:
   6168         result = (result != 0);
   6169         break;
   6170     case Py_LE:
   6171         result = (result <= 0);
   6172         break;
   6173     case Py_GE:
   6174         result = (result >= 0);
   6175         break;
   6176     case Py_LT:
   6177         result = (result == -1);
   6178         break;
   6179     case Py_GT:
   6180         result = (result == 1);
   6181         break;
   6182     }
   6183     return PyBool_FromLong(result);
   6184 
   6185   onError:
   6186 
   6187     /* Standard case
   6188 
   6189        Type errors mean that PyUnicode_FromObject() could not convert
   6190        one of the arguments (usually the right hand side) to Unicode,
   6191        ie. we can't handle the comparison request. However, it is
   6192        possible that the other object knows a comparison method, which
   6193        is why we return Py_NotImplemented to give the other object a
   6194        chance.
   6195 
   6196     */
   6197     if (PyErr_ExceptionMatches(PyExc_TypeError)) {
   6198         PyErr_Clear();
   6199         Py_INCREF(Py_NotImplemented);
   6200         return Py_NotImplemented;
   6201     }
   6202     if (op != Py_EQ && op != Py_NE)
   6203         return NULL;
   6204 
   6205     /* Equality comparison.
   6206 
   6207        This is a special case: we silence any PyExc_UnicodeDecodeError
   6208        and instead turn it into a PyErr_UnicodeWarning.
   6209 
   6210     */
   6211     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
   6212         return NULL;
   6213     PyErr_Clear();
   6214     if (PyErr_Warn(PyExc_UnicodeWarning,
   6215                    (op == Py_EQ) ?
   6216                    "Unicode equal comparison "
   6217                    "failed to convert both arguments to Unicode - "
   6218                    "interpreting them as being unequal" :
   6219                    "Unicode unequal comparison "
   6220                    "failed to convert both arguments to Unicode - "
   6221                    "interpreting them as being unequal"
   6222             ) < 0)
   6223         return NULL;
   6224     result = (op == Py_NE);
   6225     return PyBool_FromLong(result);
   6226 }
   6227 
   6228 int PyUnicode_Contains(PyObject *container,
   6229                        PyObject *element)
   6230 {
   6231     PyObject *str, *sub;
   6232     int result;
   6233 
   6234     /* Coerce the two arguments */
   6235     sub = PyUnicode_FromObject(element);
   6236     if (!sub) {
   6237         return -1;
   6238     }
   6239 
   6240     str = PyUnicode_FromObject(container);
   6241     if (!str) {
   6242         Py_DECREF(sub);
   6243         return -1;
   6244     }
   6245 
   6246     result = stringlib_contains_obj(str, sub);
   6247 
   6248     Py_DECREF(str);
   6249     Py_DECREF(sub);
   6250 
   6251     return result;
   6252 }
   6253 
   6254 /* Concat to string or Unicode object giving a new Unicode object. */
   6255 
   6256 PyObject *PyUnicode_Concat(PyObject *left,
   6257                            PyObject *right)
   6258 {
   6259     PyUnicodeObject *u = NULL, *v = NULL, *w;
   6260 
   6261     /* Coerce the two arguments */
   6262     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
   6263     if (u == NULL)
   6264         goto onError;
   6265     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
   6266     if (v == NULL)
   6267         goto onError;
   6268 
   6269     /* Shortcuts */
   6270     if (v == unicode_empty) {
   6271         Py_DECREF(v);
   6272         return (PyObject *)u;
   6273     }
   6274     if (u == unicode_empty) {
   6275         Py_DECREF(u);
   6276         return (PyObject *)v;
   6277     }
   6278 
   6279     /* Concat the two Unicode strings */
   6280     w = _PyUnicode_New(u->length + v->length);
   6281     if (w == NULL)
   6282         goto onError;
   6283     Py_UNICODE_COPY(w->str, u->str, u->length);
   6284     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
   6285 
   6286     Py_DECREF(u);
   6287     Py_DECREF(v);
   6288     return (PyObject *)w;
   6289 
   6290   onError:
   6291     Py_XDECREF(u);
   6292     Py_XDECREF(v);
   6293     return NULL;
   6294 }
   6295 
   6296 PyDoc_STRVAR(count__doc__,
   6297              "S.count(sub[, start[, end]]) -> int\n\
   6298 \n\
   6299 Return the number of non-overlapping occurrences of substring sub in\n\
   6300 Unicode string S[start:end].  Optional arguments start and end are\n\
   6301 interpreted as in slice notation.");
   6302 
   6303 static PyObject *
   6304 unicode_count(PyUnicodeObject *self, PyObject *args)
   6305 {
   6306     PyUnicodeObject *substring;
   6307     Py_ssize_t start = 0;
   6308     Py_ssize_t end = PY_SSIZE_T_MAX;
   6309     PyObject *result;
   6310 
   6311     if (!stringlib_parse_args_finds_unicode("count", args, &substring,
   6312                                             &start, &end))
   6313         return NULL;
   6314 
   6315     ADJUST_INDICES(start, end, self->length);
   6316     result = PyInt_FromSsize_t(
   6317         stringlib_count(self->str + start, end - start,
   6318                         substring->str, substring->length,
   6319                         PY_SSIZE_T_MAX)
   6320         );
   6321 
   6322     Py_DECREF(substring);
   6323 
   6324     return result;
   6325 }
   6326 
   6327 PyDoc_STRVAR(encode__doc__,
   6328              "S.encode([encoding[,errors]]) -> string or unicode\n\
   6329 \n\
   6330 Encodes S using the codec registered for encoding. encoding defaults\n\
   6331 to the default encoding. errors may be given to set a different error\n\
   6332 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
   6333 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
   6334 'xmlcharrefreplace' as well as any other name registered with\n\
   6335 codecs.register_error that can handle UnicodeEncodeErrors.");
   6336 
   6337 static PyObject *
   6338 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
   6339 {
   6340     static char *kwlist[] = {"encoding", "errors", 0};
   6341     char *encoding = NULL;
   6342     char *errors = NULL;
   6343     PyObject *v;
   6344 
   6345     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
   6346                                      kwlist, &encoding, &errors))
   6347         return NULL;
   6348     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
   6349     if (v == NULL)
   6350         goto onError;
   6351     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
   6352         PyErr_Format(PyExc_TypeError,
   6353                      "encoder did not return a string/unicode object "
   6354                      "(type=%.400s)",
   6355                      Py_TYPE(v)->tp_name);
   6356         Py_DECREF(v);
   6357         return NULL;
   6358     }
   6359     return v;
   6360 
   6361   onError:
   6362     return NULL;
   6363 }
   6364 
   6365 PyDoc_STRVAR(decode__doc__,
   6366              "S.decode([encoding[,errors]]) -> string or unicode\n\
   6367 \n\
   6368 Decodes S using the codec registered for encoding. encoding defaults\n\
   6369 to the default encoding. errors may be given to set a different error\n\
   6370 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
   6371 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
   6372 as well as any other name registerd with codecs.register_error that is\n\
   6373 able to handle UnicodeDecodeErrors.");
   6374 
   6375 static PyObject *
   6376 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
   6377 {
   6378     static char *kwlist[] = {"encoding", "errors", 0};
   6379     char *encoding = NULL;
   6380     char *errors = NULL;
   6381     PyObject *v;
   6382 
   6383     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
   6384                                      kwlist, &encoding, &errors))
   6385         return NULL;
   6386     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
   6387     if (v == NULL)
   6388         goto onError;
   6389     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
   6390         PyErr_Format(PyExc_TypeError,
   6391                      "decoder did not return a string/unicode object "
   6392                      "(type=%.400s)",
   6393                      Py_TYPE(v)->tp_name);
   6394         Py_DECREF(v);
   6395         return NULL;
   6396     }
   6397     return v;
   6398 
   6399   onError:
   6400     return NULL;
   6401 }
   6402 
PyDoc_STRVAR(expandtabs__doc__,
             "S.expandtabs([tabsize]) -> unicode\n\
\n\
Return a copy of S where all tab characters are expanded using spaces.\n\
If tabsize is not given, a tab size of 8 characters is assumed.");

/* Method implementation for S.expandtabs([tabsize]).

   Works in two passes over the input: the first computes the exact
   length of the expanded string (checking every addition against
   PY_SSIZE_T_MAX), the second allocates the result and fills it.
   The column counter restarts after every '\n' or '\r'.  With a
   tabsize <= 0, tab characters contribute nothing to the output.

   Returns NULL with OverflowError set if the result would be too long,
   or NULL if argument parsing or allocation fails. */
static PyObject*
unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
{
    Py_UNICODE *e;
    Py_UNICODE *p;
    Py_UNICODE *q;
    Py_UNICODE *qe;
    Py_ssize_t i, j, incr;
    PyUnicodeObject *u;
    int tabsize = 8;

    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
        return NULL;

    /* First pass: determine size of output string */
    i = 0; /* chars up to and including most recent \n or \r */
    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
    e = self->str + self->length; /* end of input */
    for (p = self->str; p < e; p++)
        if (*p == '\t') {
            if (tabsize > 0) {
                /* advance the column to the next multiple of tabsize */
                incr = tabsize - (j % tabsize); /* cannot overflow */
                if (j > PY_SSIZE_T_MAX - incr)
                    goto overflow1;
                j += incr;
            }
        }
        else {
            if (j > PY_SSIZE_T_MAX - 1)
                goto overflow1;
            j++;
            if (*p == '\n' || *p == '\r') {
                /* line finished: fold its length into the running total
                   and restart the column counter */
                if (i > PY_SSIZE_T_MAX - j)
                    goto overflow1;
                i += j;
                j = 0;
            }
        }

    /* the total is i + j (completed lines plus the unterminated tail) */
    if (i > PY_SSIZE_T_MAX - j)
        goto overflow1;

    /* Second pass: create output string and fill it */
    u = _PyUnicode_New(i + j);
    if (!u)
        return NULL;

    j = 0; /* same as in first pass */
    q = u->str; /* next output char */
    qe = u->str + u->length; /* end of output */

    for (p = self->str; p < e; p++)
        if (*p == '\t') {
            if (tabsize > 0) {
                /* i is reused here as the number of spaces to emit */
                i = tabsize - (j % tabsize);
                j += i;
                while (i--) {
                    /* never write past the buffer sized by the first pass */
                    if (q >= qe)
                        goto overflow2;
                    *q++ = ' ';
                }
            }
        }
        else {
            if (q >= qe)
                goto overflow2;
            *q++ = *p;
            j++;
            if (*p == '\n' || *p == '\r')
                j = 0;
        }

    return (PyObject*) u;

    /* overflow2 additionally releases the already allocated result and
       then falls through to the common error report */
  overflow2:
    Py_DECREF(u);
  overflow1:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
   6489 
   6490 PyDoc_STRVAR(find__doc__,
   6491              "S.find(sub [,start [,end]]) -> int\n\
   6492 \n\
   6493 Return the lowest index in S where substring sub is found,\n\
   6494 such that sub is contained within s[start:end].  Optional\n\
   6495 arguments start and end are interpreted as in slice notation.\n\
   6496 \n\
   6497 Return -1 on failure.");
   6498 
   6499 static PyObject *
   6500 unicode_find(PyUnicodeObject *self, PyObject *args)
   6501 {
   6502     PyUnicodeObject *substring;
   6503     Py_ssize_t start;
   6504     Py_ssize_t end;
   6505     Py_ssize_t result;
   6506 
   6507     if (!stringlib_parse_args_finds_unicode("find", args, &substring,
   6508                                             &start, &end))
   6509         return NULL;
   6510 
   6511     result = stringlib_find_slice(
   6512         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
   6513         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
   6514         start, end
   6515         );
   6516 
   6517     Py_DECREF(substring);
   6518 
   6519     return PyInt_FromSsize_t(result);
   6520 }
   6521 
   6522 static PyObject *
   6523 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
   6524 {
   6525     if (index < 0 || index >= self->length) {
   6526         PyErr_SetString(PyExc_IndexError, "string index out of range");
   6527         return NULL;
   6528     }
   6529 
   6530     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
   6531 }
   6532 
   6533 static long
   6534 unicode_hash(PyUnicodeObject *self)
   6535 {
   6536     /* Since Unicode objects compare equal to their ASCII string
   6537        counterparts, they should use the individual character values
   6538        as basis for their hash value.  This is needed to assure that
   6539        strings and Unicode objects behave in the same way as
   6540        dictionary keys. */
   6541 
   6542     register Py_ssize_t len;
   6543     register Py_UNICODE *p;
   6544     register long x;
   6545 
   6546     if (self->hash != -1)
   6547         return self->hash;
   6548     len = PyUnicode_GET_SIZE(self);
   6549     p = PyUnicode_AS_UNICODE(self);
   6550     x = *p << 7;
   6551     while (--len >= 0)
   6552         x = (1000003*x) ^ *p++;
   6553     x ^= PyUnicode_GET_SIZE(self);
   6554     if (x == -1)
   6555         x = -2;
   6556     self->hash = x;
   6557     return x;
   6558 }
   6559 
   6560 PyDoc_STRVAR(index__doc__,
   6561              "S.index(sub [,start [,end]]) -> int\n\
   6562 \n\
   6563 Like S.find() but raise ValueError when the substring is not found.");
   6564 
   6565 static PyObject *
   6566 unicode_index(PyUnicodeObject *self, PyObject *args)
   6567 {
   6568     Py_ssize_t result;
   6569     PyUnicodeObject *substring;
   6570     Py_ssize_t start;
   6571     Py_ssize_t end;
   6572 
   6573     if (!stringlib_parse_args_finds_unicode("index", args, &substring,
   6574                                             &start, &end))
   6575         return NULL;
   6576 
   6577     result = stringlib_find_slice(
   6578         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
   6579         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
   6580         start, end
   6581         );
   6582 
   6583     Py_DECREF(substring);
   6584 
   6585     if (result < 0) {
   6586         PyErr_SetString(PyExc_ValueError, "substring not found");
   6587         return NULL;
   6588     }
   6589 
   6590     return PyInt_FromSsize_t(result);
   6591 }
   6592 
   6593 PyDoc_STRVAR(islower__doc__,
   6594              "S.islower() -> bool\n\
   6595 \n\
   6596 Return True if all cased characters in S are lowercase and there is\n\
   6597 at least one cased character in S, False otherwise.");
   6598 
   6599 static PyObject*
   6600 unicode_islower(PyUnicodeObject *self)
   6601 {
   6602     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6603     register const Py_UNICODE *e;
   6604     int cased;
   6605 
   6606     /* Shortcut for single character strings */
   6607     if (PyUnicode_GET_SIZE(self) == 1)
   6608         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
   6609 
   6610     /* Special case for empty strings */
   6611     if (PyUnicode_GET_SIZE(self) == 0)
   6612         return PyBool_FromLong(0);
   6613 
   6614     e = p + PyUnicode_GET_SIZE(self);
   6615     cased = 0;
   6616     for (; p < e; p++) {
   6617         register const Py_UNICODE ch = *p;
   6618 
   6619         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
   6620             return PyBool_FromLong(0);
   6621         else if (!cased && Py_UNICODE_ISLOWER(ch))
   6622             cased = 1;
   6623     }
   6624     return PyBool_FromLong(cased);
   6625 }
   6626 
   6627 PyDoc_STRVAR(isupper__doc__,
   6628              "S.isupper() -> bool\n\
   6629 \n\
   6630 Return True if all cased characters in S are uppercase and there is\n\
   6631 at least one cased character in S, False otherwise.");
   6632 
   6633 static PyObject*
   6634 unicode_isupper(PyUnicodeObject *self)
   6635 {
   6636     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6637     register const Py_UNICODE *e;
   6638     int cased;
   6639 
   6640     /* Shortcut for single character strings */
   6641     if (PyUnicode_GET_SIZE(self) == 1)
   6642         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
   6643 
   6644     /* Special case for empty strings */
   6645     if (PyUnicode_GET_SIZE(self) == 0)
   6646         return PyBool_FromLong(0);
   6647 
   6648     e = p + PyUnicode_GET_SIZE(self);
   6649     cased = 0;
   6650     for (; p < e; p++) {
   6651         register const Py_UNICODE ch = *p;
   6652 
   6653         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
   6654             return PyBool_FromLong(0);
   6655         else if (!cased && Py_UNICODE_ISUPPER(ch))
   6656             cased = 1;
   6657     }
   6658     return PyBool_FromLong(cased);
   6659 }
   6660 
   6661 PyDoc_STRVAR(istitle__doc__,
   6662              "S.istitle() -> bool\n\
   6663 \n\
   6664 Return True if S is a titlecased string and there is at least one\n\
   6665 character in S, i.e. upper- and titlecase characters may only\n\
   6666 follow uncased characters and lowercase characters only cased ones.\n\
   6667 Return False otherwise.");
   6668 
   6669 static PyObject*
   6670 unicode_istitle(PyUnicodeObject *self)
   6671 {
   6672     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6673     register const Py_UNICODE *e;
   6674     int cased, previous_is_cased;
   6675 
   6676     /* Shortcut for single character strings */
   6677     if (PyUnicode_GET_SIZE(self) == 1)
   6678         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
   6679                                (Py_UNICODE_ISUPPER(*p) != 0));
   6680 
   6681     /* Special case for empty strings */
   6682     if (PyUnicode_GET_SIZE(self) == 0)
   6683         return PyBool_FromLong(0);
   6684 
   6685     e = p + PyUnicode_GET_SIZE(self);
   6686     cased = 0;
   6687     previous_is_cased = 0;
   6688     for (; p < e; p++) {
   6689         register const Py_UNICODE ch = *p;
   6690 
   6691         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
   6692             if (previous_is_cased)
   6693                 return PyBool_FromLong(0);
   6694             previous_is_cased = 1;
   6695             cased = 1;
   6696         }
   6697         else if (Py_UNICODE_ISLOWER(ch)) {
   6698             if (!previous_is_cased)
   6699                 return PyBool_FromLong(0);
   6700             previous_is_cased = 1;
   6701             cased = 1;
   6702         }
   6703         else
   6704             previous_is_cased = 0;
   6705     }
   6706     return PyBool_FromLong(cased);
   6707 }
   6708 
   6709 PyDoc_STRVAR(isspace__doc__,
   6710              "S.isspace() -> bool\n\
   6711 \n\
   6712 Return True if all characters in S are whitespace\n\
   6713 and there is at least one character in S, False otherwise.");
   6714 
   6715 static PyObject*
   6716 unicode_isspace(PyUnicodeObject *self)
   6717 {
   6718     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6719     register const Py_UNICODE *e;
   6720 
   6721     /* Shortcut for single character strings */
   6722     if (PyUnicode_GET_SIZE(self) == 1 &&
   6723         Py_UNICODE_ISSPACE(*p))
   6724         return PyBool_FromLong(1);
   6725 
   6726     /* Special case for empty strings */
   6727     if (PyUnicode_GET_SIZE(self) == 0)
   6728         return PyBool_FromLong(0);
   6729 
   6730     e = p + PyUnicode_GET_SIZE(self);
   6731     for (; p < e; p++) {
   6732         if (!Py_UNICODE_ISSPACE(*p))
   6733             return PyBool_FromLong(0);
   6734     }
   6735     return PyBool_FromLong(1);
   6736 }
   6737 
   6738 PyDoc_STRVAR(isalpha__doc__,
   6739              "S.isalpha() -> bool\n\
   6740 \n\
   6741 Return True if all characters in S are alphabetic\n\
   6742 and there is at least one character in S, False otherwise.");
   6743 
   6744 static PyObject*
   6745 unicode_isalpha(PyUnicodeObject *self)
   6746 {
   6747     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6748     register const Py_UNICODE *e;
   6749 
   6750     /* Shortcut for single character strings */
   6751     if (PyUnicode_GET_SIZE(self) == 1 &&
   6752         Py_UNICODE_ISALPHA(*p))
   6753         return PyBool_FromLong(1);
   6754 
   6755     /* Special case for empty strings */
   6756     if (PyUnicode_GET_SIZE(self) == 0)
   6757         return PyBool_FromLong(0);
   6758 
   6759     e = p + PyUnicode_GET_SIZE(self);
   6760     for (; p < e; p++) {
   6761         if (!Py_UNICODE_ISALPHA(*p))
   6762             return PyBool_FromLong(0);
   6763     }
   6764     return PyBool_FromLong(1);
   6765 }
   6766 
   6767 PyDoc_STRVAR(isalnum__doc__,
   6768              "S.isalnum() -> bool\n\
   6769 \n\
   6770 Return True if all characters in S are alphanumeric\n\
   6771 and there is at least one character in S, False otherwise.");
   6772 
   6773 static PyObject*
   6774 unicode_isalnum(PyUnicodeObject *self)
   6775 {
   6776     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6777     register const Py_UNICODE *e;
   6778 
   6779     /* Shortcut for single character strings */
   6780     if (PyUnicode_GET_SIZE(self) == 1 &&
   6781         Py_UNICODE_ISALNUM(*p))
   6782         return PyBool_FromLong(1);
   6783 
   6784     /* Special case for empty strings */
   6785     if (PyUnicode_GET_SIZE(self) == 0)
   6786         return PyBool_FromLong(0);
   6787 
   6788     e = p + PyUnicode_GET_SIZE(self);
   6789     for (; p < e; p++) {
   6790         if (!Py_UNICODE_ISALNUM(*p))
   6791             return PyBool_FromLong(0);
   6792     }
   6793     return PyBool_FromLong(1);
   6794 }
   6795 
   6796 PyDoc_STRVAR(isdecimal__doc__,
   6797              "S.isdecimal() -> bool\n\
   6798 \n\
   6799 Return True if there are only decimal characters in S,\n\
   6800 False otherwise.");
   6801 
   6802 static PyObject*
   6803 unicode_isdecimal(PyUnicodeObject *self)
   6804 {
   6805     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6806     register const Py_UNICODE *e;
   6807 
   6808     /* Shortcut for single character strings */
   6809     if (PyUnicode_GET_SIZE(self) == 1 &&
   6810         Py_UNICODE_ISDECIMAL(*p))
   6811         return PyBool_FromLong(1);
   6812 
   6813     /* Special case for empty strings */
   6814     if (PyUnicode_GET_SIZE(self) == 0)
   6815         return PyBool_FromLong(0);
   6816 
   6817     e = p + PyUnicode_GET_SIZE(self);
   6818     for (; p < e; p++) {
   6819         if (!Py_UNICODE_ISDECIMAL(*p))
   6820             return PyBool_FromLong(0);
   6821     }
   6822     return PyBool_FromLong(1);
   6823 }
   6824 
   6825 PyDoc_STRVAR(isdigit__doc__,
   6826              "S.isdigit() -> bool\n\
   6827 \n\
   6828 Return True if all characters in S are digits\n\
   6829 and there is at least one character in S, False otherwise.");
   6830 
   6831 static PyObject*
   6832 unicode_isdigit(PyUnicodeObject *self)
   6833 {
   6834     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6835     register const Py_UNICODE *e;
   6836 
   6837     /* Shortcut for single character strings */
   6838     if (PyUnicode_GET_SIZE(self) == 1 &&
   6839         Py_UNICODE_ISDIGIT(*p))
   6840         return PyBool_FromLong(1);
   6841 
   6842     /* Special case for empty strings */
   6843     if (PyUnicode_GET_SIZE(self) == 0)
   6844         return PyBool_FromLong(0);
   6845 
   6846     e = p + PyUnicode_GET_SIZE(self);
   6847     for (; p < e; p++) {
   6848         if (!Py_UNICODE_ISDIGIT(*p))
   6849             return PyBool_FromLong(0);
   6850     }
   6851     return PyBool_FromLong(1);
   6852 }
   6853 
   6854 PyDoc_STRVAR(isnumeric__doc__,
   6855              "S.isnumeric() -> bool\n\
   6856 \n\
   6857 Return True if there are only numeric characters in S,\n\
   6858 False otherwise.");
   6859 
   6860 static PyObject*
   6861 unicode_isnumeric(PyUnicodeObject *self)
   6862 {
   6863     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
   6864     register const Py_UNICODE *e;
   6865 
   6866     /* Shortcut for single character strings */
   6867     if (PyUnicode_GET_SIZE(self) == 1 &&
   6868         Py_UNICODE_ISNUMERIC(*p))
   6869         return PyBool_FromLong(1);
   6870 
   6871     /* Special case for empty strings */
   6872     if (PyUnicode_GET_SIZE(self) == 0)
   6873         return PyBool_FromLong(0);
   6874 
   6875     e = p + PyUnicode_GET_SIZE(self);
   6876     for (; p < e; p++) {
   6877         if (!Py_UNICODE_ISNUMERIC(*p))
   6878             return PyBool_FromLong(0);
   6879     }
   6880     return PyBool_FromLong(1);
   6881 }
   6882 
   6883 PyDoc_STRVAR(join__doc__,
   6884              "S.join(iterable) -> unicode\n\
   6885 \n\
   6886 Return a string which is the concatenation of the strings in the\n\
   6887 iterable.  The separator between elements is S.");
   6888 
   6889 static PyObject*
   6890 unicode_join(PyObject *self, PyObject *data)
   6891 {
   6892     return PyUnicode_Join(self, data);
   6893 }
   6894 
   6895 static Py_ssize_t
   6896 unicode_length(PyUnicodeObject *self)
   6897 {
   6898     return self->length;
   6899 }
   6900 
   6901 PyDoc_STRVAR(ljust__doc__,
   6902              "S.ljust(width[, fillchar]) -> int\n\
   6903 \n\
   6904 Return S left-justified in a Unicode string of length width. Padding is\n\
   6905 done using the specified fill character (default is a space).");
   6906 
   6907 static PyObject *
   6908 unicode_ljust(PyUnicodeObject *self, PyObject *args)
   6909 {
   6910     Py_ssize_t width;
   6911     Py_UNICODE fillchar = ' ';
   6912 
   6913     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
   6914         return NULL;
   6915 
   6916     if (self->length >= width && PyUnicode_CheckExact(self)) {
   6917         Py_INCREF(self);
   6918         return (PyObject*) self;
   6919     }
   6920 
   6921     return (PyObject*) pad(self, 0, width - self->length, fillchar);
   6922 }
   6923 
   6924 PyDoc_STRVAR(lower__doc__,
   6925              "S.lower() -> unicode\n\
   6926 \n\
   6927 Return a copy of the string S converted to lowercase.");
   6928 
   6929 static PyObject*
   6930 unicode_lower(PyUnicodeObject *self)
   6931 {
   6932     return fixup(self, fixlower);
   6933 }
   6934 
   6935 #define LEFTSTRIP 0
   6936 #define RIGHTSTRIP 1
   6937 #define BOTHSTRIP 2
   6938 
   6939 /* Arrays indexed by above */
   6940 static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
   6941 
   6942 #define STRIPNAME(i) (stripformat[i]+3)
   6943 
   6944 /* externally visible for str.strip(unicode) */
   6945 PyObject *
   6946 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
   6947 {
   6948     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
   6949     Py_ssize_t len = PyUnicode_GET_SIZE(self);
   6950     Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
   6951     Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
   6952     Py_ssize_t i, j;
   6953 
   6954     BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
   6955 
   6956     i = 0;
   6957     if (striptype != RIGHTSTRIP) {
   6958         while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
   6959             i++;
   6960         }
   6961     }
   6962 
   6963     j = len;
   6964     if (striptype != LEFTSTRIP) {
   6965         do {
   6966             j--;
   6967         } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
   6968         j++;
   6969     }
   6970 
   6971     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
   6972         Py_INCREF(self);
   6973         return (PyObject*)self;
   6974     }
   6975     else
   6976         return PyUnicode_FromUnicode(s+i, j-i);
   6977 }
   6978 
   6979 
   6980 static PyObject *
   6981 do_strip(PyUnicodeObject *self, int striptype)
   6982 {
   6983     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
   6984     Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
   6985 
   6986     i = 0;
   6987     if (striptype != RIGHTSTRIP) {
   6988         while (i < len && Py_UNICODE_ISSPACE(s[i])) {
   6989             i++;
   6990         }
   6991     }
   6992 
   6993     j = len;
   6994     if (striptype != LEFTSTRIP) {
   6995         do {
   6996             j--;
   6997         } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
   6998         j++;
   6999     }
   7000 
   7001     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
   7002         Py_INCREF(self);
   7003         return (PyObject*)self;
   7004     }
   7005     else
   7006         return PyUnicode_FromUnicode(s+i, j-i);
   7007 }
   7008 
   7009 
   7010 static PyObject *
   7011 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
   7012 {
   7013     PyObject *sep = NULL;
   7014 
   7015     if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
   7016         return NULL;
   7017 
   7018     if (sep != NULL && sep != Py_None) {
   7019         if (PyUnicode_Check(sep))
   7020             return _PyUnicode_XStrip(self, striptype, sep);
   7021         else if (PyString_Check(sep)) {
   7022             PyObject *res;
   7023             sep = PyUnicode_FromObject(sep);
   7024             if (sep==NULL)
   7025                 return NULL;
   7026             res = _PyUnicode_XStrip(self, striptype, sep);
   7027             Py_DECREF(sep);
   7028             return res;
   7029         }
   7030         else {
   7031             PyErr_Format(PyExc_TypeError,
   7032                          "%s arg must be None, unicode or str",
   7033                          STRIPNAME(striptype));
   7034             return NULL;
   7035         }
   7036     }
   7037 
   7038     return do_strip(self, striptype);
   7039 }
   7040 
   7041 
   7042 PyDoc_STRVAR(strip__doc__,
   7043              "S.strip([chars]) -> unicode\n\
   7044 \n\
   7045 Return a copy of the string S with leading and trailing\n\
   7046 whitespace removed.\n\
   7047 If chars is given and not None, remove characters in chars instead.\n\
   7048 If chars is a str, it will be converted to unicode before stripping");
   7049 
   7050 static PyObject *
   7051 unicode_strip(PyUnicodeObject *self, PyObject *args)
   7052 {
   7053     if (PyTuple_GET_SIZE(args) == 0)
   7054         return do_strip(self, BOTHSTRIP); /* Common case */
   7055     else
   7056         return do_argstrip(self, BOTHSTRIP, args);
   7057 }
   7058 
   7059 
   7060 PyDoc_STRVAR(lstrip__doc__,
   7061              "S.lstrip([chars]) -> unicode\n\
   7062 \n\
   7063 Return a copy of the string S with leading whitespace removed.\n\
   7064 If chars is given and not None, remove characters in chars instead.\n\
   7065 If chars is a str, it will be converted to unicode before stripping");
   7066 
   7067 static PyObject *
   7068 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
   7069 {
   7070     if (PyTuple_GET_SIZE(args) == 0)
   7071         return do_strip(self, LEFTSTRIP); /* Common case */
   7072     else
   7073         return do_argstrip(self, LEFTSTRIP, args);
   7074 }
   7075 
   7076 
   7077 PyDoc_STRVAR(rstrip__doc__,
   7078              "S.rstrip([chars]) -> unicode\n\
   7079 \n\
   7080 Return a copy of the string S with trailing whitespace removed.\n\
   7081 If chars is given and not None, remove characters in chars instead.\n\
   7082 If chars is a str, it will be converted to unicode before stripping");
   7083 
   7084 static PyObject *
   7085 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
   7086 {
   7087     if (PyTuple_GET_SIZE(args) == 0)
   7088         return do_strip(self, RIGHTSTRIP); /* Common case */
   7089     else
   7090         return do_argstrip(self, RIGHTSTRIP, args);
   7091 }
   7092 
   7093 
   7094 static PyObject*
   7095 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
   7096 {
   7097     PyUnicodeObject *u;
   7098     Py_UNICODE *p;
   7099     Py_ssize_t nchars;
   7100     size_t nbytes;
   7101 
   7102     if (len < 0)
   7103         len = 0;
   7104 
   7105     if (len == 1 && PyUnicode_CheckExact(str)) {
   7106         /* no repeat, return original string */
   7107         Py_INCREF(str);
   7108         return (PyObject*) str;
   7109     }
   7110 
   7111     /* ensure # of chars needed doesn't overflow int and # of bytes
   7112      * needed doesn't overflow size_t
   7113      */
   7114     nchars = len * str->length;
   7115     if (len && nchars / len != str->length) {
   7116         PyErr_SetString(PyExc_OverflowError,
   7117                         "repeated string is too long");
   7118         return NULL;
   7119     }
   7120     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
   7121     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
   7122         PyErr_SetString(PyExc_OverflowError,
   7123                         "repeated string is too long");
   7124         return NULL;
   7125     }
   7126     u = _PyUnicode_New(nchars);
   7127     if (!u)
   7128         return NULL;
   7129 
   7130     p = u->str;
   7131 
   7132     if (str->length == 1 && len > 0) {
   7133         Py_UNICODE_FILL(p, str->str[0], len);
   7134     } else {
   7135         Py_ssize_t done = 0; /* number of characters copied this far */
   7136         if (done < nchars) {
   7137             Py_UNICODE_COPY(p, str->str, str->length);
   7138             done = str->length;
   7139         }
   7140         while (done < nchars) {
   7141             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
   7142             Py_UNICODE_COPY(p+done, p, n);
   7143             done += n;
   7144         }
   7145     }
   7146 
   7147     return (PyObject*) u;
   7148 }
   7149 
   7150 PyObject *PyUnicode_Replace(PyObject *obj,
   7151                             PyObject *subobj,
   7152                             PyObject *replobj,
   7153                             Py_ssize_t maxcount)
   7154 {
   7155     PyObject *self;
   7156     PyObject *str1;
   7157     PyObject *str2;
   7158     PyObject *result;
   7159 
   7160     self = PyUnicode_FromObject(obj);
   7161     if (self == NULL)
   7162         return NULL;
   7163     str1 = PyUnicode_FromObject(subobj);
   7164     if (str1 == NULL) {
   7165         Py_DECREF(self);
   7166         return NULL;
   7167     }
   7168     str2 = PyUnicode_FromObject(replobj);
   7169     if (str2 == NULL) {
   7170         Py_DECREF(self);
   7171         Py_DECREF(str1);
   7172         return NULL;
   7173     }
   7174     result = replace((PyUnicodeObject *)self,
   7175                      (PyUnicodeObject *)str1,
   7176                      (PyUnicodeObject *)str2,
   7177                      maxcount);
   7178     Py_DECREF(self);
   7179     Py_DECREF(str1);
   7180     Py_DECREF(str2);
   7181     return result;
   7182 }
   7183 
   7184 PyDoc_STRVAR(replace__doc__,
   7185              "S.replace(old, new[, count]) -> unicode\n\
   7186 \n\
   7187 Return a copy of S with all occurrences of substring\n\
   7188 old replaced by new.  If the optional argument count is\n\
   7189 given, only the first count occurrences are replaced.");
   7190 
   7191 static PyObject*
   7192 unicode_replace(PyUnicodeObject *self, PyObject *args)
   7193 {
   7194     PyUnicodeObject *str1;
   7195     PyUnicodeObject *str2;
   7196     Py_ssize_t maxcount = -1;
   7197     PyObject *result;
   7198 
   7199     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
   7200         return NULL;
   7201     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
   7202     if (str1 == NULL)
   7203         return NULL;
   7204     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
   7205     if (str2 == NULL) {
   7206         Py_DECREF(str1);
   7207         return NULL;
   7208     }
   7209 
   7210     result = replace(self, str1, str2, maxcount);
   7211 
   7212     Py_DECREF(str1);
   7213     Py_DECREF(str2);
   7214     return result;
   7215 }
   7216 
   7217 static
   7218 PyObject *unicode_repr(PyObject *unicode)
   7219 {
   7220     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
   7221                                 PyUnicode_GET_SIZE(unicode),
   7222                                 1);
   7223 }
   7224 
   7225 PyDoc_STRVAR(rfind__doc__,
   7226              "S.rfind(sub [,start [,end]]) -> int\n\
   7227 \n\
   7228 Return the highest index in S where substring sub is found,\n\
   7229 such that sub is contained within s[start:end].  Optional\n\
   7230 arguments start and end are interpreted as in slice notation.\n\
   7231 \n\
   7232 Return -1 on failure.");
   7233 
   7234 static PyObject *
   7235 unicode_rfind(PyUnicodeObject *self, PyObject *args)
   7236 {
   7237     PyUnicodeObject *substring;
   7238     Py_ssize_t start;
   7239     Py_ssize_t end;
   7240     Py_ssize_t result;
   7241 
   7242     if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
   7243                                             &start, &end))
   7244         return NULL;
   7245 
   7246     result = stringlib_rfind_slice(
   7247         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
   7248         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
   7249         start, end
   7250         );
   7251 
   7252     Py_DECREF(substring);
   7253 
   7254     return PyInt_FromSsize_t(result);
   7255 }
   7256 
   7257 PyDoc_STRVAR(rindex__doc__,
   7258              "S.rindex(sub [,start [,end]]) -> int\n\
   7259 \n\
   7260 Like S.rfind() but raise ValueError when the substring is not found.");
   7261 
   7262 static PyObject *
   7263 unicode_rindex(PyUnicodeObject *self, PyObject *args)
   7264 {
   7265     PyUnicodeObject *substring;
   7266     Py_ssize_t start;
   7267     Py_ssize_t end;
   7268     Py_ssize_t result;
   7269 
   7270     if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
   7271                                             &start, &end))
   7272         return NULL;
   7273 
   7274     result = stringlib_rfind_slice(
   7275         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
   7276         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
   7277         start, end
   7278         );
   7279 
   7280     Py_DECREF(substring);
   7281 
   7282     if (result < 0) {
   7283         PyErr_SetString(PyExc_ValueError, "substring not found");
   7284         return NULL;
   7285     }
   7286     return PyInt_FromSsize_t(result);
   7287 }
   7288 
   7289 PyDoc_STRVAR(rjust__doc__,
   7290              "S.rjust(width[, fillchar]) -> unicode\n\
   7291 \n\
   7292 Return S right-justified in a Unicode string of length width. Padding is\n\
   7293 done using the specified fill character (default is a space).");
   7294 
   7295 static PyObject *
   7296 unicode_rjust(PyUnicodeObject *self, PyObject *args)
   7297 {
   7298     Py_ssize_t width;
   7299     Py_UNICODE fillchar = ' ';
   7300 
   7301     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
   7302         return NULL;
   7303 
   7304     if (self->length >= width && PyUnicode_CheckExact(self)) {
   7305         Py_INCREF(self);
   7306         return (PyObject*) self;
   7307     }
   7308 
   7309     return (PyObject*) pad(self, width - self->length, 0, fillchar);
   7310 }
   7311 
   7312 static PyObject*
   7313 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
   7314 {
   7315     /* standard clamping */
   7316     if (start < 0)
   7317         start = 0;
   7318     if (end < 0)
   7319         end = 0;
   7320     if (end > self->length)
   7321         end = self->length;
   7322     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
   7323         /* full slice, return original string */
   7324         Py_INCREF(self);
   7325         return (PyObject*) self;
   7326     }
   7327     if (start > end)
   7328         start = end;
   7329     /* copy slice */
   7330     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
   7331                                              end - start);
   7332 }
   7333 
   7334 PyObject *PyUnicode_Split(PyObject *s,
   7335                           PyObject *sep,
   7336                           Py_ssize_t maxsplit)
   7337 {
   7338     PyObject *result;
   7339 
   7340     s = PyUnicode_FromObject(s);
   7341     if (s == NULL)
   7342         return NULL;
   7343     if (sep != NULL) {
   7344         sep = PyUnicode_FromObject(sep);
   7345         if (sep == NULL) {
   7346             Py_DECREF(s);
   7347             return NULL;
   7348         }
   7349     }
   7350 
   7351     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
   7352 
   7353     Py_DECREF(s);
   7354     Py_XDECREF(sep);
   7355     return result;
   7356 }
   7357 
   7358 PyDoc_STRVAR(split__doc__,
   7359              "S.split([sep [,maxsplit]]) -> list of strings\n\
   7360 \n\
   7361 Return a list of the words in S, using sep as the\n\
   7362 delimiter string.  If maxsplit is given, at most maxsplit\n\
   7363 splits are done. If sep is not specified or is None, any\n\
   7364 whitespace string is a separator and empty strings are\n\
   7365 removed from the result.");
   7366 
   7367 static PyObject*
   7368 unicode_split(PyUnicodeObject *self, PyObject *args)
   7369 {
   7370     PyObject *substring = Py_None;
   7371     Py_ssize_t maxcount = -1;
   7372 
   7373     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
   7374         return NULL;
   7375 
   7376     if (substring == Py_None)
   7377         return split(self, NULL, maxcount);
   7378     else if (PyUnicode_Check(substring))
   7379         return split(self, (PyUnicodeObject *)substring, maxcount);
   7380     else
   7381         return PyUnicode_Split((PyObject *)self, substring, maxcount);
   7382 }
   7383 
   7384 PyObject *
   7385 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
   7386 {
   7387     PyObject* str_obj;
   7388     PyObject* sep_obj;
   7389     PyObject* out;
   7390 
   7391     str_obj = PyUnicode_FromObject(str_in);
   7392     if (!str_obj)
   7393         return NULL;
   7394     sep_obj = PyUnicode_FromObject(sep_in);
   7395     if (!sep_obj) {
   7396         Py_DECREF(str_obj);
   7397         return NULL;
   7398     }
   7399 
   7400     out = stringlib_partition(
   7401         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
   7402         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
   7403         );
   7404 
   7405     Py_DECREF(sep_obj);
   7406     Py_DECREF(str_obj);
   7407 
   7408     return out;
   7409 }
   7410 
   7411 
   7412 PyObject *
   7413 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
   7414 {
   7415     PyObject* str_obj;
   7416     PyObject* sep_obj;
   7417     PyObject* out;
   7418 
   7419     str_obj = PyUnicode_FromObject(str_in);
   7420     if (!str_obj)
   7421         return NULL;
   7422     sep_obj = PyUnicode_FromObject(sep_in);
   7423     if (!sep_obj) {
   7424         Py_DECREF(str_obj);
   7425         return NULL;
   7426     }
   7427 
   7428     out = stringlib_rpartition(
   7429         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
   7430         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
   7431         );
   7432 
   7433     Py_DECREF(sep_obj);
   7434     Py_DECREF(str_obj);
   7435 
   7436     return out;
   7437 }
   7438 
   7439 PyDoc_STRVAR(partition__doc__,
   7440              "S.partition(sep) -> (head, sep, tail)\n\
   7441 \n\
   7442 Search for the separator sep in S, and return the part before it,\n\
   7443 the separator itself, and the part after it.  If the separator is not\n\
   7444 found, return S and two empty strings.");
   7445 
   7446 static PyObject*
   7447 unicode_partition(PyUnicodeObject *self, PyObject *separator)
   7448 {
   7449     return PyUnicode_Partition((PyObject *)self, separator);
   7450 }
   7451 
   7452 PyDoc_STRVAR(rpartition__doc__,
   7453              "S.rpartition(sep) -> (head, sep, tail)\n\
   7454 \n\
   7455 Search for the separator sep in S, starting at the end of S, and return\n\
   7456 the part before it, the separator itself, and the part after it.  If the\n\
   7457 separator is not found, return two empty strings and S.");
   7458 
   7459 static PyObject*
   7460 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
   7461 {
   7462     return PyUnicode_RPartition((PyObject *)self, separator);
   7463 }
   7464 
   7465 PyObject *PyUnicode_RSplit(PyObject *s,
   7466                            PyObject *sep,
   7467                            Py_ssize_t maxsplit)
   7468 {
   7469     PyObject *result;
   7470 
   7471     s = PyUnicode_FromObject(s);
   7472     if (s == NULL)
   7473         return NULL;
   7474     if (sep != NULL) {
   7475         sep = PyUnicode_FromObject(sep);
   7476         if (sep == NULL) {
   7477             Py_DECREF(s);
   7478             return NULL;
   7479         }
   7480     }
   7481 
   7482     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
   7483 
   7484     Py_DECREF(s);
   7485     Py_XDECREF(sep);
   7486     return result;
   7487 }
   7488 
   7489 PyDoc_STRVAR(rsplit__doc__,
   7490              "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
   7491 \n\
   7492 Return a list of the words in S, using sep as the\n\
   7493 delimiter string, starting at the end of the string and\n\
   7494 working to the front.  If maxsplit is given, at most maxsplit\n\
   7495 splits are done. If sep is not specified, any whitespace string\n\
   7496 is a separator.");
   7497 
   7498 static PyObject*
   7499 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
   7500 {
   7501     PyObject *substring = Py_None;
   7502     Py_ssize_t maxcount = -1;
   7503 
   7504     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
   7505         return NULL;
   7506 
   7507     if (substring == Py_None)
   7508         return rsplit(self, NULL, maxcount);
   7509     else if (PyUnicode_Check(substring))
   7510         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
   7511     else
   7512         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
   7513 }
   7514 
   7515 PyDoc_STRVAR(splitlines__doc__,
   7516              "S.splitlines([keepends]) -> list of strings\n\
   7517 \n\
   7518 Return a list of the lines in S, breaking at line boundaries.\n\
   7519 Line breaks are not included in the resulting list unless keepends\n\
   7520 is given and true.");
   7521 
   7522 static PyObject*
   7523 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
   7524 {
   7525     int keepends = 0;
   7526 
   7527     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
   7528         return NULL;
   7529 
   7530     return PyUnicode_Splitlines((PyObject *)self, keepends);
   7531 }
   7532 
   7533 static
   7534 PyObject *unicode_str(PyUnicodeObject *self)
   7535 {
   7536     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
   7537 }
   7538 
   7539 PyDoc_STRVAR(swapcase__doc__,
   7540              "S.swapcase() -> unicode\n\
   7541 \n\
   7542 Return a copy of S with uppercase characters converted to lowercase\n\
   7543 and vice versa.");
   7544 
   7545 static PyObject*
   7546 unicode_swapcase(PyUnicodeObject *self)
   7547 {
   7548     return fixup(self, fixswapcase);
   7549 }
   7550 
   7551 PyDoc_STRVAR(translate__doc__,
   7552              "S.translate(table) -> unicode\n\
   7553 \n\
   7554 Return a copy of the string S, where all characters have been mapped\n\
   7555 through the given translation table, which must be a mapping of\n\
   7556 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
   7557 Unmapped characters are left untouched. Characters mapped to None\n\
   7558 are deleted.");
   7559 
   7560 static PyObject*
   7561 unicode_translate(PyUnicodeObject *self, PyObject *table)
   7562 {
   7563     return PyUnicode_TranslateCharmap(self->str,
   7564                                       self->length,
   7565                                       table,
   7566                                       "ignore");
   7567 }
   7568 
   7569 PyDoc_STRVAR(upper__doc__,
   7570              "S.upper() -> unicode\n\
   7571 \n\
   7572 Return a copy of S converted to uppercase.");
   7573 
   7574 static PyObject*
   7575 unicode_upper(PyUnicodeObject *self)
   7576 {
   7577     return fixup(self, fixupper);
   7578 }
   7579 
   7580 PyDoc_STRVAR(zfill__doc__,
   7581              "S.zfill(width) -> unicode\n\
   7582 \n\
   7583 Pad a numeric string S with zeros on the left, to fill a field\n\
   7584 of the specified width. The string S is never truncated.");
   7585 
   7586 static PyObject *
   7587 unicode_zfill(PyUnicodeObject *self, PyObject *args)
   7588 {
   7589     Py_ssize_t fill;
   7590     PyUnicodeObject *u;
   7591 
   7592     Py_ssize_t width;
   7593     if (!PyArg_ParseTuple(args, "n:zfill", &width))
   7594         return NULL;
   7595 
   7596     if (self->length >= width) {
   7597         if (PyUnicode_CheckExact(self)) {
   7598             Py_INCREF(self);
   7599             return (PyObject*) self;
   7600         }
   7601         else
   7602             return PyUnicode_FromUnicode(
   7603                 PyUnicode_AS_UNICODE(self),
   7604                 PyUnicode_GET_SIZE(self)
   7605                 );
   7606     }
   7607 
   7608     fill = width - self->length;
   7609 
   7610     u = pad(self, fill, 0, '0');
   7611 
   7612     if (u == NULL)
   7613         return NULL;
   7614 
   7615     if (u->str[fill] == '+' || u->str[fill] == '-') {
   7616         /* move sign to beginning of string */
   7617         u->str[0] = u->str[fill];
   7618         u->str[fill] = '0';
   7619     }
   7620 
   7621     return (PyObject*) u;
   7622 }
   7623 
   7624 #if 0
   7625 static PyObject*
   7626 free_listsize(PyUnicodeObject *self)
   7627 {
   7628     return PyInt_FromLong(numfree);
   7629 }
   7630 #endif
   7631 
   7632 PyDoc_STRVAR(startswith__doc__,
   7633              "S.startswith(prefix[, start[, end]]) -> bool\n\
   7634 \n\
   7635 Return True if S starts with the specified prefix, False otherwise.\n\
   7636 With optional start, test S beginning at that position.\n\
   7637 With optional end, stop comparing S at that position.\n\
   7638 prefix can also be a tuple of strings to try.");
   7639 
   7640 static PyObject *
   7641 unicode_startswith(PyUnicodeObject *self,
   7642                    PyObject *args)
   7643 {
   7644     PyObject *subobj;
   7645     PyUnicodeObject *substring;
   7646     Py_ssize_t start = 0;
   7647     Py_ssize_t end = PY_SSIZE_T_MAX;
   7648     int result;
   7649 
   7650     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
   7651         return NULL;
   7652     if (PyTuple_Check(subobj)) {
   7653         Py_ssize_t i;
   7654         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
   7655             substring = (PyUnicodeObject *)PyUnicode_FromObject(
   7656                 PyTuple_GET_ITEM(subobj, i));
   7657             if (substring == NULL)
   7658                 return NULL;
   7659             result = tailmatch(self, substring, start, end, -1);
   7660             Py_DECREF(substring);
   7661             if (result) {
   7662                 Py_RETURN_TRUE;
   7663             }
   7664         }
   7665         /* nothing matched */
   7666         Py_RETURN_FALSE;
   7667     }
   7668     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
   7669     if (substring == NULL) {
   7670         if (PyErr_ExceptionMatches(PyExc_TypeError))
   7671             PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
   7672                          "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
   7673         return NULL;
   7674     }
   7675     result = tailmatch(self, substring, start, end, -1);
   7676     Py_DECREF(substring);
   7677     return PyBool_FromLong(result);
   7678 }
   7679 
   7680 
   7681 PyDoc_STRVAR(endswith__doc__,
   7682              "S.endswith(suffix[, start[, end]]) -> bool\n\
   7683 \n\
   7684 Return True if S ends with the specified suffix, False otherwise.\n\
   7685 With optional start, test S beginning at that position.\n\
   7686 With optional end, stop comparing S at that position.\n\
   7687 suffix can also be a tuple of strings to try.");
   7688 
   7689 static PyObject *
   7690 unicode_endswith(PyUnicodeObject *self,
   7691                  PyObject *args)
   7692 {
   7693     PyObject *subobj;
   7694     PyUnicodeObject *substring;
   7695     Py_ssize_t start = 0;
   7696     Py_ssize_t end = PY_SSIZE_T_MAX;
   7697     int result;
   7698 
   7699     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
   7700         return NULL;
   7701     if (PyTuple_Check(subobj)) {
   7702         Py_ssize_t i;
   7703         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
   7704             substring = (PyUnicodeObject *)PyUnicode_FromObject(
   7705                 PyTuple_GET_ITEM(subobj, i));
   7706             if (substring == NULL)
   7707                 return NULL;
   7708             result = tailmatch(self, substring, start, end, +1);
   7709             Py_DECREF(substring);
   7710             if (result) {
   7711                 Py_RETURN_TRUE;
   7712             }
   7713         }
   7714         Py_RETURN_FALSE;
   7715     }
   7716     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
   7717     if (substring == NULL) {
   7718         if (PyErr_ExceptionMatches(PyExc_TypeError))
   7719             PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
   7720                          "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
   7721         return NULL;
   7722     }
   7723     result = tailmatch(self, substring, start, end, +1);
   7724     Py_DECREF(substring);
   7725     return PyBool_FromLong(result);
   7726 }
   7727 
   7728 
   7729 /* Implements do_string_format, which is unicode because of stringlib */
   7730 #include "stringlib/string_format.h"
   7731 
   7732 PyDoc_STRVAR(format__doc__,
   7733              "S.format(*args, **kwargs) -> unicode\n\
   7734 \n\
   7735 Return a formatted version of S, using substitutions from args and kwargs.\n\
   7736 The substitutions are identified by braces ('{' and '}').");
   7737 
   7738 static PyObject *
   7739 unicode__format__(PyObject *self, PyObject *args)
   7740 {
   7741     PyObject *format_spec;
   7742     PyObject *result = NULL;
   7743     PyObject *tmp = NULL;
   7744 
   7745     /* If 2.x, convert format_spec to the same type as value */
   7746     /* This is to allow things like u''.format('') */
   7747     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
   7748         goto done;
   7749     if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
   7750         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
   7751                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
   7752         goto done;
   7753     }
   7754     tmp = PyObject_Unicode(format_spec);
   7755     if (tmp == NULL)
   7756         goto done;
   7757     format_spec = tmp;
   7758 
   7759     result = _PyUnicode_FormatAdvanced(self,
   7760                                        PyUnicode_AS_UNICODE(format_spec),
   7761                                        PyUnicode_GET_SIZE(format_spec));
   7762   done:
   7763     Py_XDECREF(tmp);
   7764     return result;
   7765 }
   7766 
   7767 PyDoc_STRVAR(p_format__doc__,
   7768              "S.__format__(format_spec) -> unicode\n\
   7769 \n\
   7770 Return a formatted version of S as described by format_spec.");
   7771 
   7772 static PyObject *
   7773 unicode__sizeof__(PyUnicodeObject *v)
   7774 {
   7775     return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
   7776                              sizeof(Py_UNICODE) * (v->length + 1));
   7777 }
   7778 
   7779 PyDoc_STRVAR(sizeof__doc__,
   7780              "S.__sizeof__() -> size of S in memory, in bytes\n\
   7781 \n\
   7782 ");
   7783 
   7784 static PyObject *
   7785 unicode_getnewargs(PyUnicodeObject *v)
   7786 {
   7787     return Py_BuildValue("(u#)", v->str, v->length);
   7788 }
   7789 
   7790 
   7791 static PyMethodDef unicode_methods[] = {
   7792 
   7793     /* Order is according to common usage: often used methods should
   7794        appear first, since lookup is done sequentially. */
   7795 
   7796     {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
   7797     {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
   7798     {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
   7799     {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
   7800     {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
   7801     {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
   7802     {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
   7803     {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
   7804     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
   7805     {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
   7806     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
   7807     {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
   7808     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
   7809     {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
   7810     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
   7811     {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
   7812     {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
   7813 /*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
   7814     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
   7815     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
   7816     {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
   7817     {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
   7818     {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
   7819     {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
   7820     {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
   7821     {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
   7822     {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
   7823     {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
   7824     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
   7825     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
   7826     {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
   7827     {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
   7828     {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
   7829     {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
   7830     {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
   7831     {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
   7832     {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
   7833     {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
   7834     {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
   7835     {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
   7836     {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
   7837     {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
   7838     {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
   7839     {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
   7840     {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
   7841 #if 0
   7842     {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
   7843 #endif
   7844 
   7845 #if 0
   7846     /* This one is just used for debugging the implementation. */
   7847     {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
   7848 #endif
   7849 
   7850     {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
   7851     {NULL, NULL}
   7852 };
   7853 
   7854 static PyObject *
   7855 unicode_mod(PyObject *v, PyObject *w)
   7856 {
   7857     if (!PyUnicode_Check(v)) {
   7858         Py_INCREF(Py_NotImplemented);
   7859         return Py_NotImplemented;
   7860     }
   7861     return PyUnicode_Format(v, w);
   7862 }
   7863 
   7864 static PyNumberMethods unicode_as_number = {
   7865     0,              /*nb_add*/
   7866     0,              /*nb_subtract*/
   7867     0,              /*nb_multiply*/
   7868     0,              /*nb_divide*/
   7869     unicode_mod,            /*nb_remainder*/
   7870 };
   7871 
   7872 static PySequenceMethods unicode_as_sequence = {
   7873     (lenfunc) unicode_length,       /* sq_length */
   7874     PyUnicode_Concat,           /* sq_concat */
   7875     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
   7876     (ssizeargfunc) unicode_getitem,     /* sq_item */
   7877     (ssizessizeargfunc) unicode_slice,  /* sq_slice */
   7878     0,                  /* sq_ass_item */
   7879     0,                  /* sq_ass_slice */
   7880     PyUnicode_Contains,         /* sq_contains */
   7881 };
   7882 
   7883 static PyObject*
   7884 unicode_subscript(PyUnicodeObject* self, PyObject* item)
   7885 {
   7886     if (PyIndex_Check(item)) {
   7887         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
   7888         if (i == -1 && PyErr_Occurred())
   7889             return NULL;
   7890         if (i < 0)
   7891             i += PyUnicode_GET_SIZE(self);
   7892         return unicode_getitem(self, i);
   7893     } else if (PySlice_Check(item)) {
   7894         Py_ssize_t start, stop, step, slicelength, cur, i;
   7895         Py_UNICODE* source_buf;
   7896         Py_UNICODE* result_buf;
   7897         PyObject* result;
   7898 
   7899         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
   7900                                  &start, &stop, &step, &slicelength) < 0) {
   7901             return NULL;
   7902         }
   7903 
   7904         if (slicelength <= 0) {
   7905             return PyUnicode_FromUnicode(NULL, 0);
   7906         } else if (start == 0 && step == 1 && slicelength == self->length &&
   7907                    PyUnicode_CheckExact(self)) {
   7908             Py_INCREF(self);
   7909             return (PyObject *)self;
   7910         } else if (step == 1) {
   7911             return PyUnicode_FromUnicode(self->str + start, slicelength);
   7912         } else {
   7913             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
   7914             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
   7915                                                        sizeof(Py_UNICODE));
   7916 
   7917             if (result_buf == NULL)
   7918                 return PyErr_NoMemory();
   7919 
   7920             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
   7921                 result_buf[i] = source_buf[cur];
   7922             }
   7923 
   7924             result = PyUnicode_FromUnicode(result_buf, slicelength);
   7925             PyObject_FREE(result_buf);
   7926             return result;
   7927         }
   7928     } else {
   7929         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
   7930         return NULL;
   7931     }
   7932 }
   7933 
   7934 static PyMappingMethods unicode_as_mapping = {
   7935     (lenfunc)unicode_length,        /* mp_length */
   7936     (binaryfunc)unicode_subscript,  /* mp_subscript */
   7937     (objobjargproc)0,           /* mp_ass_subscript */
   7938 };
   7939 
   7940 static Py_ssize_t
   7941 unicode_buffer_getreadbuf(PyUnicodeObject *self,
   7942                           Py_ssize_t index,
   7943                           const void **ptr)
   7944 {
   7945     if (index != 0) {
   7946         PyErr_SetString(PyExc_SystemError,
   7947                         "accessing non-existent unicode segment");
   7948         return -1;
   7949     }
   7950     *ptr = (void *) self->str;
   7951     return PyUnicode_GET_DATA_SIZE(self);
   7952 }
   7953 
   7954 static Py_ssize_t
   7955 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
   7956                            const void **ptr)
   7957 {
   7958     PyErr_SetString(PyExc_TypeError,
   7959                     "cannot use unicode as modifiable buffer");
   7960     return -1;
   7961 }
   7962 
   7963 static int
   7964 unicode_buffer_getsegcount(PyUnicodeObject *self,
   7965                            Py_ssize_t *lenp)
   7966 {
   7967     if (lenp)
   7968         *lenp = PyUnicode_GET_DATA_SIZE(self);
   7969     return 1;
   7970 }
   7971 
   7972 static Py_ssize_t
   7973 unicode_buffer_getcharbuf(PyUnicodeObject *self,
   7974                           Py_ssize_t index,
   7975                           const void **ptr)
   7976 {
   7977     PyObject *str;
   7978 
   7979     if (index != 0) {
   7980         PyErr_SetString(PyExc_SystemError,
   7981                         "accessing non-existent unicode segment");
   7982         return -1;
   7983     }
   7984     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
   7985     if (str == NULL)
   7986         return -1;
   7987     *ptr = (void *) PyString_AS_STRING(str);
   7988     return PyString_GET_SIZE(str);
   7989 }
   7990 
   7991 /* Helpers for PyUnicode_Format() */
   7992 
   7993 static PyObject *
   7994 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
   7995 {
   7996     Py_ssize_t argidx = *p_argidx;
   7997     if (argidx < arglen) {
   7998         (*p_argidx)++;
   7999         if (arglen < 0)
   8000             return args;
   8001         else
   8002             return PyTuple_GetItem(args, argidx);
   8003     }
   8004     PyErr_SetString(PyExc_TypeError,
   8005                     "not enough arguments for format string");
   8006     return NULL;
   8007 }
   8008 
   8009 #define F_LJUST (1<<0)
   8010 #define F_SIGN  (1<<1)
   8011 #define F_BLANK (1<<2)
   8012 #define F_ALT   (1<<3)
   8013 #define F_ZERO  (1<<4)
   8014 
   8015 static Py_ssize_t
   8016 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
   8017 {
   8018     register Py_ssize_t i;
   8019     Py_ssize_t len = strlen(charbuffer);
   8020     for (i = len - 1; i >= 0; i--)
   8021         buffer[i] = (Py_UNICODE) charbuffer[i];
   8022 
   8023     return len;
   8024 }
   8025 
   8026 static int
   8027 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
   8028 {
   8029     Py_ssize_t result;
   8030 
   8031     PyOS_snprintf((char *)buffer, len, format, x);
   8032     result = strtounicode(buffer, (char *)buffer);
   8033     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
   8034 }
   8035 
   8036 /* XXX To save some code duplication, formatfloat/long/int could have been
   8037    shared with stringobject.c, converting from 8-bit to Unicode after the
   8038    formatting is done. */
   8039 
   8040 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
   8041 
   8042 static PyObject *
   8043 formatfloat(PyObject *v, int flags, int prec, int type)
   8044 {
   8045     char *p;
   8046     PyObject *result;
   8047     double x;
   8048 
   8049     x = PyFloat_AsDouble(v);
   8050     if (x == -1.0 && PyErr_Occurred())
   8051         return NULL;
   8052 
   8053     if (prec < 0)
   8054         prec = 6;
   8055 
   8056     p = PyOS_double_to_string(x, type, prec,
   8057                               (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
   8058     if (p == NULL)
   8059         return NULL;
   8060     result = PyUnicode_FromStringAndSize(p, strlen(p));
   8061     PyMem_Free(p);
   8062     return result;
   8063 }
   8064 
   8065 static PyObject*
   8066 formatlong(PyObject *val, int flags, int prec, int type)
   8067 {
   8068     char *buf;
   8069     int i, len;
   8070     PyObject *str; /* temporary string object. */
   8071     PyUnicodeObject *result;
   8072 
   8073     str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
   8074     if (!str)
   8075         return NULL;
   8076     result = _PyUnicode_New(len);
   8077     if (!result) {
   8078         Py_DECREF(str);
   8079         return NULL;
   8080     }
   8081     for (i = 0; i < len; i++)
   8082         result->str[i] = buf[i];
   8083     result->str[len] = 0;
   8084     Py_DECREF(str);
   8085     return (PyObject*)result;
   8086 }
   8087 
   8088 static int
   8089 formatint(Py_UNICODE *buf,
   8090           size_t buflen,
   8091           int flags,
   8092           int prec,
   8093           int type,
   8094           PyObject *v)
   8095 {
   8096     /* fmt = '%#.' + `prec` + 'l' + `type`
   8097      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
   8098      *                     + 1 + 1
   8099      *                   = 24
   8100      */
   8101     char fmt[64]; /* plenty big enough! */
   8102     char *sign;
   8103     long x;
   8104 
   8105     x = PyInt_AsLong(v);
   8106     if (x == -1 && PyErr_Occurred())
   8107         return -1;
   8108     if (x < 0 && type == 'u') {
   8109         type = 'd';
   8110     }
   8111     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
   8112         sign = "-";
   8113     else
   8114         sign = "";
   8115     if (prec < 0)
   8116         prec = 1;
   8117 
   8118     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
   8119      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
   8120      */
   8121     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
   8122         PyErr_SetString(PyExc_OverflowError,
   8123                         "formatted integer is too long (precision too large?)");
   8124         return -1;
   8125     }
   8126 
   8127     if ((flags & F_ALT) &&
   8128         (type == 'x' || type == 'X')) {
   8129         /* When converting under %#x or %#X, there are a number
   8130          * of issues that cause pain:
   8131          * - when 0 is being converted, the C standard leaves off
   8132          *   the '0x' or '0X', which is inconsistent with other
   8133          *   %#x/%#X conversions and inconsistent with Python's
   8134          *   hex() function
   8135          * - there are platforms that violate the standard and
   8136          *   convert 0 with the '0x' or '0X'
   8137          *   (Metrowerks, Compaq Tru64)
   8138          * - there are platforms that give '0x' when converting
   8139          *   under %#X, but convert 0 in accordance with the
   8140          *   standard (OS/2 EMX)
   8141          *
   8142          * We can achieve the desired consistency by inserting our
   8143          * own '0x' or '0X' prefix, and substituting %x/%X in place
   8144          * of %#x/%#X.
   8145          *
   8146          * Note that this is the same approach as used in
   8147          * formatint() in stringobject.c
   8148          */
   8149         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
   8150                       sign, type, prec, type);
   8151     }
   8152     else {
   8153         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
   8154                       sign, (flags&F_ALT) ? "#" : "",
   8155                       prec, type);
   8156     }
   8157     if (sign[0])
   8158         return longtounicode(buf, buflen, fmt, -x);
   8159     else
   8160         return longtounicode(buf, buflen, fmt, x);
   8161 }
   8162 
   8163 static int
   8164 formatchar(Py_UNICODE *buf,
   8165            size_t buflen,
   8166            PyObject *v)
   8167 {
   8168     PyObject *unistr;
   8169     char *str;
   8170     /* presume that the buffer is at least 2 characters long */
   8171     if (PyUnicode_Check(v)) {
   8172         if (PyUnicode_GET_SIZE(v) != 1)
   8173             goto onError;
   8174         buf[0] = PyUnicode_AS_UNICODE(v)[0];
   8175     }
   8176 
   8177     else if (PyString_Check(v)) {
   8178         if (PyString_GET_SIZE(v) != 1)
   8179             goto onError;
   8180         /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
   8181            with a UnicodeDecodeError if 'char' is not decodable with the
   8182            default encoding (usually ASCII, but it might be something else) */
   8183         str = PyString_AS_STRING(v);
   8184         if ((unsigned char)str[0] > 0x7F) {
   8185             /* the char is not ASCII; try to decode the string using the
   8186                default encoding and return -1 to let the UnicodeDecodeError
   8187                be raised if the string can't be decoded */
   8188             unistr = PyUnicode_Decode(str, 1, NULL, "strict");
   8189             if (unistr == NULL)
   8190                 return -1;
   8191             buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
   8192             Py_DECREF(unistr);
   8193         }
   8194         else
   8195             buf[0] = (Py_UNICODE)str[0];
   8196     }
   8197 
   8198     else {
   8199         /* Integer input truncated to a character */
   8200         long x;
   8201         x = PyInt_AsLong(v);
   8202         if (x == -1 && PyErr_Occurred())
   8203             goto onError;
   8204 #ifdef Py_UNICODE_WIDE
   8205         if (x < 0 || x > 0x10ffff) {
   8206             PyErr_SetString(PyExc_OverflowError,
   8207                             "%c arg not in range(0x110000) "
   8208                             "(wide Python build)");
   8209             return -1;
   8210         }
   8211 #else
   8212         if (x < 0 || x > 0xffff) {
   8213             PyErr_SetString(PyExc_OverflowError,
   8214                             "%c arg not in range(0x10000) "
   8215                             "(narrow Python build)");
   8216             return -1;
   8217         }
   8218 #endif
   8219         buf[0] = (Py_UNICODE) x;
   8220     }
   8221     buf[1] = '\0';
   8222     return 1;
   8223 
   8224   onError:
   8225     PyErr_SetString(PyExc_TypeError,
   8226                     "%c requires int or char");
   8227     return -1;
   8228 }
   8229 
   8230 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   8231 
   8232    FORMATBUFLEN is the length of the buffer in which the ints &
   8233    chars are formatted. XXX This is a magic number. Each formatting
   8234    routine does bounds checking to ensure no overflow, but a better
   8235    solution may be to malloc a buffer of appropriate size for each
   8236    format. For now, the current solution is sufficient.
   8237 */
   8238 #define FORMATBUFLEN (size_t)120
   8239 
   8240 PyObject *PyUnicode_Format(PyObject *format,
   8241                            PyObject *args)
   8242 {
   8243     Py_UNICODE *fmt, *res;
   8244     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
   8245     int args_owned = 0;
   8246     PyUnicodeObject *result = NULL;
   8247     PyObject *dict = NULL;
   8248     PyObject *uformat;
   8249 
   8250     if (format == NULL || args == NULL) {
   8251         PyErr_BadInternalCall();
   8252         return NULL;
   8253     }
   8254     uformat = PyUnicode_FromObject(format);
   8255     if (uformat == NULL)
   8256         return NULL;
   8257     fmt = PyUnicode_AS_UNICODE(uformat);
   8258     fmtcnt = PyUnicode_GET_SIZE(uformat);
   8259 
   8260     reslen = rescnt = fmtcnt + 100;
   8261     result = _PyUnicode_New(reslen);
   8262     if (result == NULL)
   8263         goto onError;
   8264     res = PyUnicode_AS_UNICODE(result);
   8265 
   8266     if (PyTuple_Check(args)) {
   8267         arglen = PyTuple_Size(args);
   8268         argidx = 0;
   8269     }
   8270     else {
   8271         arglen = -1;
   8272         argidx = -2;
   8273     }
   8274     if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
   8275         !PyObject_TypeCheck(args, &PyBaseString_Type))
   8276         dict = args;
   8277 
   8278     while (--fmtcnt >= 0) {
   8279         if (*fmt != '%') {
   8280             if (--rescnt < 0) {
   8281                 rescnt = fmtcnt + 100;
   8282                 reslen += rescnt;
   8283                 if (_PyUnicode_Resize(&result, reslen) < 0)
   8284                     goto onError;
   8285                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
   8286                 --rescnt;
   8287             }
   8288             *res++ = *fmt++;
   8289         }
   8290         else {
   8291             /* Got a format specifier */
   8292             int flags = 0;
   8293             Py_ssize_t width = -1;
   8294             int prec = -1;
   8295             Py_UNICODE c = '\0';
   8296             Py_UNICODE fill;
   8297             int isnumok;
   8298             PyObject *v       = NULL;
   8299             PyObject *temp    = NULL;
   8300             Py_UNICODE *pbuf  = NULL;
   8301             Py_UNICODE sign;
   8302             Py_ssize_t len;
   8303             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
   8304 
   8305             fmt++;
   8306             if (*fmt == '(') {
   8307                 Py_UNICODE *keystart;
   8308                 Py_ssize_t keylen;
   8309                 PyObject *key;
   8310                 int pcount = 1;
   8311 
   8312                 if (dict == NULL) {
   8313                     PyErr_SetString(PyExc_TypeError,
   8314                                     "format requires a mapping");
   8315                     goto onError;
   8316                 }
   8317                 ++fmt;
   8318                 --fmtcnt;
   8319                 keystart = fmt;
   8320                 /* Skip over balanced parentheses */
   8321                 while (pcount > 0 && --fmtcnt >= 0) {
   8322                     if (*fmt == ')')
   8323                         --pcount;
   8324                     else if (*fmt == '(')
   8325                         ++pcount;
   8326                     fmt++;
   8327                 }
   8328                 keylen = fmt - keystart - 1;
   8329                 if (fmtcnt < 0 || pcount > 0) {
   8330                     PyErr_SetString(PyExc_ValueError,
   8331                                     "incomplete format key");
   8332                     goto onError;
   8333                 }
   8334 #if 0
   8335                 /* keys are converted to strings using UTF-8 and
   8336                    then looked up since Python uses strings to hold
   8337                    variables names etc. in its namespaces and we
   8338                    wouldn't want to break common idioms. */
   8339                 key = PyUnicode_EncodeUTF8(keystart,
   8340                                            keylen,
   8341                                            NULL);
   8342 #else
   8343                 key = PyUnicode_FromUnicode(keystart, keylen);
   8344 #endif
   8345                 if (key == NULL)
   8346                     goto onError;
   8347                 if (args_owned) {
   8348                     Py_DECREF(args);
   8349                     args_owned = 0;
   8350                 }
   8351                 args = PyObject_GetItem(dict, key);
   8352                 Py_DECREF(key);
   8353                 if (args == NULL) {
   8354                     goto onError;
   8355                 }
   8356                 args_owned = 1;
   8357                 arglen = -1;
   8358                 argidx = -2;
   8359             }
   8360             while (--fmtcnt >= 0) {
   8361                 switch (c = *fmt++) {
   8362                 case '-': flags |= F_LJUST; continue;
   8363                 case '+': flags |= F_SIGN; continue;
   8364                 case ' ': flags |= F_BLANK; continue;
   8365                 case '#': flags |= F_ALT; continue;
   8366                 case '0': flags |= F_ZERO; continue;
   8367                 }
   8368                 break;
   8369             }
   8370             if (c == '*') {
   8371                 v = getnextarg(args, arglen, &argidx);
   8372                 if (v == NULL)
   8373                     goto onError;
   8374                 if (!PyInt_Check(v)) {
   8375                     PyErr_SetString(PyExc_TypeError,
   8376                                     "* wants int");
   8377                     goto onError;
   8378                 }
   8379                 width = PyInt_AsLong(v);
   8380                 if (width < 0) {
   8381                     flags |= F_LJUST;
   8382                     width = -width;
   8383                 }
   8384                 if (--fmtcnt >= 0)
   8385                     c = *fmt++;
   8386             }
   8387             else if (c >= '0' && c <= '9') {
   8388                 width = c - '0';
   8389                 while (--fmtcnt >= 0) {
   8390                     c = *fmt++;
   8391                     if (c < '0' || c > '9')
   8392                         break;
   8393                     if ((width*10) / 10 != width) {
   8394                         PyErr_SetString(PyExc_ValueError,
   8395                                         "width too big");
   8396                         goto onError;
   8397                     }
   8398                     width = width*10 + (c - '0');
   8399                 }
   8400             }
   8401             if (c == '.') {
   8402                 prec = 0;
   8403                 if (--fmtcnt >= 0)
   8404                     c = *fmt++;
   8405                 if (c == '*') {
   8406                     v = getnextarg(args, arglen, &argidx);
   8407                     if (v == NULL)
   8408                         goto onError;
   8409                     if (!PyInt_Check(v)) {
   8410                         PyErr_SetString(PyExc_TypeError,
   8411                                         "* wants int");
   8412                         goto onError;
   8413                     }
   8414                     prec = PyInt_AsLong(v);
   8415                     if (prec < 0)
   8416                         prec = 0;
   8417                     if (--fmtcnt >= 0)
   8418                         c = *fmt++;
   8419                 }
   8420                 else if (c >= '0' && c <= '9') {
   8421                     prec = c - '0';
   8422                     while (--fmtcnt >= 0) {
   8423                         c = *fmt++;
   8424                         if (c < '0' || c > '9')
   8425                             break;
   8426                         if ((prec*10) / 10 != prec) {
   8427                             PyErr_SetString(PyExc_ValueError,
   8428                                             "prec too big");
   8429                             goto onError;
   8430                         }
   8431                         prec = prec*10 + (c - '0');
   8432                     }
   8433                 }
   8434             } /* prec */
   8435             if (fmtcnt >= 0) {
   8436                 if (c == 'h' || c == 'l' || c == 'L') {
   8437                     if (--fmtcnt >= 0)
   8438                         c = *fmt++;
   8439                 }
   8440             }
   8441             if (fmtcnt < 0) {
   8442                 PyErr_SetString(PyExc_ValueError,
   8443                                 "incomplete format");
   8444                 goto onError;
   8445             }
   8446             if (c != '%') {
   8447                 v = getnextarg(args, arglen, &argidx);
   8448                 if (v == NULL)
   8449                     goto onError;
   8450             }
   8451             sign = 0;
   8452             fill = ' ';
   8453             switch (c) {
   8454 
   8455             case '%':
   8456                 pbuf = formatbuf;
   8457                 /* presume that buffer length is at least 1 */
   8458                 pbuf[0] = '%';
   8459                 len = 1;
   8460                 break;
   8461 
   8462             case 's':
   8463             case 'r':
   8464                 if (PyUnicode_CheckExact(v) && c == 's') {
   8465                     temp = v;
   8466                     Py_INCREF(temp);
   8467                 }
   8468                 else {
   8469                     PyObject *unicode;
   8470                     if (c == 's')
   8471                         temp = PyObject_Unicode(v);
   8472                     else
   8473                         temp = PyObject_Repr(v);
   8474                     if (temp == NULL)
   8475                         goto onError;
   8476                     if (PyUnicode_Check(temp))
   8477                         /* nothing to do */;
   8478                     else if (PyString_Check(temp)) {
   8479                         /* convert to string to Unicode */
   8480                         unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
   8481                                                    PyString_GET_SIZE(temp),
   8482                                                    NULL,
   8483                                                    "strict");
   8484                         Py_DECREF(temp);
   8485                         temp = unicode;
   8486                         if (temp == NULL)
   8487                             goto onError;
   8488                     }
   8489                     else {
   8490                         Py_DECREF(temp);
   8491                         PyErr_SetString(PyExc_TypeError,
   8492                                         "%s argument has non-string str()");
   8493                         goto onError;
   8494                     }
   8495                 }
   8496                 pbuf = PyUnicode_AS_UNICODE(temp);
   8497                 len = PyUnicode_GET_SIZE(temp);
   8498                 if (prec >= 0 && len > prec)
   8499                     len = prec;
   8500                 break;
   8501 
   8502             case 'i':
   8503             case 'd':
   8504             case 'u':
   8505             case 'o':
   8506             case 'x':
   8507             case 'X':
   8508                 if (c == 'i')
   8509                     c = 'd';
   8510                 isnumok = 0;
   8511                 if (PyNumber_Check(v)) {
   8512                     PyObject *iobj=NULL;
   8513 
   8514                     if (PyInt_Check(v) || (PyLong_Check(v))) {
   8515                         iobj = v;
   8516                         Py_INCREF(iobj);
   8517                     }
   8518                     else {
   8519                         iobj = PyNumber_Int(v);
   8520                         if (iobj==NULL) iobj = PyNumber_Long(v);
   8521                     }
   8522                     if (iobj!=NULL) {
   8523                         if (PyInt_Check(iobj)) {
   8524                             isnumok = 1;
   8525                             pbuf = formatbuf;
   8526                             len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
   8527                                             flags, prec, c, iobj);
   8528                             Py_DECREF(iobj);
   8529                             if (len < 0)
   8530                                 goto onError;
   8531                             sign = 1;
   8532                         }
   8533                         else if (PyLong_Check(iobj)) {
   8534                             isnumok = 1;
   8535                             temp = formatlong(iobj, flags, prec, c);
   8536                             Py_DECREF(iobj);
   8537                             if (!temp)
   8538                                 goto onError;
   8539                             pbuf = PyUnicode_AS_UNICODE(temp);
   8540                             len = PyUnicode_GET_SIZE(temp);
   8541                             sign = 1;
   8542                         }
   8543                         else {
   8544                             Py_DECREF(iobj);
   8545                         }
   8546                     }
   8547                 }
   8548                 if (!isnumok) {
   8549                     PyErr_Format(PyExc_TypeError,
   8550                                  "%%%c format: a number is required, "
   8551                                  "not %.200s", (char)c, Py_TYPE(v)->tp_name);
   8552                     goto onError;
   8553                 }
   8554                 if (flags & F_ZERO)
   8555                     fill = '0';
   8556                 break;
   8557 
   8558             case 'e':
   8559             case 'E':
   8560             case 'f':
   8561             case 'F':
   8562             case 'g':
   8563             case 'G':
   8564                 temp = formatfloat(v, flags, prec, c);
   8565                 if (temp == NULL)
   8566                     goto onError;
   8567                 pbuf = PyUnicode_AS_UNICODE(temp);
   8568                 len = PyUnicode_GET_SIZE(temp);
   8569                 sign = 1;
   8570                 if (flags & F_ZERO)
   8571                     fill = '0';
   8572                 break;
   8573 
   8574             case 'c':
   8575                 pbuf = formatbuf;
   8576                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
   8577                 if (len < 0)
   8578                     goto onError;
   8579                 break;
   8580 
   8581             default:
   8582                 PyErr_Format(PyExc_ValueError,
   8583                              "unsupported format character '%c' (0x%x) "
   8584                              "at index %zd",
   8585                              (31<=c && c<=126) ? (char)c : '?',
   8586                              (int)c,
   8587                              (Py_ssize_t)(fmt - 1 -
   8588                                           PyUnicode_AS_UNICODE(uformat)));
   8589                 goto onError;
   8590             }
   8591             if (sign) {
   8592                 if (*pbuf == '-' || *pbuf == '+') {
   8593                     sign = *pbuf++;
   8594                     len--;
   8595                 }
   8596                 else if (flags & F_SIGN)
   8597                     sign = '+';
   8598                 else if (flags & F_BLANK)
   8599                     sign = ' ';
   8600                 else
   8601                     sign = 0;
   8602             }
   8603             if (width < len)
   8604                 width = len;
   8605             if (rescnt - (sign != 0) < width) {
   8606                 reslen -= rescnt;
   8607                 rescnt = width + fmtcnt + 100;
   8608                 reslen += rescnt;
   8609                 if (reslen < 0) {
   8610                     Py_XDECREF(temp);
   8611                     PyErr_NoMemory();
   8612                     goto onError;
   8613                 }
   8614                 if (_PyUnicode_Resize(&result, reslen) < 0) {
   8615                     Py_XDECREF(temp);
   8616                     goto onError;
   8617                 }
   8618                 res = PyUnicode_AS_UNICODE(result)
   8619                     + reslen - rescnt;
   8620             }
   8621             if (sign) {
   8622                 if (fill != ' ')
   8623                     *res++ = sign;
   8624                 rescnt--;
   8625                 if (width > len)
   8626                     width--;
   8627             }
   8628             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
   8629                 assert(pbuf[0] == '0');
   8630                 assert(pbuf[1] == c);
   8631                 if (fill != ' ') {
   8632                     *res++ = *pbuf++;
   8633                     *res++ = *pbuf++;
   8634                 }
   8635                 rescnt -= 2;
   8636                 width -= 2;
   8637                 if (width < 0)
   8638                     width = 0;
   8639                 len -= 2;
   8640             }
   8641             if (width > len && !(flags & F_LJUST)) {
   8642                 do {
   8643                     --rescnt;
   8644                     *res++ = fill;
   8645                 } while (--width > len);
   8646             }
   8647             if (fill == ' ') {
   8648                 if (sign)
   8649                     *res++ = sign;
   8650                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
   8651                     assert(pbuf[0] == '0');
   8652                     assert(pbuf[1] == c);
   8653                     *res++ = *pbuf++;
   8654                     *res++ = *pbuf++;
   8655                 }
   8656             }
   8657             Py_UNICODE_COPY(res, pbuf, len);
   8658             res += len;
   8659             rescnt -= len;
   8660             while (--width >= len) {
   8661                 --rescnt;
   8662                 *res++ = ' ';
   8663             }
   8664             if (dict && (argidx < arglen) && c != '%') {
   8665                 PyErr_SetString(PyExc_TypeError,
   8666                                 "not all arguments converted during string formatting");
   8667                 Py_XDECREF(temp);
   8668                 goto onError;
   8669             }
   8670             Py_XDECREF(temp);
   8671         } /* '%' */
   8672     } /* until end */
   8673     if (argidx < arglen && !dict) {
   8674         PyErr_SetString(PyExc_TypeError,
   8675                         "not all arguments converted during string formatting");
   8676         goto onError;
   8677     }
   8678 
   8679     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
   8680         goto onError;
   8681     if (args_owned) {
   8682         Py_DECREF(args);
   8683     }
   8684     Py_DECREF(uformat);
   8685     return (PyObject *)result;
   8686 
   8687   onError:
   8688     Py_XDECREF(result);
   8689     Py_DECREF(uformat);
   8690     if (args_owned) {
   8691         Py_DECREF(args);
   8692     }
   8693     return NULL;
   8694 }
   8695 
   8696 static PyBufferProcs unicode_as_buffer = {
   8697     (readbufferproc) unicode_buffer_getreadbuf,
   8698     (writebufferproc) unicode_buffer_getwritebuf,
   8699     (segcountproc) unicode_buffer_getsegcount,
   8700     (charbufferproc) unicode_buffer_getcharbuf,
   8701 };
   8702 
   8703 static PyObject *
   8704 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
   8705 
   8706 static PyObject *
   8707 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
   8708 {
   8709     PyObject *x = NULL;
   8710     static char *kwlist[] = {"string", "encoding", "errors", 0};
   8711     char *encoding = NULL;
   8712     char *errors = NULL;
   8713 
   8714     if (type != &PyUnicode_Type)
   8715         return unicode_subtype_new(type, args, kwds);
   8716     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
   8717                                      kwlist, &x, &encoding, &errors))
   8718         return NULL;
   8719     if (x == NULL)
   8720         return (PyObject *)_PyUnicode_New(0);
   8721     if (encoding == NULL && errors == NULL)
   8722         return PyObject_Unicode(x);
   8723     else
   8724         return PyUnicode_FromEncodedObject(x, encoding, errors);
   8725 }
   8726 
   8727 static PyObject *
   8728 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
   8729 {
   8730     PyUnicodeObject *tmp, *pnew;
   8731     Py_ssize_t n;
   8732 
   8733     assert(PyType_IsSubtype(type, &PyUnicode_Type));
   8734     tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
   8735     if (tmp == NULL)
   8736         return NULL;
   8737     assert(PyUnicode_Check(tmp));
   8738     pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
   8739     if (pnew == NULL) {
   8740         Py_DECREF(tmp);
   8741         return NULL;
   8742     }
   8743     pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
   8744     if (pnew->str == NULL) {
   8745         _Py_ForgetReference((PyObject *)pnew);
   8746         PyObject_Del(pnew);
   8747         Py_DECREF(tmp);
   8748         return PyErr_NoMemory();
   8749     }
   8750     Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
   8751     pnew->length = n;
   8752     pnew->hash = tmp->hash;
   8753     Py_DECREF(tmp);
   8754     return (PyObject *)pnew;
   8755 }
   8756 
   8757 PyDoc_STRVAR(unicode_doc,
   8758              "unicode(string [, encoding[, errors]]) -> object\n\
   8759 \n\
   8760 Create a new Unicode object from the given encoded string.\n\
   8761 encoding defaults to the current default string encoding.\n\
   8762 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
   8763 
   8764 PyTypeObject PyUnicode_Type = {
   8765     PyVarObject_HEAD_INIT(&PyType_Type, 0)
   8766     "unicode",              /* tp_name */
   8767     sizeof(PyUnicodeObject),        /* tp_size */
   8768     0,                  /* tp_itemsize */
   8769     /* Slots */
   8770     (destructor)unicode_dealloc,    /* tp_dealloc */
   8771     0,                  /* tp_print */
   8772     0,                  /* tp_getattr */
   8773     0,                  /* tp_setattr */
   8774     0,                  /* tp_compare */
   8775     unicode_repr,           /* tp_repr */
   8776     &unicode_as_number,         /* tp_as_number */
   8777     &unicode_as_sequence,       /* tp_as_sequence */
   8778     &unicode_as_mapping,        /* tp_as_mapping */
   8779     (hashfunc) unicode_hash,        /* tp_hash*/
   8780     0,                  /* tp_call*/
   8781     (reprfunc) unicode_str,     /* tp_str */
   8782     PyObject_GenericGetAttr,        /* tp_getattro */
   8783     0,                  /* tp_setattro */
   8784     &unicode_as_buffer,         /* tp_as_buffer */
   8785     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
   8786     Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
   8787     unicode_doc,            /* tp_doc */
   8788     0,                  /* tp_traverse */
   8789     0,                  /* tp_clear */
   8790     PyUnicode_RichCompare,      /* tp_richcompare */
   8791     0,                  /* tp_weaklistoffset */
   8792     0,                  /* tp_iter */
   8793     0,                  /* tp_iternext */
   8794     unicode_methods,            /* tp_methods */
   8795     0,                  /* tp_members */
   8796     0,                  /* tp_getset */
   8797     &PyBaseString_Type,         /* tp_base */
   8798     0,                  /* tp_dict */
   8799     0,                  /* tp_descr_get */
   8800     0,                  /* tp_descr_set */
   8801     0,                  /* tp_dictoffset */
   8802     0,                  /* tp_init */
   8803     0,                  /* tp_alloc */
   8804     unicode_new,            /* tp_new */
   8805     PyObject_Del,           /* tp_free */
   8806 };
   8807 
   8808 /* Initialize the Unicode implementation */
   8809 
   8810 void _PyUnicode_Init(void)
   8811 {
   8812     int i;
   8813 
   8814     /* XXX - move this array to unicodectype.c ? */
   8815     Py_UNICODE linebreak[] = {
   8816         0x000A, /* LINE FEED */
   8817         0x000D, /* CARRIAGE RETURN */
   8818         0x001C, /* FILE SEPARATOR */
   8819         0x001D, /* GROUP SEPARATOR */
   8820         0x001E, /* RECORD SEPARATOR */
   8821         0x0085, /* NEXT LINE */
   8822         0x2028, /* LINE SEPARATOR */
   8823         0x2029, /* PARAGRAPH SEPARATOR */
   8824     };
   8825 
   8826     /* Init the implementation */
   8827     free_list = NULL;
   8828     numfree = 0;
   8829     unicode_empty = _PyUnicode_New(0);
   8830     if (!unicode_empty)
   8831         return;
   8832 
   8833     strcpy(unicode_default_encoding, "ascii");
   8834     for (i = 0; i < 256; i++)
   8835         unicode_latin1[i] = NULL;
   8836     if (PyType_Ready(&PyUnicode_Type) < 0)
   8837         Py_FatalError("Can't initialize 'unicode'");
   8838 
   8839     /* initialize the linebreak bloom filter */
   8840     bloom_linebreak = make_bloom_mask(
   8841         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
   8842         );
   8843 
   8844     PyType_Ready(&EncodingMapType);
   8845 }
   8846 
   8847 /* Finalize the Unicode implementation */
   8848 
   8849 int
   8850 PyUnicode_ClearFreeList(void)
   8851 {
   8852     int freelist_size = numfree;
   8853     PyUnicodeObject *u;
   8854 
   8855     for (u = free_list; u != NULL;) {
   8856         PyUnicodeObject *v = u;
   8857         u = *(PyUnicodeObject **)u;
   8858         if (v->str)
   8859             PyObject_DEL(v->str);
   8860         Py_XDECREF(v->defenc);
   8861         PyObject_Del(v);
   8862         numfree--;
   8863     }
   8864     free_list = NULL;
   8865     assert(numfree == 0);
   8866     return freelist_size;
   8867 }
   8868 
   8869 void
   8870 _PyUnicode_Fini(void)
   8871 {
   8872     int i;
   8873 
   8874     Py_XDECREF(unicode_empty);
   8875     unicode_empty = NULL;
   8876 
   8877     for (i = 0; i < 256; i++) {
   8878         if (unicode_latin1[i]) {
   8879             Py_DECREF(unicode_latin1[i]);
   8880             unicode_latin1[i] = NULL;
   8881         }
   8882     }
   8883     (void)PyUnicode_ClearFreeList();
   8884 }
   8885 
   8886 #ifdef __cplusplus
   8887 }
   8888 #endif
   8889