Home | History | Annotate | Download | only in python2.7
      1 #ifndef Py_UNICODEOBJECT_H
      2 #define Py_UNICODEOBJECT_H
      3 
      4 #include <stdarg.h>
      5 
      6 /*
      7 
      8 Unicode implementation based on original code by Fredrik Lundh,
      9 modified by Marc-Andre Lemburg (mal (at) lemburg.com) according to the
     10 Unicode Integration Proposal (see file Misc/unicode.txt).
     11 
     12 Copyright (c) Corporation for National Research Initiatives.
     13 
     14 
     15  Original header:
     16  --------------------------------------------------------------------
     17 
     18  * Yet another Unicode string type for Python.  This type supports the
     19  * 16-bit Basic Multilingual Plane (BMP) only.
     20  *
     21  * Written by Fredrik Lundh, January 1999.
     22  *
     23  * Copyright (c) 1999 by Secret Labs AB.
     24  * Copyright (c) 1999 by Fredrik Lundh.
     25  *
     26  * fredrik (at) pythonware.com
     27  * http://www.pythonware.com
     28  *
     29  * --------------------------------------------------------------------
     30  * This Unicode String Type is
     31  *
     32  * Copyright (c) 1999 by Secret Labs AB
     33  * Copyright (c) 1999 by Fredrik Lundh
     34  *
     35  * By obtaining, using, and/or copying this software and/or its
     36  * associated documentation, you agree that you have read, understood,
     37  * and will comply with the following terms and conditions:
     38  *
     39  * Permission to use, copy, modify, and distribute this software and its
     40  * associated documentation for any purpose and without fee is hereby
     41  * granted, provided that the above copyright notice appears in all
     42  * copies, and that both that copyright notice and this permission notice
     43  * appear in supporting documentation, and that the name of Secret Labs
     44  * AB or the author not be used in advertising or publicity pertaining to
     45  * distribution of the software without specific, written prior
     46  * permission.
     47  *
     48  * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
     49  * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
     50  * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
     51  * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     52  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     53  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
     54  * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     55  * -------------------------------------------------------------------- */
     56 
     57 #include <ctype.h>
     58 
     59 /* === Internal API ======================================================= */
     60 
     61 /* --- Internal Unicode Format -------------------------------------------- */
     62 
     63 #ifndef Py_USING_UNICODE
     64 
     65 #define PyUnicode_Check(op)                 0
     66 #define PyUnicode_CheckExact(op)            0
     67 
     68 #else
     69 
     70 /* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
     71    properly set, but the default rules below don't set it.  I'll
     72    sort this out some other day -- fredrik (at) pythonware.com */
     73 
     74 #ifndef Py_UNICODE_SIZE
     75 #error Must define Py_UNICODE_SIZE
     76 #endif
     77 
     78 /* Setting Py_UNICODE_WIDE enables UCS-4 storage.  Otherwise, Unicode
     79    strings are stored as UCS-2 (with limited support for UTF-16) */
     80 
     81 #if Py_UNICODE_SIZE >= 4
     82 #define Py_UNICODE_WIDE
     83 #endif
     84 
     85 /* Set these flags if the platform has "wchar.h", "wctype.h" and the
     86    wchar_t type is a 16-bit unsigned type */
     87 /* #define HAVE_WCHAR_H */
     88 /* #define HAVE_USABLE_WCHAR_T */
     89 
     90 /* Defaults for various platforms */
     91 #ifndef PY_UNICODE_TYPE
     92 
     93 /* Windows has a usable wchar_t type (unless we're using UCS-4) */
     94 # if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
     95 #  define HAVE_USABLE_WCHAR_T
     96 #  define PY_UNICODE_TYPE wchar_t
     97 # endif
     98 
     99 # if defined(Py_UNICODE_WIDE)
    100 #  define PY_UNICODE_TYPE Py_UCS4
    101 # endif
    102 
    103 #endif
    104 
    105 /* If the compiler provides a wchar_t type we try to support it
    106    through the interface functions PyUnicode_FromWideChar() and
    107    PyUnicode_AsWideChar(). */
    108 
    109 #ifdef HAVE_USABLE_WCHAR_T
    110 # ifndef HAVE_WCHAR_H
    111 #  define HAVE_WCHAR_H
    112 # endif
    113 #endif
    114 
    115 #ifdef HAVE_WCHAR_H
    116 /* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
    117 # ifdef _HAVE_BSDI
    118 #  include <time.h>
    119 # endif
    120 #  include <wchar.h>
    121 #endif
    122 
    123 /*
    124  * Use this typedef when you need to represent a UTF-16 surrogate pair
    125  * as a single unsigned integer.
    126  */
    127 #if SIZEOF_INT >= 4
    128 typedef unsigned int Py_UCS4;
    129 #elif SIZEOF_LONG >= 4
    130 typedef unsigned long Py_UCS4;
    131 #endif
    132 
    133 /* Py_UNICODE is the native Unicode storage format (code unit) used by
    134    Python and represents a single Unicode element in the Unicode
    135    type. */
    136 
    137 typedef PY_UNICODE_TYPE Py_UNICODE;
    138 
    139 /* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */
    140 
    141 /* Unicode API names are mangled to ensure that UCS-2 and UCS-4 builds
    142    produce different external names and thus cause import errors when
    143    Python interpreters and extensions compiled with different Unicode
    144    width assumptions are combined. */
    145 
    146 #ifndef Py_UNICODE_WIDE
    147 
    148 # define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
    149 # define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
    150 # define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
    151 # define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
    152 # define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
    153 # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
    154 # define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
    155 # define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
    156 # define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
    157 # define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
    158 # define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString
    159 # define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar
    160 # define PyUnicode_ClearFreeList PyUnicodeUCS2_ClearFreelist
    161 # define PyUnicode_Compare PyUnicodeUCS2_Compare
    162 # define PyUnicode_Concat PyUnicodeUCS2_Concat
    163 # define PyUnicode_Contains PyUnicodeUCS2_Contains
    164 # define PyUnicode_Count PyUnicodeUCS2_Count
    165 # define PyUnicode_Decode PyUnicodeUCS2_Decode
    166 # define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII
    167 # define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
    168 # define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
    169 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
    170 # define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
    171 # define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
    172 # define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
    173 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
    174 # define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
    175 # define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful
    176 # define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape
    177 # define PyUnicode_Encode PyUnicodeUCS2_Encode
    178 # define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII
    179 # define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap
    180 # define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
    181 # define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
    182 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
    183 # define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
    184 # define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
    185 # define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
    186 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
    187 # define PyUnicode_Find PyUnicodeUCS2_Find
    188 # define PyUnicode_Format PyUnicodeUCS2_Format
    189 # define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject
    190 # define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat
    191 # define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV
    192 # define PyUnicode_FromObject PyUnicodeUCS2_FromObject
    193 # define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal
    194 # define PyUnicode_FromString PyUnicodeUCS2_FromString
    195 # define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize
    196 # define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
    197 # define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
    198 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
    199 # define PyUnicode_GetMax PyUnicodeUCS2_GetMax
    200 # define PyUnicode_GetSize PyUnicodeUCS2_GetSize
    201 # define PyUnicode_Join PyUnicodeUCS2_Join
    202 # define PyUnicode_Partition PyUnicodeUCS2_Partition
    203 # define PyUnicode_RPartition PyUnicodeUCS2_RPartition
    204 # define PyUnicode_RSplit PyUnicodeUCS2_RSplit
    205 # define PyUnicode_Replace PyUnicodeUCS2_Replace
    206 # define PyUnicode_Resize PyUnicodeUCS2_Resize
    207 # define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare
    208 # define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
    209 # define PyUnicode_Split PyUnicodeUCS2_Split
    210 # define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
    211 # define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
    212 # define PyUnicode_Translate PyUnicodeUCS2_Translate
    213 # define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap
    214 # define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString
    215 # define _PyUnicode_Fini _PyUnicodeUCS2_Fini
    216 # define _PyUnicode_Init _PyUnicodeUCS2_Init
    217 # define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha
    218 # define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit
    219 # define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit
    220 # define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
    221 # define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
    222 # define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
    223 # define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
    224 # define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
    225 # define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
    226 # define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
    227 # define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit
    228 # define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase
    229 # define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric
    230 # define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase
    231 # define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase
    232 
    233 #else
    234 
    235 # define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
    236 # define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
    237 # define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
    238 # define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
    239 # define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
    240 # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
    241 # define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
    242 # define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
    243 # define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
    244 # define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
    245 # define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString
    246 # define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar
    247 # define PyUnicode_ClearFreeList PyUnicodeUCS4_ClearFreelist
    248 # define PyUnicode_Compare PyUnicodeUCS4_Compare
    249 # define PyUnicode_Concat PyUnicodeUCS4_Concat
    250 # define PyUnicode_Contains PyUnicodeUCS4_Contains
    251 # define PyUnicode_Count PyUnicodeUCS4_Count
    252 # define PyUnicode_Decode PyUnicodeUCS4_Decode
    253 # define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII
    254 # define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
    255 # define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
    256 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
    257 # define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
    258 # define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
    259 # define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
    260 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
    261 # define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
    262 # define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful
    263 # define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape
    264 # define PyUnicode_Encode PyUnicodeUCS4_Encode
    265 # define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII
    266 # define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap
    267 # define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
    268 # define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
    269 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
    270 # define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
    271 # define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
    272 # define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
    273 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
    274 # define PyUnicode_Find PyUnicodeUCS4_Find
    275 # define PyUnicode_Format PyUnicodeUCS4_Format
    276 # define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject
    277 # define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat
    278 # define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV
    279 # define PyUnicode_FromObject PyUnicodeUCS4_FromObject
    280 # define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal
    281 # define PyUnicode_FromString PyUnicodeUCS4_FromString
    282 # define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize
    283 # define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
    284 # define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
    285 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
    286 # define PyUnicode_GetMax PyUnicodeUCS4_GetMax
    287 # define PyUnicode_GetSize PyUnicodeUCS4_GetSize
    288 # define PyUnicode_Join PyUnicodeUCS4_Join
    289 # define PyUnicode_Partition PyUnicodeUCS4_Partition
    290 # define PyUnicode_RPartition PyUnicodeUCS4_RPartition
    291 # define PyUnicode_RSplit PyUnicodeUCS4_RSplit
    292 # define PyUnicode_Replace PyUnicodeUCS4_Replace
    293 # define PyUnicode_Resize PyUnicodeUCS4_Resize
    294 # define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare
    295 # define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding
    296 # define PyUnicode_Split PyUnicodeUCS4_Split
    297 # define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines
    298 # define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch
    299 # define PyUnicode_Translate PyUnicodeUCS4_Translate
    300 # define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap
    301 # define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString
    302 # define _PyUnicode_Fini _PyUnicodeUCS4_Fini
    303 # define _PyUnicode_Init _PyUnicodeUCS4_Init
    304 # define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha
    305 # define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit
    306 # define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit
    307 # define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
    308 # define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
    309 # define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
    310 # define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
    311 # define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
    312 # define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
    313 # define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
    314 # define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit
    315 # define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase
    316 # define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric
    317 # define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase
    318 # define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase
    319 
    320 
    321 #endif
    322 
    323 /* --- Internal Unicode Operations ---------------------------------------- */
    324 
    325 /* If you want Python to use the compiler's wctype.h functions instead
    326    of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
    327    configure Python using --with-wctype-functions.  This reduces the
    328    interpreter's code size. */
    329 
    330 #if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
    331 
    332 #include <wctype.h>
    333 
    334 #define Py_UNICODE_ISSPACE(ch) iswspace(ch)
    335 
    336 #define Py_UNICODE_ISLOWER(ch) iswlower(ch)
    337 #define Py_UNICODE_ISUPPER(ch) iswupper(ch)
    338 #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
    339 #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
    340 
    341 #define Py_UNICODE_TOLOWER(ch) towlower(ch)
    342 #define Py_UNICODE_TOUPPER(ch) towupper(ch)
    343 #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
    344 
    345 #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
    346 #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
    347 #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
    348 
    349 #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
    350 #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
    351 #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
    352 
    353 #define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
    354 
    355 #else
    356 
    357 /* Since splitting on whitespace is an important use case, and
    358    whitespace in most situations is solely ASCII whitespace, we
    359    optimize for the common case by using a quick look-up table
    360    _Py_ascii_whitespace (see below) with an inlined check.
    361 
    362  */
    363 #define Py_UNICODE_ISSPACE(ch) \
    364     ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
    365 
    366 #define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
    367 #define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
    368 #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
    369 #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
    370 
    371 #define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
    372 #define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
    373 #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
    374 
    375 #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
    376 #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
    377 #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
    378 
    379 #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
    380 #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
    381 #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
    382 
    383 #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
    384 
    385 #endif
    386 /* True if ch is alphabetic, decimal, digit or numeric.  ch appears four times in the expansion, so avoid arguments with side effects. */
    387 #define Py_UNICODE_ISALNUM(ch) \
    388        (Py_UNICODE_ISALPHA(ch) || \
    389     Py_UNICODE_ISDECIMAL(ch) || \
    390     Py_UNICODE_ISDIGIT(ch) || \
    391     Py_UNICODE_ISNUMERIC(ch))
    392 /* Copy length Py_UNICODE elements (an element count, not a byte count) from source to target using Py_MEMCPY. */
    393 #define Py_UNICODE_COPY(target, source, length)                         \
    394     Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
    395 /* Store value into the first length elements of target.  Each argument is evaluated exactly once, in the order target, value, length. */
    396 #define Py_UNICODE_FILL(target, value, length) \
    397     do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
    398     Py_ssize_t n_ = (length); for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    399     } while (0)
    400 
    401 /* Check if substring matches at given offset.  The offset must be
    402    valid, and the substring must not be empty.  The first and last
    403    elements are compared before memcmp() is called on the whole span. */
    404 #define Py_UNICODE_MATCH(string, offset, substring) \
    405     ((*((string)->str + (offset)) == *((substring)->str)) && \
    406     ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \
    407      !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE)))
    408 
    409 #ifdef __cplusplus
    410 extern "C" {
    411 #endif
    412 
    413 /* --- Unicode Type ------------------------------------------------------- */
    414 /* Instance layout of Unicode objects: PyObject_HEAD (defined elsewhere), then length, buffer pointer, cached hash and cached encoded string. */
    415 typedef struct {
    416     PyObject_HEAD
    417     Py_ssize_t length;          /* Length of raw Unicode data in buffer */
    418     Py_UNICODE *str;            /* Raw Unicode buffer */
    419     long hash;                  /* Hash value; -1 if not set */
    420     PyObject *defenc;           /* (Default) Encoded version as Python
    421                                    string, or NULL; this is used for
    422                                    implementing the buffer protocol */
    423 } PyUnicodeObject;
    424 
    425 PyAPI_DATA(PyTypeObject) PyUnicode_Type;
    426 
    427 #define PyUnicode_Check(op) \
    428                  PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
    429 #define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
    430 
    431 /* Fast access macros: op is cast to PyUnicodeObject * without a type check; GET_DATA_SIZE and AS_DATA view the buffer as bytes */
    432 #define PyUnicode_GET_SIZE(op) \
    433     (((PyUnicodeObject *)(op))->length)
    434 #define PyUnicode_GET_DATA_SIZE(op) \
    435     (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
    436 #define PyUnicode_AS_UNICODE(op) \
    437     (((PyUnicodeObject *)(op))->str)
    438 #define PyUnicode_AS_DATA(op) \
    439     ((const char *)((PyUnicodeObject *)(op))->str)
    440 
    441 /* --- Constants ---------------------------------------------------------- */
    442 
    443 /* This Unicode character will be used as replacement character during
    444    decoding if the errors argument is set to "replace". Note: the
    445    Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
    446    Unicode 3.0. */
    447 
    448 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
    449 
    450 /* === Public API ========================================================= */
    451 
    452 /* --- Plain Py_UNICODE --------------------------------------------------- */
    453 
    454 /* Create a Unicode Object from the Py_UNICODE buffer u of the given
    455    size.
    456 
    457    u may be NULL which causes the contents to be undefined. It is the
    458    user's responsibility to fill in the needed data afterwards. Note
    459    that modifying the Unicode object contents after construction is
    460    only allowed if u was set to NULL.
    461 
    462    The buffer is copied into the new object. */
    463 
    464 PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
    465     const Py_UNICODE *u,        /* Unicode buffer */
    466     Py_ssize_t size             /* size of buffer */
    467     );
    468 
    469 /* Similar to PyUnicode_FromUnicode(), but u points to Latin-1 encoded bytes */
    470 PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
    471     const char *u,        /* char buffer */
    472     Py_ssize_t size       /* size of buffer */
    473     );
    474 
    475 /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
    476    Latin-1 encoded bytes */
    477 PyAPI_FUNC(PyObject*) PyUnicode_FromString(
    478     const char *u        /* string */
    479     );
    480 
    481 /* Return a read-only pointer to the Unicode object's internal
    482    Py_UNICODE buffer. */
    483 
    484 PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
    485     PyObject *unicode           /* Unicode object */
    486     );
    487 
    488 /* Get the length of the Unicode object. */
    489 
    490 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
    491     PyObject *unicode           /* Unicode object */
    492     );
    493 
    494 /* Get the maximum ordinal for a Unicode character. */
    495 PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
    496 
    497 /* Resize an already allocated Unicode object to the new size length.
    498 
    499    *unicode is modified to point to the new (resized) object and 0
    500    returned on success.
    501 
    502    This API may only be called by the function which also called the
    503    Unicode constructor. The refcount on the object must be 1. Otherwise,
    504    an error is returned.
    505 
    506    Error handling is implemented as follows: an exception is set, -1
    507    is returned and *unicode left untouched.
    508 
    509 */
    510 
    511 PyAPI_FUNC(int) PyUnicode_Resize(
    512     PyObject **unicode,         /* Pointer to the Unicode object */
    513     Py_ssize_t length           /* New length */
    514     );
    515 
    516 /* Coerce obj to a Unicode object and return a reference with
    517    *incremented* refcount.
    518 
    519    Coercion is done in the following way:
    520 
    521    1. String and other char buffer compatible objects are decoded
    522       under the assumptions that they contain data using the current
    523       default encoding. Decoding is done in "strict" mode.
    524 
    525    2. All other objects (including Unicode objects) raise an
    526       exception.
    527 
    528    The API returns NULL in case of an error. The caller is responsible
    529    for decref'ing the returned objects.
    530 
    531 */
    532 
    533 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
    534     register PyObject *obj,     /* Object */
    535     const char *encoding,       /* encoding */
    536     const char *errors          /* error handling */
    537     );
    538 
    539 /* Coerce obj to a Unicode object and return a reference with
    540    *incremented* refcount.
    541 
    542    Unicode objects are passed back as-is (subclasses are converted to
    543    true Unicode objects), all other objects are delegated to
    544    PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
    545    using the default encoding as basis for decoding the object.
    546 
    547    The API returns NULL in case of an error. The caller is responsible
    548    for decref'ing the returned objects.
    549 
    550 */
    551 
    552 PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
    553     register PyObject *obj      /* Object */
    554     );
    555 
    556 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list);
    557 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...);
    558 
    559 /* Format the object based on the format_spec, as defined in PEP 3101
    560    (Advanced String Formatting). */
    561 PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
    562                                                  Py_UNICODE *format_spec,
    563                                                  Py_ssize_t format_spec_len);
    564 
    565 /* --- wchar_t support for platforms which support it --------------------- */
    566 
    567 #ifdef HAVE_WCHAR_H
    568 
    569 /* Create a Unicode Object from the wchar_t buffer w of the given
    570    size.
    571 
    572    The buffer is copied into the new object. */
    573 
    574 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
    575     register const wchar_t *w,  /* wchar_t buffer */
    576     Py_ssize_t size             /* size of buffer */
    577     );
    578 
    579 /* Copies the Unicode Object contents into the wchar_t buffer w.  At
    580    most size wchar_t characters are copied.
    581 
    582    Note that the resulting wchar_t string may or may not be
    583    0-terminated.  It is the responsibility of the caller to make sure
    584    that the wchar_t string is 0-terminated in case this is required by
    585    the application.
    586 
    587    Returns the number of wchar_t characters copied (excluding a
    588    possibly trailing 0-termination character) or -1 in case of an
    589    error. */
    590 
    591 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
    592     PyUnicodeObject *unicode,   /* Unicode object */
    593     register wchar_t *w,        /* wchar_t buffer */
    594     Py_ssize_t size             /* size of buffer */
    595     );
    596 
    597 #endif
    598 
    599 /* --- Unicode ordinals --------------------------------------------------- */
    600 
    601 /* Create a Unicode Object from the given Unicode code point ordinal.
    602 
    603    The ordinal must be in range(0x10000) on narrow Python builds
    604    (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
    605    raised in case it is not.
    606 
    607 */
    608 
    609 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
    610 
    611 /* --- Free-list management ----------------------------------------------- */
    612 
    613 /* Clear the free list used by the Unicode implementation.
    614 
    615    This can be used to release memory used for objects on the free
    616    list back to the Python memory allocator.
    617 
    618 */
    619 
    620 PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
    621 
    622 /* === Builtin Codecs =====================================================
    623 
    624    Many of these APIs take two arguments, encoding and errors, which
    625    have the same semantics as the corresponding parameters of the
    626    builtin unicode() API.
    627 
    628    Setting encoding to NULL causes the default encoding to be used.
    629 
    630    Error handling is set by errors which may also be set to NULL
    631    meaning to use the default handling defined for the codec. Default
    632    error handling for all builtin codecs is "strict" (ValueErrors are
    633    raised).
    634 
    635    The codecs all use a similar interface. Only deviations from the
    636    generic ones are documented.
    637 
    638 */
    639 
    640 /* --- Manage the default encoding ---------------------------------------- */
    641 
    642 /* Return a Python string holding the default encoded value of the
    643    Unicode object.
    644 
    645    The resulting string is cached in the Unicode object for subsequent
    646    usage by this function. The cached version is needed to implement
    647    the character buffer interface and will live (at least) as long as
    648    the Unicode object itself.
    649 
    650    The refcount of the string is *not* incremented.
    651 
    652    *** Exported for internal use by the interpreter only !!! ***
    653 
    654 */
    655 
    656 PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
    657     PyObject *, const char *);
    658 
    659 /* Returns the currently active default encoding.
    660 
    661    The default encoding is currently implemented as run-time settable
    662    process global.  This may change in future versions of the
    663    interpreter to become a parameter which is managed on a per-thread
    664    basis.
    665 
    666  */
    667 
    668 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
    669 
    670 /* Sets the currently active default encoding.
    671 
    672    Returns 0 on success, -1 in case of an error.
    673 
    674  */
    675 
    676 PyAPI_FUNC(int) PyUnicode_SetDefaultEncoding(
    677     const char *encoding        /* Encoding name in standard form */
    678     );
    679 
    680 /* --- Generic Codecs ----------------------------------------------------- */
    681 
    682 /* Create a Unicode object by decoding the encoded string s of the
    683    given size. */
    684 
    685 PyAPI_FUNC(PyObject*) PyUnicode_Decode(
    686     const char *s,              /* encoded string */
    687     Py_ssize_t size,            /* size of buffer */
    688     const char *encoding,       /* encoding */
    689     const char *errors          /* error handling */
    690     );
    691 
    692 /* Encodes a Py_UNICODE buffer of the given size and returns a
    693    Python string object. */
    694 
    695 PyAPI_FUNC(PyObject*) PyUnicode_Encode(
    696     const Py_UNICODE *s,        /* Unicode char buffer */
    697     Py_ssize_t size,            /* number of Py_UNICODE chars to encode */
    698     const char *encoding,       /* encoding */
    699     const char *errors          /* error handling */
    700     );
    701 
    702 /* Encodes a Unicode object and returns the result as Python
    703    object. */
    704 
    705 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
    706     PyObject *unicode,          /* Unicode object */
    707     const char *encoding,       /* encoding */
    708     const char *errors          /* error handling */
    709     );
    710 
    711 /* Encodes a Unicode object and returns the result as Python string
    712    object. */
    713 
    714 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
    715     PyObject *unicode,          /* Unicode object */
    716     const char *encoding,       /* encoding */
    717     const char *errors          /* error handling */
    718     );
    719 
    720 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
    721     PyObject* string            /* 256 character map */
    722    );
    723 
    724 
    725 /* --- UTF-7 Codecs ------------------------------------------------------- */
    726 
    727 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
    728     const char *string,         /* UTF-7 encoded string */
    729     Py_ssize_t length,          /* size of string */
    730     const char *errors          /* error handling */
    731     );
    732 
    733 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
    734     const char *string,         /* UTF-7 encoded string */
    735     Py_ssize_t length,          /* size of string */
    736     const char *errors,         /* error handling */
    737     Py_ssize_t *consumed        /* bytes consumed */
    738     );
    739 
    740 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
    741     const Py_UNICODE *data,     /* Unicode char buffer */
    742     Py_ssize_t length,                  /* number of Py_UNICODE chars to encode */
    743     int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
    744     int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
    745     const char *errors          /* error handling */
    746     );
    747 
    748 /* --- UTF-8 Codecs ------------------------------------------------------- */
    749 
    750 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
    751     const char *string,         /* UTF-8 encoded string */
    752     Py_ssize_t length,          /* size of string */
    753     const char *errors          /* error handling */
    754     );
    755 
    756 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
    757     const char *string,         /* UTF-8 encoded string */
    758     Py_ssize_t length,          /* size of string */
    759     const char *errors,         /* error handling */
    760     Py_ssize_t *consumed                /* bytes consumed */
    761     );
    762 
    763 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
    764     PyObject *unicode           /* Unicode object */
    765     );
    766 
    767 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
    768     const Py_UNICODE *data,     /* Unicode char buffer */
    769     Py_ssize_t length,                  /* number of Py_UNICODE chars to encode */
    770     const char *errors          /* error handling */
    771     );
    772 
    773 /* --- UTF-32 Codecs ------------------------------------------------------ */
    774 
    775 /* Decodes length bytes from a UTF-32 encoded buffer string and returns
    776    the corresponding Unicode object.
    777 
    778    errors (if non-NULL) defines the error handling. It defaults
    779    to "strict".
    780 
    781    If byteorder is non-NULL, the decoder starts decoding using the
    782    given byte order:
    783 
    784     *byteorder == -1: little endian
    785     *byteorder == 0:  native order
    786     *byteorder == 1:  big endian
    787 
    788    In native mode, the first four bytes of the stream are checked for a
    789    BOM mark. If found, the BOM mark is analysed, the byte order
    790    adjusted and the BOM skipped.  In the other modes, no BOM mark
    791    interpretation is done. After completion, *byteorder is set to the
    792    current byte order at the end of input data.
    793 
    794    If byteorder is NULL, the codec starts in native order mode.
    795 
    796 */
    797 
    798 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
    799     const char *string,         /* UTF-32 encoded string */
    800     Py_ssize_t length,          /* size of string */
    801     const char *errors,         /* error handling */
    802     int *byteorder              /* pointer to byteorder to use
    803                                    0=native;-1=LE,1=BE; updated on
    804                                    exit */
    805     );
    806 
    807 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
    808     const char *string,         /* UTF-32 encoded string */
    809     Py_ssize_t length,          /* size of string */
    810     const char *errors,         /* error handling */
    811     int *byteorder,             /* pointer to byteorder to use
    812                                    0=native;-1=LE,1=BE; updated on
    813                                    exit */
    814     Py_ssize_t *consumed        /* bytes consumed */
    815     );
    816 
    817 /* Returns a Python string using the UTF-32 encoding in native byte
    818    order. The string always starts with a BOM mark.  */
    819 
    820 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
    821     PyObject *unicode           /* Unicode object */
    822     );
    823 
    824 /* Returns a Python string object holding the UTF-32 encoded value of
    825    the Unicode data.
    826 
    827    If byteorder is not 0, output is written according to the following
    828    byte order:
    829 
    830    byteorder == -1: little endian
    831    byteorder == 0:  native byte order (writes a BOM mark)
    832    byteorder == 1:  big endian
    833 
    834    If byteorder is 0, the output string will always start with the
    835    Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
    836    prepended.
    837 
    838 */
    839 
    840 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
    841     const Py_UNICODE *data,     /* Unicode char buffer */
    842     Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
    843     const char *errors,         /* error handling */
    844     int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
    845     );
    846 
    847 /* --- UTF-16 Codecs ------------------------------------------------------ */
    848 
    849 /* Decodes length bytes from a UTF-16 encoded buffer string and returns
    850    the corresponding Unicode object.
    851 
    852    errors (if non-NULL) defines the error handling. It defaults
    853    to "strict".
    854 
    855    If byteorder is non-NULL, the decoder starts decoding using the
    856    given byte order:
    857 
    858     *byteorder == -1: little endian
    859     *byteorder == 0:  native order
    860     *byteorder == 1:  big endian
    861 
    862    In native mode, the first two bytes of the stream are checked for a
    863    BOM mark. If found, the BOM mark is analysed, the byte order
    864    adjusted and the BOM skipped.  In the other modes, no BOM mark
    865    interpretation is done. After completion, *byteorder is set to the
    866    current byte order at the end of input data.
    867 
    868    If byteorder is NULL, the codec starts in native order mode.
    869 
    870 */
    871 
    872 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
    873     const char *string,         /* UTF-16 encoded string */
    874     Py_ssize_t length,          /* size of string */
    875     const char *errors,         /* error handling */
    876     int *byteorder              /* pointer to byteorder to use
    877                                    0=native;-1=LE,1=BE; updated on
    878                                    exit */
    879     );
    880 
    881 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
    882     const char *string,         /* UTF-16 encoded string */
    883     Py_ssize_t length,          /* size of string */
    884     const char *errors,         /* error handling */
    885     int *byteorder,             /* pointer to byteorder to use
    886                                    0=native;-1=LE,1=BE; updated on
    887                                    exit */
    888     Py_ssize_t *consumed                /* bytes consumed */
    889     );
    890 
    891 /* Returns a Python string using the UTF-16 encoding in native byte
    892    order. The string always starts with a BOM mark.  */
    893 
    894 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
    895     PyObject *unicode           /* Unicode object */
    896     );
    897 
    898 /* Returns a Python string object holding the UTF-16 encoded value of
    899    the Unicode data.
    900 
    901    If byteorder is not 0, output is written according to the following
    902    byte order:
    903 
    904    byteorder == -1: little endian
    905    byteorder == 0:  native byte order (writes a BOM mark)
    906    byteorder == 1:  big endian
    907 
    908    If byteorder is 0, the output string will always start with the
    909    Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
    910    prepended.
    911 
    912    Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
    913    UCS-2. This trick makes it possible to add full UTF-16 capabilities
    914    at a later point without compromising the APIs.
    915 
    916 */
    917 
    918 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
    919     const Py_UNICODE *data,     /* Unicode char buffer */
    920     Py_ssize_t length,                  /* number of Py_UNICODE chars to encode */
    921     const char *errors,         /* error handling */
    922     int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
    923     );
    924 
    925 /* --- Unicode-Escape Codecs ---------------------------------------------- */
    926 
    927 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
    928     const char *string,         /* Unicode-Escape encoded string */
    929     Py_ssize_t length,          /* size of string */
    930     const char *errors          /* error handling */
    931     );
    932 
    933 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
    934     PyObject *unicode           /* Unicode object */
    935     );
    936 
    937 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
    938     const Py_UNICODE *data,     /* Unicode char buffer */
    939     Py_ssize_t length                   /* Number of Py_UNICODE chars to encode */
    940     );
    941 
    942 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
    943 
    944 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
    945     const char *string,         /* Raw-Unicode-Escape encoded string */
    946     Py_ssize_t length,          /* size of string */
    947     const char *errors          /* error handling */
    948     );
    949 
    950 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
    951     PyObject *unicode           /* Unicode object */
    952     );
    953 
    954 PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
    955     const Py_UNICODE *data,     /* Unicode char buffer */
    956     Py_ssize_t length                   /* Number of Py_UNICODE chars to encode */
    957     );
    958 
    959 /* --- Unicode Internal Codec ---------------------------------------------
    960 
    961     Only for internal use in _codecsmodule.c */
    962 
    963 PyObject *_PyUnicode_DecodeUnicodeInternal(
    964     const char *string,
    965     Py_ssize_t length,
    966     const char *errors
    967     );
    968 
    969 /* --- Latin-1 Codecs -----------------------------------------------------
    970 
    971    Note: Latin-1 corresponds to the first 256 Unicode ordinals.
    972 
    973 */
    974 
    975 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
    976     const char *string,         /* Latin-1 encoded string */
    977     Py_ssize_t length,          /* size of string */
    978     const char *errors          /* error handling */
    979     );
    980 
    981 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
    982     PyObject *unicode           /* Unicode object */
    983     );
    984 
    985 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
    986     const Py_UNICODE *data,     /* Unicode char buffer */
    987     Py_ssize_t length,                  /* Number of Py_UNICODE chars to encode */
    988     const char *errors          /* error handling */
    989     );
    990 
    991 /* --- ASCII Codecs -------------------------------------------------------
    992 
    993    Only 7-bit ASCII data is accepted. All other codes generate errors.
    994 
    995 */
    996 
    997 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
    998     const char *string,         /* ASCII encoded string */
    999     Py_ssize_t length,          /* size of string */
   1000     const char *errors          /* error handling */
   1001     );
   1002 
   1003 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
   1004     PyObject *unicode           /* Unicode object */
   1005     );
   1006 
   1007 PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
   1008     const Py_UNICODE *data,     /* Unicode char buffer */
   1009     Py_ssize_t length,                  /* Number of Py_UNICODE chars to encode */
   1010     const char *errors          /* error handling */
   1011     );
   1012 
   1013 /* --- Character Map Codecs -----------------------------------------------
   1014 
   1015    This codec uses mappings to encode and decode characters.
   1016 
   1017    Decoding mappings must map single string characters to single
   1018    Unicode characters, integers (which are then interpreted as Unicode
   1019    ordinals) or None (meaning "undefined mapping" and causing an
   1020    error).
   1021 
   1022    Encoding mappings must map single Unicode characters to single
   1023    string characters, integers (which are then interpreted as Latin-1
   1024    ordinals) or None (meaning "undefined mapping" and causing an
   1025    error).
   1026 
   1027    If a character lookup fails with a LookupError, the character is
   1028    copied as-is meaning that its ordinal value will be interpreted as
   1029    Unicode or Latin-1 ordinal resp. Because of this mappings only need
   1030    to contain those mappings which map characters to different code
   1031    points.
   1032 
   1033 */
   1034 
   1035 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
   1036     const char *string,         /* Encoded string */
   1037     Py_ssize_t length,          /* size of string */
   1038     PyObject *mapping,          /* character mapping
   1039                                    (char ordinal -> unicode ordinal) */
   1040     const char *errors          /* error handling */
   1041     );
   1042 
   1043 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
   1044     PyObject *unicode,          /* Unicode object */
   1045     PyObject *mapping           /* character mapping
   1046                                    (unicode ordinal -> char ordinal) */
   1047     );
   1048 
   1049 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
   1050     const Py_UNICODE *data,     /* Unicode char buffer */
   1051     Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
   1052     PyObject *mapping,          /* character mapping
   1053                                    (unicode ordinal -> char ordinal) */
   1054     const char *errors          /* error handling */
   1055     );
   1056 
   1057 /* Translate a Py_UNICODE buffer of the given length by applying a
   1058    character mapping table to it and return the resulting Unicode
   1059    object.
   1060 
   1061    The mapping table must map Unicode ordinal integers to Unicode
   1062    ordinal integers or None (causing deletion of the character).
   1063 
   1064    Mapping tables may be dictionaries or sequences. Unmapped character
   1065    ordinals (ones which cause a LookupError) are left untouched and
   1066    are copied as-is.
   1067 
   1068 */
   1069 
   1070 PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
   1071     const Py_UNICODE *data,     /* Unicode char buffer */
   1072     Py_ssize_t length,                  /* Number of Py_UNICODE chars to translate */
   1073     PyObject *table,            /* Translate table (dictionary or sequence) */
   1074     const char *errors          /* error handling */
   1075     );
   1076 
   1077 #ifdef MS_WIN32
   1078 
   1079 /* --- MBCS codecs for Windows -------------------------------------------- */
   1080 
   1081 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
   1082     const char *string,         /* MBCS encoded string */
   1083     Py_ssize_t length,              /* size of string */
   1084     const char *errors          /* error handling */
   1085     );
   1086 
   1087 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
   1088     const char *string,         /* MBCS encoded string */
   1089     Py_ssize_t length,          /* size of string */
   1090     const char *errors,         /* error handling */
   1091     Py_ssize_t *consumed        /* bytes consumed */
   1092     );
   1093 
   1094 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
   1095     PyObject *unicode           /* Unicode object */
   1096     );
   1097 
   1098 PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
   1099     const Py_UNICODE *data,     /* Unicode char buffer */
   1100     Py_ssize_t length,              /* Number of Py_UNICODE chars to encode */
   1101     const char *errors          /* error handling */
   1102     );
   1103 
   1104 #endif /* MS_WIN32 */
   1105 
   1106 /* --- Decimal Encoder ---------------------------------------------------- */
   1107 
   1108 /* Takes a Unicode string holding a decimal value and writes it into
   1109    an output buffer using standard ASCII digit codes.
   1110 
   1111    The output buffer has to provide at least length+1 bytes of storage
   1112    area. The output string is 0-terminated.
   1113 
   1114    The encoder converts whitespace to ' ', decimal characters to their
   1115    corresponding ASCII digit and all other Latin-1 characters except
   1116    \0 as-is. Characters outside this range (Unicode ordinals 1-255)
   1117    are treated as errors. This includes embedded NUL characters.
   1118 
   1119    Error handling is defined by the errors argument:
   1120 
   1121       NULL or "strict": raise a ValueError
   1122       "ignore": ignore the wrong characters (these are not copied to the
   1123                 output buffer)
   1124       "replace": replaces illegal characters with '?'
   1125 
   1126    Returns 0 on success, -1 on failure.
   1127 
   1128 */
   1129 
   1130 PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
   1131     Py_UNICODE *s,              /* Unicode buffer */
   1132     Py_ssize_t length,                  /* Number of Py_UNICODE chars to encode */
   1133     char *output,               /* Output buffer; must have size >= length+1
   1134                                    (room for the terminating 0 byte) */
   1135     const char *errors          /* error handling: NULL, "strict", "ignore"
   1136                                    or "replace" */
   1137     );
   1136 
   1137 /* --- Methods & Slots ----------------------------------------------------
   1138 
   1139    These are capable of handling Unicode objects and strings on input
   1140    (we refer to them as strings in the descriptions) and return
   1141    Unicode objects or integers as appropriate. */
   1142 
   1143 /* Concat two strings giving a new Unicode string. */
   1144 
   1145 PyAPI_FUNC(PyObject*) PyUnicode_Concat(
   1146     PyObject *left,             /* Left string */
   1147     PyObject *right             /* Right string */
   1148     );
   1149 
   1150 /* Split a string giving a list of Unicode strings.
   1151 
   1152    If sep is NULL, splitting will be done at all whitespace
   1153    substrings. Otherwise, splits occur at the given separator.
   1154 
   1155    At most maxsplit splits will be done. If negative, no limit is set.
   1156 
   1157    Separators are not included in the resulting list.
   1158 
   1159 */
   1160 
   1161 PyAPI_FUNC(PyObject*) PyUnicode_Split(
   1162     PyObject *s,                /* String to split */
   1163     PyObject *sep,              /* String separator */
   1164     Py_ssize_t maxsplit         /* Maxsplit count */
   1165     );
   1166 
   1167 /* Split a string at line breaks giving a list of Unicode strings.
   1168 
   1169    CRLF is considered to be one line break. Line breaks are not
   1170    included in the resulting list unless keepends is true. */
   1171 
   1172 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
   1173     PyObject *s,                /* String to split */
   1174     int keepends                /* If true, line end markers are included */
   1175     );
   1176 
   1177 /* Partition a string using a given separator. */
   1178 
   1179 PyAPI_FUNC(PyObject*) PyUnicode_Partition(
   1180     PyObject *s,                /* String to partition */
   1181     PyObject *sep               /* String separator */
   1182     );
   1183 
   1184 /* Partition a string using a given separator, searching from the end of the
   1185    string. */
   1186 
   1187 PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
   1188     PyObject *s,                /* String to partition */
   1189     PyObject *sep               /* String separator */
   1190     );
   1191 
   1192 /* Split a string giving a list of Unicode strings.
   1193 
   1194    If sep is NULL, splitting will be done at all whitespace
   1195    substrings. Otherwise, splits occur at the given separator.
   1196 
   1197    At most maxsplit splits will be done. But unlike PyUnicode_Split
   1198    PyUnicode_RSplit splits from the end of the string. If negative,
   1199    no limit is set.
   1200 
   1201    Separators are not included in the resulting list.
   1202 
   1203 */
   1204 
   1205 PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
   1206     PyObject *s,                /* String to split */
   1207     PyObject *sep,              /* String separator */
   1208     Py_ssize_t maxsplit         /* Maxsplit count */
   1209     );
   1210 
   1211 /* Translate a string by applying a character mapping table to it and
   1212    return the resulting Unicode object.
   1213 
   1214    The mapping table must map Unicode ordinal integers to Unicode
   1215    ordinal integers or None (causing deletion of the character).
   1216 
   1217    Mapping tables may be dictionaries or sequences. Unmapped character
   1218    ordinals (ones which cause a LookupError) are left untouched and
   1219    are copied as-is.
   1220 
   1221 */
   1222 
   1223 PyAPI_FUNC(PyObject *) PyUnicode_Translate(
   1224     PyObject *str,              /* String */
   1225     PyObject *table,            /* Translate table */
   1226     const char *errors          /* error handling */
   1227     );
   1228 
   1229 /* Join a sequence of strings using the given separator and return
   1230    the resulting Unicode string. */
   1231 
   1232 PyAPI_FUNC(PyObject*) PyUnicode_Join(
   1233     PyObject *separator,        /* Separator string */
   1234     PyObject *seq               /* Sequence object */
   1235     );
   1236 
   1237 /* Return 1 if substr matches str[start:end] at the given tail end, 0
   1238    otherwise. */
   1239 
   1240 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
   1241     PyObject *str,              /* String */
   1242     PyObject *substr,           /* Prefix or Suffix string */
   1243     Py_ssize_t start,           /* Start index */
   1244     Py_ssize_t end,             /* Stop index */
   1245     int direction               /* Tail end: -1 prefix, +1 suffix */
   1246     );
   1247 
   1248 /* Return the first position of substr in str[start:end] using the
   1249    given search direction or -1 if not found. -2 is returned in case
   1250    an error occurred and an exception is set. */
   1251 
   1252 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
   1253     PyObject *str,              /* String */
   1254     PyObject *substr,           /* Substring to find */
   1255     Py_ssize_t start,           /* Start index */
   1256     Py_ssize_t end,             /* Stop index */
   1257     int direction               /* Find direction: +1 forward, -1 backward */
   1258     );
   1259 
   1260 /* Count the number of occurrences of substr in str[start:end]. */
   1261 
   1262 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
   1263     PyObject *str,              /* String */
   1264     PyObject *substr,           /* Substring to count */
   1265     Py_ssize_t start,           /* Start index */
   1266     Py_ssize_t end              /* Stop index */
   1267     );
   1268 
   1269 /* Replace at most maxcount occurrences of substr in str with replstr
   1270    and return the resulting Unicode object. */
   1271 
   1272 PyAPI_FUNC(PyObject *) PyUnicode_Replace(
   1273     PyObject *str,              /* String */
   1274     PyObject *substr,           /* Substring to find */
   1275     PyObject *replstr,          /* Substring to replace */
   1276     Py_ssize_t maxcount         /* Max. number of replacements to apply;
   1277                                    -1 = all */
   1278     );
   1279 
   1280 /* Compare two strings and return -1, 0, 1 for less than, equal,
   1281    greater than resp. */
   1282 
   1283 PyAPI_FUNC(int) PyUnicode_Compare(
   1284     PyObject *left,             /* Left string */
   1285     PyObject *right             /* Right string */
   1286     );
   1287 
   1288 /* Rich compare two strings and return one of the following:
   1289 
   1290    - NULL in case an exception was raised
   1291    - Py_True or Py_False for successful comparisons
   1292    - Py_NotImplemented in case the type combination is unknown
   1293 
   1294    Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
   1295    case the conversion of the arguments to Unicode fails with a
   1296    UnicodeDecodeError.
   1297 
   1298    Possible values for op:
   1299 
   1300      Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
   1301 
   1302 */
   1303 
   1304 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
   1305     PyObject *left,             /* Left string */
   1306     PyObject *right,            /* Right string */
   1307     int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
   1308     );
   1309 
   1310 /* Apply an argument tuple or dictionary to a format string and return
   1311    the resulting Unicode string. */
   1312 
   1313 PyAPI_FUNC(PyObject *) PyUnicode_Format(
   1314     PyObject *format,           /* Format string */
   1315     PyObject *args              /* Argument tuple or dictionary */
   1316     );
   1317 
   1318 /* Checks whether element is contained in container and returns 1/0
   1319    accordingly.
   1320 
   1321    element has to coerce to a one-element Unicode string. -1 is
   1322    returned in case of an error. */
   1323 
   1324 PyAPI_FUNC(int) PyUnicode_Contains(
   1325     PyObject *container,        /* Container string */
   1326     PyObject *element           /* Element string */
   1327     );
   1328 
   1329 /* Externally visible for str.strip(unicode) */
   1330 PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
   1331     PyUnicodeObject *self,
   1332     int striptype,
   1333     PyObject *sepobj
   1334     );
   1335 
   1336 /* === Characters Type APIs =============================================== */
   1337 
   1338 /* Helper array used by Py_UNICODE_ISSPACE(). */
   1339 
   1340 PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
   1341 
   1342 /* These should not be used directly. Use the Py_UNICODE_IS* and
   1343    Py_UNICODE_TO* macros instead.
   1344 
   1345    These APIs are implemented in Objects/unicodectype.c.
   1346 
   1347 */
   1348 
   1349 PyAPI_FUNC(int) _PyUnicode_IsLowercase(
   1350     Py_UNICODE ch       /* Unicode character */
   1351     );
   1352 
   1353 PyAPI_FUNC(int) _PyUnicode_IsUppercase(
   1354     Py_UNICODE ch       /* Unicode character */
   1355     );
   1356 
   1357 PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
   1358     Py_UNICODE ch       /* Unicode character */
   1359     );
   1360 
   1361 PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
   1362     const Py_UNICODE ch         /* Unicode character */
   1363     );
   1364 
   1365 PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
   1366     const Py_UNICODE ch         /* Unicode character */
   1367     );
   1368 
   1369 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase(
   1370     Py_UNICODE ch       /* Unicode character */
   1371     );
   1372 
   1373 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase(
   1374     Py_UNICODE ch       /* Unicode character */
   1375     );
   1376 
   1377 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase(
   1378     Py_UNICODE ch       /* Unicode character */
   1379     );
   1380 
   1381 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
   1382     Py_UNICODE ch       /* Unicode character */
   1383     );
   1384 
   1385 PyAPI_FUNC(int) _PyUnicode_ToDigit(
   1386     Py_UNICODE ch       /* Unicode character */
   1387     );
   1388 
   1389 PyAPI_FUNC(double) _PyUnicode_ToNumeric(
   1390     Py_UNICODE ch       /* Unicode character */
   1391     );
   1392 
   1393 PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
   1394     Py_UNICODE ch       /* Unicode character */
   1395     );
   1396 
   1397 PyAPI_FUNC(int) _PyUnicode_IsDigit(
   1398     Py_UNICODE ch       /* Unicode character */
   1399     );
   1400 
   1401 PyAPI_FUNC(int) _PyUnicode_IsNumeric(
   1402     Py_UNICODE ch       /* Unicode character */
   1403     );
   1404 
   1405 PyAPI_FUNC(int) _PyUnicode_IsAlpha(
   1406     Py_UNICODE ch       /* Unicode character */
   1407     );
   1408 
   1409 #ifdef __cplusplus
   1410 }
   1411 #endif
   1412 #endif /* Py_USING_UNICODE */
   1413 #endif /* !Py_UNICODEOBJECT_H */
   1414