1 /* 2 3 Unicode implementation based on original code by Fredrik Lundh, 4 modified by Marc-Andre Lemburg <mal (at) lemburg.com> according to the 5 Unicode Integration Proposal (see file Misc/unicode.txt). 6 7 Major speed upgrades to the method implementations at the Reykjavik 8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 9 10 Copyright (c) Corporation for National Research Initiatives. 11 12 -------------------------------------------------------------------- 13 The original string type implementation is: 14 15 Copyright (c) 1999 by Secret Labs AB 16 Copyright (c) 1999 by Fredrik Lundh 17 18 By obtaining, using, and/or copying this software and/or its 19 associated documentation, you agree that you have read, understood, 20 and will comply with the following terms and conditions: 21 22 Permission to use, copy, modify, and distribute this software and its 23 associated documentation for any purpose and without fee is hereby 24 granted, provided that the above copyright notice appears in all 25 copies, and that both that copyright notice and this permission notice 26 appear in supporting documentation, and that the name of Secret Labs 27 AB or the author not be used in advertising or publicity pertaining to 28 distribution of the software without specific, written prior 29 permission. 30 31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
38 -------------------------------------------------------------------- 39 40 */ 41 42 #define PY_SSIZE_T_CLEAN 43 #include "Python.h" 44 45 #include "unicodeobject.h" 46 #include "ucnhash.h" 47 48 #ifdef MS_WINDOWS 49 #include <windows.h> 50 #endif 51 52 /* Limit for the Unicode object free list */ 53 54 #define PyUnicode_MAXFREELIST 1024 55 56 /* Limit for the Unicode object free list stay alive optimization. 57 58 The implementation will keep allocated Unicode memory intact for 59 all objects on the free list having a size less than this 60 limit. This reduces malloc() overhead for small Unicode objects. 61 62 At worst this will result in PyUnicode_MAXFREELIST * 63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 64 malloc()-overhead) bytes of unused garbage. 65 66 Setting the limit to 0 effectively turns the feature off. 67 68 Note: This is an experimental feature ! If you get core dumps when 69 using Unicode objects, turn this feature off. 70 71 */ 72 73 #define KEEPALIVE_SIZE_LIMIT 9 74 75 /* Endianness switches; defaults to little endian */ 76 77 #ifdef WORDS_BIGENDIAN 78 # define BYTEORDER_IS_BIG_ENDIAN 79 #else 80 # define BYTEORDER_IS_LITTLE_ENDIAN 81 #endif 82 83 /* --- Globals ------------------------------------------------------------ 84 85 NOTE: In the interpreter's initialization phase, some globals are currently 86 initialized dynamically as needed. In the process Unicode objects may 87 be created before the Unicode type is ready. 88 89 */ 90 91 92 #ifdef __cplusplus 93 extern "C" { 94 #endif 95 96 /* Free list for Unicode objects */ 97 static PyUnicodeObject *free_list = NULL; 98 static int numfree = 0; 99 100 /* The empty Unicode object is shared to improve performance. 
*/ 101 static PyUnicodeObject *unicode_empty = NULL; 102 103 #define _Py_RETURN_UNICODE_EMPTY() \ 104 do { \ 105 if (unicode_empty != NULL) \ 106 Py_INCREF(unicode_empty); \ 107 else { \ 108 unicode_empty = _PyUnicode_New(0); \ 109 if (unicode_empty != NULL) \ 110 Py_INCREF(unicode_empty); \ 111 } \ 112 return (PyObject *)unicode_empty; \ 113 } while (0) 114 115 /* Single character Unicode strings in the Latin-1 range are being 116 shared as well. */ 117 static PyUnicodeObject *unicode_latin1[256] = {NULL}; 118 119 /* Default encoding to use and assume when NULL is passed as encoding 120 parameter; it is initialized by _PyUnicode_Init(). 121 122 Always use the PyUnicode_SetDefaultEncoding() and 123 PyUnicode_GetDefaultEncoding() APIs to access this global. 124 125 */ 126 static char unicode_default_encoding[100 + 1] = "ascii"; 127 128 /* Fast detection of the most frequent whitespace characters */ 129 const unsigned char _Py_ascii_whitespace[] = { 130 0, 0, 0, 0, 0, 0, 0, 0, 131 /* case 0x0009: * CHARACTER TABULATION */ 132 /* case 0x000A: * LINE FEED */ 133 /* case 0x000B: * LINE TABULATION */ 134 /* case 0x000C: * FORM FEED */ 135 /* case 0x000D: * CARRIAGE RETURN */ 136 0, 1, 1, 1, 1, 1, 0, 0, 137 0, 0, 0, 0, 0, 0, 0, 0, 138 /* case 0x001C: * FILE SEPARATOR */ 139 /* case 0x001D: * GROUP SEPARATOR */ 140 /* case 0x001E: * RECORD SEPARATOR */ 141 /* case 0x001F: * UNIT SEPARATOR */ 142 0, 0, 0, 0, 1, 1, 1, 1, 143 /* case 0x0020: * SPACE */ 144 1, 0, 0, 0, 0, 0, 0, 0, 145 0, 0, 0, 0, 0, 0, 0, 0, 146 0, 0, 0, 0, 0, 0, 0, 0, 147 0, 0, 0, 0, 0, 0, 0, 0, 148 149 0, 0, 0, 0, 0, 0, 0, 0, 150 0, 0, 0, 0, 0, 0, 0, 0, 151 0, 0, 0, 0, 0, 0, 0, 0, 152 0, 0, 0, 0, 0, 0, 0, 0, 153 0, 0, 0, 0, 0, 0, 0, 0, 154 0, 0, 0, 0, 0, 0, 0, 0, 155 0, 0, 0, 0, 0, 0, 0, 0, 156 0, 0, 0, 0, 0, 0, 0, 0 157 }; 158 159 /* Same for linebreaks */ 160 static unsigned char ascii_linebreak[] = { 161 0, 0, 0, 0, 0, 0, 0, 0, 162 /* 0x000A, * LINE FEED */ 163 /* 0x000B, * LINE TABULATION */ 164 /* 
0x000C, * FORM FEED */ 165 /* 0x000D, * CARRIAGE RETURN */ 166 0, 0, 1, 1, 1, 1, 0, 0, 167 0, 0, 0, 0, 0, 0, 0, 0, 168 /* 0x001C, * FILE SEPARATOR */ 169 /* 0x001D, * GROUP SEPARATOR */ 170 /* 0x001E, * RECORD SEPARATOR */ 171 0, 0, 0, 0, 1, 1, 1, 0, 172 0, 0, 0, 0, 0, 0, 0, 0, 173 0, 0, 0, 0, 0, 0, 0, 0, 174 0, 0, 0, 0, 0, 0, 0, 0, 175 0, 0, 0, 0, 0, 0, 0, 0, 176 177 0, 0, 0, 0, 0, 0, 0, 0, 178 0, 0, 0, 0, 0, 0, 0, 0, 179 0, 0, 0, 0, 0, 0, 0, 0, 180 0, 0, 0, 0, 0, 0, 0, 0, 181 0, 0, 0, 0, 0, 0, 0, 0, 182 0, 0, 0, 0, 0, 0, 0, 0, 183 0, 0, 0, 0, 0, 0, 0, 0, 184 0, 0, 0, 0, 0, 0, 0, 0 185 }; 186 187 188 Py_UNICODE 189 PyUnicode_GetMax(void) 190 { 191 #ifdef Py_UNICODE_WIDE 192 return 0x10FFFF; 193 #else 194 /* This is actually an illegal character, so it should 195 not be passed to unichr. */ 196 return 0xFFFF; 197 #endif 198 } 199 200 /* --- Bloom Filters ----------------------------------------------------- */ 201 202 /* stuff to implement simple "bloom filters" for Unicode characters. 203 to keep things simple, we use a single bitmask, using the least 5 204 bits from each unicode characters as the bit index. */ 205 206 /* the linebreak mask is set up by Unicode_Init below */ 207 208 #if LONG_BIT >= 128 209 #define BLOOM_WIDTH 128 210 #elif LONG_BIT >= 64 211 #define BLOOM_WIDTH 64 212 #elif LONG_BIT >= 32 213 #define BLOOM_WIDTH 32 214 #else 215 #error "LONG_BIT is smaller than 32" 216 #endif 217 218 #define BLOOM_MASK unsigned long 219 220 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; 221 222 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 223 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 224 225 #define BLOOM_LINEBREAK(ch) \ 226 ((ch) < 128U ? 
ascii_linebreak[(ch)] : \ 227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 228 229 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) 230 { 231 /* calculate simple bloom-style bitmask for a given unicode string */ 232 233 BLOOM_MASK mask; 234 Py_ssize_t i; 235 236 mask = 0; 237 for (i = 0; i < len; i++) 238 BLOOM_ADD(mask, ptr[i]); 239 240 return mask; 241 } 242 243 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) 244 { 245 Py_ssize_t i; 246 247 for (i = 0; i < setlen; i++) 248 if (set[i] == chr) 249 return 1; 250 251 return 0; 252 } 253 254 #define BLOOM_MEMBER(mask, chr, set, setlen) \ 255 BLOOM(mask, chr) && unicode_member(chr, set, setlen) 256 257 /* --- Unicode Object ----------------------------------------------------- */ 258 259 static 260 int unicode_resize(register PyUnicodeObject *unicode, 261 Py_ssize_t length) 262 { 263 void *oldstr; 264 265 /* Shortcut if there's nothing much to do. */ 266 if (unicode->length == length) 267 goto reset; 268 269 /* Resizing shared object (unicode_empty or single character 270 objects) in-place is not allowed. Use PyUnicode_Resize() 271 instead ! */ 272 273 if (unicode == unicode_empty || 274 (unicode->length == 1 && 275 unicode->str[0] < 256U && 276 unicode_latin1[unicode->str[0]] == unicode)) { 277 PyErr_SetString(PyExc_SystemError, 278 "can't resize shared unicode objects"); 279 return -1; 280 } 281 282 /* We allocate one more byte to make sure the string is Ux0000 terminated. 283 The overallocation is also used by fastsearch, which assumes that it's 284 safe to look at str[length] (without making any assumptions about what 285 it contains). 
*/ 286 287 oldstr = unicode->str; 288 unicode->str = PyObject_REALLOC(unicode->str, 289 sizeof(Py_UNICODE) * (length + 1)); 290 if (!unicode->str) { 291 unicode->str = (Py_UNICODE *)oldstr; 292 PyErr_NoMemory(); 293 return -1; 294 } 295 unicode->str[length] = 0; 296 unicode->length = length; 297 298 reset: 299 /* Reset the object caches */ 300 if (unicode->defenc) { 301 Py_CLEAR(unicode->defenc); 302 } 303 unicode->hash = -1; 304 305 return 0; 306 } 307 308 /* We allocate one more byte to make sure the string is 309 Ux0000 terminated; some code relies on that. 310 311 XXX This allocator could further be enhanced by assuring that the 312 free list never reduces its size below 1. 313 314 */ 315 316 static 317 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) 318 { 319 register PyUnicodeObject *unicode; 320 321 /* Optimization for empty strings */ 322 if (length == 0 && unicode_empty != NULL) { 323 Py_INCREF(unicode_empty); 324 return unicode_empty; 325 } 326 327 /* Ensure we won't overflow the size. */ 328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 329 return (PyUnicodeObject *)PyErr_NoMemory(); 330 } 331 332 /* Unicode freelist & memory allocation */ 333 if (free_list) { 334 unicode = free_list; 335 free_list = *(PyUnicodeObject **)unicode; 336 numfree--; 337 if (unicode->str) { 338 /* Keep-Alive optimization: we only upsize the buffer, 339 never downsize it. 
*/ 340 if ((unicode->length < length) && 341 unicode_resize(unicode, length) < 0) { 342 PyObject_DEL(unicode->str); 343 unicode->str = NULL; 344 } 345 } 346 else { 347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 349 } 350 (void)PyObject_INIT(unicode, &PyUnicode_Type); 351 } 352 else { 353 size_t new_size; 354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 355 if (unicode == NULL) 356 return NULL; 357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 359 } 360 361 if (!unicode->str) { 362 PyErr_NoMemory(); 363 goto onError; 364 } 365 /* Initialize the first element to guard against cases where 366 * the caller fails before initializing str -- unicode_resize() 367 * reads str[0], and the Keep-Alive optimization can keep memory 368 * allocated for str alive across a call to unicode_dealloc(unicode). 369 * We don't want unicode_resize to read uninitialized memory in 370 * that case. 
371 */ 372 unicode->str[0] = 0; 373 unicode->str[length] = 0; 374 unicode->length = length; 375 unicode->hash = -1; 376 unicode->defenc = NULL; 377 return unicode; 378 379 onError: 380 /* XXX UNREF/NEWREF interface should be more symmetrical */ 381 _Py_DEC_REFTOTAL; 382 _Py_ForgetReference((PyObject *)unicode); 383 PyObject_Del(unicode); 384 return NULL; 385 } 386 387 static 388 void unicode_dealloc(register PyUnicodeObject *unicode) 389 { 390 if (PyUnicode_CheckExact(unicode) && 391 numfree < PyUnicode_MAXFREELIST) { 392 /* Keep-Alive optimization */ 393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 394 PyObject_DEL(unicode->str); 395 unicode->str = NULL; 396 unicode->length = 0; 397 } 398 if (unicode->defenc) { 399 Py_CLEAR(unicode->defenc); 400 } 401 /* Add to free list */ 402 *(PyUnicodeObject **)unicode = free_list; 403 free_list = unicode; 404 numfree++; 405 } 406 else { 407 PyObject_DEL(unicode->str); 408 Py_XDECREF(unicode->defenc); 409 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 410 } 411 } 412 413 static 414 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length) 415 { 416 register PyUnicodeObject *v; 417 418 /* Argument checks */ 419 if (unicode == NULL) { 420 PyErr_BadInternalCall(); 421 return -1; 422 } 423 v = *unicode; 424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) { 425 PyErr_BadInternalCall(); 426 return -1; 427 } 428 429 /* Resizing unicode_empty and single character objects is not 430 possible since these are being shared. We simply return a fresh 431 copy with the same Unicode content. */ 432 if (v->length != length && 433 (v == unicode_empty || v->length == 1)) { 434 PyUnicodeObject *w = _PyUnicode_New(length); 435 if (w == NULL) 436 return -1; 437 Py_UNICODE_COPY(w->str, v->str, 438 length < v->length ? length : v->length); 439 Py_SETREF(*unicode, w); 440 return 0; 441 } 442 443 /* Note that we don't have to modify *unicode for unshared Unicode 444 objects, since we can modify them in-place. 
*/ 445 return unicode_resize(v, length); 446 } 447 448 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) 449 { 450 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length); 451 } 452 453 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 454 Py_ssize_t size) 455 { 456 PyUnicodeObject *unicode; 457 458 /* If the Unicode data is known at construction time, we can apply 459 some optimizations which share commonly used objects. */ 460 if (u != NULL) { 461 462 /* Optimization for empty strings */ 463 if (size == 0) 464 _Py_RETURN_UNICODE_EMPTY(); 465 466 /* Single character Unicode objects in the Latin-1 range are 467 shared when using this constructor */ 468 if (size == 1 && *u < 256) { 469 unicode = unicode_latin1[*u]; 470 if (!unicode) { 471 unicode = _PyUnicode_New(1); 472 if (!unicode) 473 return NULL; 474 unicode->str[0] = *u; 475 unicode_latin1[*u] = unicode; 476 } 477 Py_INCREF(unicode); 478 return (PyObject *)unicode; 479 } 480 } 481 482 unicode = _PyUnicode_New(size); 483 if (!unicode) 484 return NULL; 485 486 /* Copy the Unicode data into the new object */ 487 if (u != NULL) 488 Py_UNICODE_COPY(unicode->str, u, size); 489 490 return (PyObject *)unicode; 491 } 492 493 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 494 { 495 PyUnicodeObject *unicode; 496 497 if (size < 0) { 498 PyErr_SetString(PyExc_SystemError, 499 "Negative size passed to PyUnicode_FromStringAndSize"); 500 return NULL; 501 } 502 503 /* If the Unicode data is known at construction time, we can apply 504 some optimizations which share commonly used objects. 505 Also, this means the input must be UTF-8, so fall back to the 506 UTF-8 decoder at the end. */ 507 if (u != NULL) { 508 509 /* Optimization for empty strings */ 510 if (size == 0) 511 _Py_RETURN_UNICODE_EMPTY(); 512 513 /* Single characters are shared when using this constructor. 514 Restrict to ASCII, since the input must be UTF-8. 
*/ 515 if (size == 1 && Py_CHARMASK(*u) < 128) { 516 unicode = unicode_latin1[Py_CHARMASK(*u)]; 517 if (!unicode) { 518 unicode = _PyUnicode_New(1); 519 if (!unicode) 520 return NULL; 521 unicode->str[0] = Py_CHARMASK(*u); 522 unicode_latin1[Py_CHARMASK(*u)] = unicode; 523 } 524 Py_INCREF(unicode); 525 return (PyObject *)unicode; 526 } 527 528 return PyUnicode_DecodeUTF8(u, size, NULL); 529 } 530 531 unicode = _PyUnicode_New(size); 532 if (!unicode) 533 return NULL; 534 535 return (PyObject *)unicode; 536 } 537 538 PyObject *PyUnicode_FromString(const char *u) 539 { 540 size_t size = strlen(u); 541 if (size > PY_SSIZE_T_MAX) { 542 PyErr_SetString(PyExc_OverflowError, "input too long"); 543 return NULL; 544 } 545 546 return PyUnicode_FromStringAndSize(u, size); 547 } 548 549 /* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed 550 * by 'ptr', possibly combining surrogate pairs on narrow builds. 551 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character 552 * that should be returned and 'end' pointing to the end of the buffer. 553 * ('end' is used on narrow builds to detect a lone surrogate at the 554 * end of the buffer that should be returned unchanged.) 555 * The ptr and end arguments should be side-effect free and ptr must an lvalue. 556 * The type of the returned char is always Py_UCS4. 557 * 558 * Note: the macro advances ptr to next char, so it might have side-effects 559 * (especially if used with other macros). 560 */ 561 562 /* helper macros used by _Py_UNICODE_NEXT */ 563 #define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF) 564 #define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF) 565 /* Join two surrogate characters and return a single Py_UCS4 value. 
*/
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) ((Py_UCS4)*(ptr)++)
#else
/* A pair is only joined when both halves lie before 'end'; the bounds
   check comes first so that (ptr)[1] is never read at or past 'end'. */
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     ((((ptr) + 1 < (end)) &&                                           \
       _Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                         \
       _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                        \
      ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) :  \
      (Py_UCS4)*(ptr)++)
#endif

#ifdef HAVE_WCHAR_H

#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
# define CONVERT_WCHAR_TO_SURROGATES
#endif

#ifdef CONVERT_WCHAR_TO_SURROGATES

/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
   to convert from UTF32 to UTF16. */

/* Create a Unicode object from size wchar_t characters at w.  Values
   above 0xFFFF are stored as a surrogate pair.  Returns a new reference,
   or NULL with an exception set (w == NULL is a bad internal call). */
PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
                                 Py_ssize_t size)
{
    PyUnicodeObject *unicode;
    register Py_ssize_t i;
    Py_ssize_t alloc;
    const wchar_t *orig_w;

    if (w == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* First pass: every non-BMP character needs one extra code unit. */
    alloc = size;
    orig_w = w;
    for (i = size; i > 0; i--) {
        if (*w > 0xFFFF)
            alloc++;
        w++;
    }
    w = orig_w;
    unicode = _PyUnicode_New(alloc);
    if (!unicode)
        return NULL;

    /* Copy the wchar_t data into the new object */
    {
        register Py_UNICODE *u;
        u = PyUnicode_AS_UNICODE(unicode);
        for (i = size; i > 0; i--) {
            if (*w > 0xFFFF) {
                wchar_t ordinal = *w++;
                ordinal -= 0x10000;
                *u++ = 0xD800 | (ordinal >> 10);
                *u++ = 0xDC00 | (ordinal & 0x3FF);
            }
            else
                *u++ = *w++;
        }
    }
    return (PyObject *)unicode;
}

#else

/* Create a Unicode object from size wchar_t characters at w, copying
   them one-to-one.  Returns a new reference, or NULL with an exception
   set (w == NULL is a bad internal call). */
PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
                                 Py_ssize_t size)
{
    PyUnicodeObject *unicode;

    if (w == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;

    /* Copy the wchar_t data into the new object */
#ifdef HAVE_USABLE_WCHAR_T
    memcpy(unicode->str, w, size * sizeof(wchar_t));
#else
    {
        register Py_UNICODE *u;
        register Py_ssize_t i;
        u = PyUnicode_AS_UNICODE(unicode);
        for (i = size; i > 0; i--)
            *u++ = *w++;
    }
#endif

    return (PyObject *)unicode;
}

#endif /* CONVERT_WCHAR_TO_SURROGATES */

#undef CONVERT_WCHAR_TO_SURROGATES

/* Build a printf format such as "%05.3ld" in fmt from the parsed pieces.
   A width or precision of 0 is omitted; longflag takes precedence over
   size_tflag; c is the conversion character. */
static void
makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
{
    *fmt++ = '%';
    if (width) {
        if (zeropad)
            *fmt++ = '0';
        fmt += sprintf(fmt, "%d", width);
    }
    if (precision)
        fmt += sprintf(fmt, ".%d", precision);
    if (longflag)
        *fmt++ = 'l';
    else if (size_tflag) {
        const char *f = PY_FORMAT_SIZE_T;
        while (*f)
            *fmt++ = *f++;
    }
    *fmt++ = c;
    *fmt = '\0';
}

/* Append the bytes of a NUL-terminated C string to the output, one
   Py_UNICODE per byte.  Uses 'copy' and 's' from the enclosing scope. */
#define appendstring(string)                    \
    do {                                        \
        for (copy = string;*copy; copy++) {     \
            *s++ = (unsigned char)*copy;        \
        }                                       \
    } while (0)

/* Build a Unicode object from a printf-style format and a va_list.

   The work is done in passes: count the %s/%S/%R conversions, convert
   those arguments once while computing a worst-case output size, then
   fill a buffer of that size and shrink it to the length actually used.
   An unknown conversion copies the rest of the format verbatim.

   Returns a new reference, or NULL with an exception set. */
PyObject *
PyUnicode_FromFormatV(const char *format, va_list vargs)
{
    va_list count;
    Py_ssize_t callcount = 0;
    PyObject **callresults = NULL;
    PyObject **callresult = NULL;
    Py_ssize_t n = 0;
    Py_ssize_t length;
    int width = 0;
    int precision = 0;
    int zeropad;
    const char* f;
    Py_UNICODE *s;
    PyObject *string;
    /* used by sprintf */
    char buffer[21];
    /* use abuffer instead of buffer, if we need more space
     * (which can happen if there's a format specifier with width). */
    char *abuffer = NULL;
    char *realbuffer;
    Py_ssize_t abuffersize = 0;
    char fmt[60]; /* should be enough for %0width.precisionld */
    const char *copy;

#ifdef VA_LIST_IS_ARRAY
    Py_MEMCPY(count, vargs, sizeof(va_list));
#else
#ifdef  __va_copy
    __va_copy(count, vargs);
#else
    count = vargs;
#endif
#endif
    /* step 1: count the number of %S/%R/%s format specifications
     * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
     * objects once during step 3 and put the result in an array) */
    for (f = format; *f; f++) {
        if (*f == '%') {
            f++;
            /* <ctype.h> functions need an unsigned char value */
            while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
                f++;
            if (!*f)
                break;
            if (*f == 's' || *f=='S' || *f=='R')
                ++callcount;
        }
    }
    /* step 2: allocate memory for the results of
     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
    if (callcount) {
        callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
        if (!callresults) {
            PyErr_NoMemory();
            return NULL;
        }
        callresult = callresults;
    }
    /* step 3: figure out how large a buffer we need */
    for (f = format; *f; f++) {
        if (*f == '%') {
            const char* p = f++;
            int longflag = 0;
            int size_tflag = 0;
            width = 0;
            while (isdigit(Py_CHARMASK(*f)))
                width = (width*10) + *f++ - '0';
            precision = 0;
            if (*f == '.') {
                f++;
                while (isdigit(Py_CHARMASK(*f)))
                    precision = (precision*10) + *f++ - '0';
            }

            /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu}: they don't
             * affect the amount of space we reserve, but they do decide
             * which argument type has to be consumed below.
             */
            if ((*f == 'l' || *f == 'z') &&
                (f[1] == 'd' || f[1] == 'u')) {
                if (*f == 'l')
                    longflag = 1;
                else
                    size_tflag = 1;
                ++f;
            }

            switch (*f) {
            case 'c':
            {
                int ordinal = va_arg(count, int);
#ifdef Py_UNICODE_WIDE
                if (ordinal < 0 || ordinal > 0x10ffff) {
                    PyErr_SetString(PyExc_OverflowError,
                                    "%c arg not in range(0x110000) "
                                    "(wide Python build)");
                    goto fail;
                }
#else
                if (ordinal < 0 || ordinal > 0xffff) {
                    PyErr_SetString(PyExc_OverflowError,
                                    "%c arg not in range(0x10000) "
                                    "(narrow Python build)");
                    goto fail;
                }
#endif
                /* fall through... */
            }
            case '%':
                n++;
                break;
            case 'd': case 'u': case 'i': case 'x':
                /* Consume an argument of the same width that step 4
                   reads, so that both va_lists stay in step. */
                if (longflag)
                    (void) va_arg(count, long);
                else if (size_tflag)
                    (void) va_arg(count, Py_ssize_t);
                else
                    (void) va_arg(count, int);
                if (width < precision)
                    width = precision;
                /* 20 bytes is enough to hold a 64-bit
                   integer.  Decimal takes the most space.
                   This isn't enough for octal.
                   If a width is specified we need more
                   (which we allocate later). */
                if (width < 20)
                    width = 20;
                n += width;
                if (abuffersize < width)
                    abuffersize = width;
                break;
            case 's':
            {
                /* UTF-8 */
                const char *s = va_arg(count, const char*);
                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
                if (!str)
                    goto fail;
                n += PyUnicode_GET_SIZE(str);
                /* Remember the str and switch to the next slot */
                *callresult++ = str;
                break;
            }
            case 'U':
            {
                PyObject *obj = va_arg(count, PyObject *);
                assert(obj && PyUnicode_Check(obj));
                n += PyUnicode_GET_SIZE(obj);
                break;
            }
            case 'V':
            {
                PyObject *obj = va_arg(count, PyObject *);
                const char *str = va_arg(count, const char *);
                assert(obj || str);
                assert(!obj || PyUnicode_Check(obj));
                if (obj)
                    n += PyUnicode_GET_SIZE(obj);
                else
                    n += strlen(str);
                break;
            }
            case 'S':
            {
                PyObject *obj = va_arg(count, PyObject *);
                PyObject *str;
                assert(obj);
                str = PyObject_Str(obj);
                if (!str)
                    goto fail;
                n += PyString_GET_SIZE(str);
                /* Remember the str and switch to the next slot */
                *callresult++ = str;
                break;
            }
            case 'R':
            {
                PyObject *obj = va_arg(count, PyObject *);
                PyObject *repr;
                assert(obj);
                repr = PyObject_Repr(obj);
                if (!repr)
                    goto fail;
                /* Step 4 reads this result with PyString_AS_STRING(),
                   so its size must be taken with the string accessor. */
                n += PyString_GET_SIZE(repr);
                /* Remember the repr and switch to the next slot */
                *callresult++ = repr;
                break;
            }
            case 'p':
                (void) va_arg(count, void *);
                /* maximum 64-bit pointer representation:
                 * 0xffffffffffffffff
                 * so 19 characters is enough.
                 * XXX I count 18 -- what's the extra for?
                 */
                n += 19;
                break;
            default:
                /* if we stumble upon an unknown
                   formatting code, copy the rest of
                   the format string to the output
                   string. (we cannot just skip the
                   code, since there's no way to know
                   what's in the argument list) */
                n += strlen(p);
                goto expand;
            }
        } else
            n++;
    }
  expand:
    if (abuffersize > 20) {
        /* add 1 for sprintf's trailing null byte */
        abuffer = PyObject_Malloc(abuffersize + 1);
        if (!abuffer) {
            PyErr_NoMemory();
            goto fail;
        }
        realbuffer = abuffer;
    }
    else
        realbuffer = buffer;
    /* step 4: fill the buffer */
    /* Since we've analyzed how much space we need for the worst case,
       we don't have to resize the string while filling it; it is only
       shrunk to its final length at the end. */
    string = PyUnicode_FromUnicode(NULL, n);
    if (!string)
        goto fail;

    s = PyUnicode_AS_UNICODE(string);
    callresult = callresults;

    for (f = format; *f; f++) {
        if (*f == '%') {
            const char* p = f++;
            int longflag = 0;
            int size_tflag = 0;
            zeropad = (*f == '0');
            /* parse the width.precision part */
            width = 0;
            while (isdigit(Py_CHARMASK(*f)))
                width = (width*10) + *f++ - '0';
            precision = 0;
            if (*f == '.') {
                f++;
                while (isdigit(Py_CHARMASK(*f)))
                    precision = (precision*10) + *f++ - '0';
            }
            /* handle the long flag, but only for %ld and %lu.
               others can be added when necessary. */
            if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
                longflag = 1;
                ++f;
            }
            /* handle the size_t flag. */
            if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
                size_tflag = 1;
                ++f;
            }

            switch (*f) {
            case 'c':
                *s++ = va_arg(vargs, int);
                break;
            case 'd':
                makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
                if (longflag)
                    sprintf(realbuffer, fmt, va_arg(vargs, long));
                else if (size_tflag)
                    sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
                else
                    sprintf(realbuffer, fmt, va_arg(vargs, int));
                appendstring(realbuffer);
                break;
            case 'u':
                makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
                if (longflag)
                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
                else if (size_tflag)
                    sprintf(realbuffer, fmt, va_arg(vargs, size_t));
                else
                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
                appendstring(realbuffer);
                break;
            case 'i':
                makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
                sprintf(realbuffer, fmt, va_arg(vargs, int));
                appendstring(realbuffer);
                break;
            case 'x':
                makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
                sprintf(realbuffer, fmt, va_arg(vargs, int));
                appendstring(realbuffer);
                break;
            case 's':
            {
                /* unused, since we already have the result */
                (void) va_arg(vargs, char *);
                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
                                PyUnicode_GET_SIZE(*callresult));
                s += PyUnicode_GET_SIZE(*callresult);
                /* We're done with the unicode()/repr() => forget it */
                Py_DECREF(*callresult);
                /* switch to next unicode()/repr() result */
                ++callresult;
                break;
            }
            case 'U':
            {
                PyObject *obj = va_arg(vargs, PyObject *);
                Py_ssize_t size = PyUnicode_GET_SIZE(obj);
                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
                s += size;
                break;
            }
            case 'V':
            {
                PyObject *obj = va_arg(vargs, PyObject *);
                const char *str = va_arg(vargs, const char *);
                if (obj) {
                    Py_ssize_t size = PyUnicode_GET_SIZE(obj);
                    Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
                    s += size;
                } else {
                    appendstring(str);
                }
                break;
            }
            case 'S':
            case 'R':
            {
                const char *str = PyString_AS_STRING(*callresult);
                /* unused, since we already have the result */
                (void) va_arg(vargs, PyObject *);
                appendstring(str);
                /* We're done with the unicode()/repr() => forget it */
                Py_DECREF(*callresult);
                /* switch to next unicode()/repr() result */
                ++callresult;
                break;
            }
            case 'p':
                sprintf(buffer, "%p", va_arg(vargs, void*));
                /* %p is ill-defined:  ensure leading 0x. */
                if (buffer[1] == 'X')
                    buffer[1] = 'x';
                else if (buffer[1] != 'x') {
                    memmove(buffer+2, buffer, strlen(buffer)+1);
                    buffer[0] = '0';
                    buffer[1] = 'x';
                }
                appendstring(buffer);
                break;
            case '%':
                *s++ = '%';
                break;
            default:
                appendstring(p);
                goto end;
            }
        } else
            *s++ = *f;
    }

  end:
    if (callresults)
        PyObject_Free(callresults);
    if (abuffer)
        PyObject_Free(abuffer);
    /* Shrink to the length actually written.  The resize is skipped when
       nothing has to change: in particular, for n == 0 'string' is the
       shared empty string, which PyUnicode_Resize() would reject because
       its reference count is not 1. */
    length = s - PyUnicode_AS_UNICODE(string);
    if (length != PyUnicode_GET_SIZE(string) &&
        PyUnicode_Resize(&string, length) < 0) {
        Py_DECREF(string);
        return NULL;
    }
    return string;
  fail:
    if (callresults) {
        /* Release only the results that were actually stored. */
        PyObject **callresult2 = callresults;
        while (callresult2 < callresult) {
            Py_DECREF(*callresult2);
            ++callresult2;
        }
        PyObject_Free(callresults);
    }
    if (abuffer)
        PyObject_Free(abuffer);
    return NULL;
}

#undef appendstring

/* Variadic front end for PyUnicode_FromFormatV(). */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    PyObject* ret;
    va_list vargs;

#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    ret = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);
    return ret;
}

/* Copy at most size characters of unicode into w.  The terminating 0 is
   copied as well when size is larger than the string length.  Returns the
   number of characters copied, not counting a copied terminator, or -1
   with an exception set if unicode is NULL. */
Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
                                wchar_t *w,
                                Py_ssize_t size)
{
    if (unicode == NULL) {
        PyErr_BadInternalCall();
        return -1;
    }

    /* If possible, try to copy the 0-termination as well */
    if (size > PyUnicode_GET_SIZE(unicode))
        size = PyUnicode_GET_SIZE(unicode) + 1;

#ifdef HAVE_USABLE_WCHAR_T
    memcpy(w, unicode->str, size * sizeof(wchar_t));
#else
    {
        register Py_UNICODE *u;
        register Py_ssize_t i;
        u = PyUnicode_AS_UNICODE(unicode);
        for (i = size; i > 0; i--)
            *w++ = *u++;
    }
#endif

    if (size > PyUnicode_GET_SIZE(unicode))
        return PyUnicode_GET_SIZE(unicode);
    else
        return size;
}

#endif

PyObject
*PyUnicode_FromOrdinal(int ordinal) 1123 { 1124 Py_UNICODE s[1]; 1125 1126 #ifdef Py_UNICODE_WIDE 1127 if (ordinal < 0 || ordinal > 0x10ffff) { 1128 PyErr_SetString(PyExc_ValueError, 1129 "unichr() arg not in range(0x110000) " 1130 "(wide Python build)"); 1131 return NULL; 1132 } 1133 #else 1134 if (ordinal < 0 || ordinal > 0xffff) { 1135 PyErr_SetString(PyExc_ValueError, 1136 "unichr() arg not in range(0x10000) " 1137 "(narrow Python build)"); 1138 return NULL; 1139 } 1140 #endif 1141 1142 s[0] = (Py_UNICODE)ordinal; 1143 return PyUnicode_FromUnicode(s, 1); 1144 } 1145 1146 PyObject *PyUnicode_FromObject(register PyObject *obj) 1147 { 1148 /* XXX Perhaps we should make this API an alias of 1149 PyObject_Unicode() instead ?! */ 1150 if (PyUnicode_CheckExact(obj)) { 1151 Py_INCREF(obj); 1152 return obj; 1153 } 1154 if (PyUnicode_Check(obj)) { 1155 /* For a Unicode subtype that's not a Unicode object, 1156 return a true Unicode object with the same data. */ 1157 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 1158 PyUnicode_GET_SIZE(obj)); 1159 } 1160 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 1161 } 1162 1163 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 1164 const char *encoding, 1165 const char *errors) 1166 { 1167 const char *s = NULL; 1168 Py_ssize_t len; 1169 PyObject *v; 1170 1171 if (obj == NULL) { 1172 PyErr_BadInternalCall(); 1173 return NULL; 1174 } 1175 1176 #if 0 1177 /* For b/w compatibility we also accept Unicode objects provided 1178 that no encodings is given and then redirect to 1179 PyObject_Unicode() which then applies the additional logic for 1180 Unicode subclasses. 1181 1182 NOTE: This API should really only be used for object which 1183 represent *encoded* Unicode ! 
1184 1185 */ 1186 if (PyUnicode_Check(obj)) { 1187 if (encoding) { 1188 PyErr_SetString(PyExc_TypeError, 1189 "decoding Unicode is not supported"); 1190 return NULL; 1191 } 1192 return PyObject_Unicode(obj); 1193 } 1194 #else 1195 if (PyUnicode_Check(obj)) { 1196 PyErr_SetString(PyExc_TypeError, 1197 "decoding Unicode is not supported"); 1198 return NULL; 1199 } 1200 #endif 1201 1202 /* Coerce object */ 1203 if (PyString_Check(obj)) { 1204 s = PyString_AS_STRING(obj); 1205 len = PyString_GET_SIZE(obj); 1206 } 1207 else if (PyByteArray_Check(obj)) { 1208 /* Python 2.x specific */ 1209 PyErr_Format(PyExc_TypeError, 1210 "decoding bytearray is not supported"); 1211 return NULL; 1212 } 1213 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 1214 /* Overwrite the error message with something more useful in 1215 case of a TypeError. */ 1216 if (PyErr_ExceptionMatches(PyExc_TypeError)) 1217 PyErr_Format(PyExc_TypeError, 1218 "coercing to Unicode: need string or buffer, " 1219 "%.80s found", 1220 Py_TYPE(obj)->tp_name); 1221 goto onError; 1222 } 1223 1224 /* Convert to Unicode */ 1225 if (len == 0) 1226 _Py_RETURN_UNICODE_EMPTY(); 1227 1228 v = PyUnicode_Decode(s, len, encoding, errors); 1229 return v; 1230 1231 onError: 1232 return NULL; 1233 } 1234 1235 PyObject *PyUnicode_Decode(const char *s, 1236 Py_ssize_t size, 1237 const char *encoding, 1238 const char *errors) 1239 { 1240 PyObject *buffer = NULL, *unicode; 1241 1242 if (encoding == NULL) 1243 encoding = PyUnicode_GetDefaultEncoding(); 1244 1245 /* Shortcuts for common default encodings */ 1246 if (strcmp(encoding, "utf-8") == 0) 1247 return PyUnicode_DecodeUTF8(s, size, errors); 1248 else if (strcmp(encoding, "latin-1") == 0) 1249 return PyUnicode_DecodeLatin1(s, size, errors); 1250 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1251 else if (strcmp(encoding, "mbcs") == 0) 1252 return PyUnicode_DecodeMBCS(s, size, errors); 1253 #endif 1254 else if (strcmp(encoding, "ascii") == 0) 1255 return 
PyUnicode_DecodeASCII(s, size, errors); 1256 1257 /* Decode via the codec registry */ 1258 buffer = PyBuffer_FromMemory((void *)s, size); 1259 if (buffer == NULL) 1260 goto onError; 1261 unicode = _PyCodec_DecodeText(buffer, encoding, errors); 1262 if (unicode == NULL) 1263 goto onError; 1264 if (!PyUnicode_Check(unicode)) { 1265 PyErr_Format(PyExc_TypeError, 1266 "decoder did not return an unicode object (type=%.400s)", 1267 Py_TYPE(unicode)->tp_name); 1268 Py_DECREF(unicode); 1269 goto onError; 1270 } 1271 Py_DECREF(buffer); 1272 return unicode; 1273 1274 onError: 1275 Py_XDECREF(buffer); 1276 return NULL; 1277 } 1278 1279 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, 1280 const char *encoding, 1281 const char *errors) 1282 { 1283 PyObject *v; 1284 1285 if (!PyUnicode_Check(unicode)) { 1286 PyErr_BadArgument(); 1287 goto onError; 1288 } 1289 1290 if (PyErr_WarnPy3k("decoding Unicode is not supported in 3.x", 1) < 0) 1291 goto onError; 1292 1293 if (encoding == NULL) 1294 encoding = PyUnicode_GetDefaultEncoding(); 1295 1296 /* Decode via the codec registry */ 1297 v = _PyCodec_DecodeText(unicode, encoding, errors); 1298 if (v == NULL) 1299 goto onError; 1300 return v; 1301 1302 onError: 1303 return NULL; 1304 } 1305 1306 PyObject *PyUnicode_Encode(const Py_UNICODE *s, 1307 Py_ssize_t size, 1308 const char *encoding, 1309 const char *errors) 1310 { 1311 PyObject *v, *unicode; 1312 1313 unicode = PyUnicode_FromUnicode(s, size); 1314 if (unicode == NULL) 1315 return NULL; 1316 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 1317 Py_DECREF(unicode); 1318 return v; 1319 } 1320 1321 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, 1322 const char *encoding, 1323 const char *errors) 1324 { 1325 PyObject *v; 1326 1327 if (!PyUnicode_Check(unicode)) { 1328 PyErr_BadArgument(); 1329 goto onError; 1330 } 1331 1332 if (encoding == NULL) 1333 encoding = PyUnicode_GetDefaultEncoding(); 1334 1335 /* Encode via the codec registry */ 1336 v = 
_PyCodec_EncodeText(unicode, encoding, errors); 1337 if (v == NULL) 1338 goto onError; 1339 return v; 1340 1341 onError: 1342 return NULL; 1343 } 1344 1345 PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 1346 const char *encoding, 1347 const char *errors) 1348 { 1349 PyObject *v; 1350 1351 if (!PyUnicode_Check(unicode)) { 1352 PyErr_BadArgument(); 1353 goto onError; 1354 } 1355 1356 if (encoding == NULL) 1357 encoding = PyUnicode_GetDefaultEncoding(); 1358 1359 /* Shortcuts for common default encodings */ 1360 if (errors == NULL) { 1361 if (strcmp(encoding, "utf-8") == 0) 1362 return PyUnicode_AsUTF8String(unicode); 1363 else if (strcmp(encoding, "latin-1") == 0) 1364 return PyUnicode_AsLatin1String(unicode); 1365 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1366 else if (strcmp(encoding, "mbcs") == 0) 1367 return PyUnicode_AsMBCSString(unicode); 1368 #endif 1369 else if (strcmp(encoding, "ascii") == 0) 1370 return PyUnicode_AsASCIIString(unicode); 1371 } 1372 1373 /* Encode via the codec registry */ 1374 v = _PyCodec_EncodeText(unicode, encoding, errors); 1375 if (v == NULL) 1376 goto onError; 1377 if (!PyString_Check(v)) { 1378 PyErr_Format(PyExc_TypeError, 1379 "encoder did not return a string object (type=%.400s)", 1380 Py_TYPE(v)->tp_name); 1381 Py_DECREF(v); 1382 goto onError; 1383 } 1384 return v; 1385 1386 onError: 1387 return NULL; 1388 } 1389 1390 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 1391 const char *errors) 1392 { 1393 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 1394 1395 if (v) 1396 return v; 1397 v = PyUnicode_AsEncodedString(unicode, NULL, errors); 1398 if (v && errors == NULL) 1399 ((PyUnicodeObject *)unicode)->defenc = v; 1400 return v; 1401 } 1402 1403 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 1404 { 1405 if (!PyUnicode_Check(unicode)) { 1406 PyErr_BadArgument(); 1407 goto onError; 1408 } 1409 return PyUnicode_AS_UNICODE(unicode); 1410 1411 onError: 1412 return NULL; 1413 } 1414 1415 
Py_ssize_t PyUnicode_GetSize(PyObject *unicode) 1416 { 1417 if (!PyUnicode_Check(unicode)) { 1418 PyErr_BadArgument(); 1419 goto onError; 1420 } 1421 return PyUnicode_GET_SIZE(unicode); 1422 1423 onError: 1424 return -1; 1425 } 1426 1427 const char *PyUnicode_GetDefaultEncoding(void) 1428 { 1429 return unicode_default_encoding; 1430 } 1431 1432 int PyUnicode_SetDefaultEncoding(const char *encoding) 1433 { 1434 PyObject *v; 1435 1436 /* Make sure the encoding is valid. As side effect, this also 1437 loads the encoding into the codec registry cache. */ 1438 v = _PyCodec_Lookup(encoding); 1439 if (v == NULL) 1440 goto onError; 1441 Py_DECREF(v); 1442 strncpy(unicode_default_encoding, 1443 encoding, 1444 sizeof(unicode_default_encoding) - 1); 1445 return 0; 1446 1447 onError: 1448 return -1; 1449 } 1450 1451 /* error handling callback helper: 1452 build arguments, call the callback and check the arguments, 1453 if no exception occurred, copy the replacement to the output 1454 and adjust various state variables. 
1455 return 0 on success, -1 on error 1456 */ 1457 1458 static 1459 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 1460 const char *encoding, const char *reason, 1461 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, 1462 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 1463 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 1464 { 1465 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple"; 1466 1467 PyObject *restuple = NULL; 1468 PyObject *repunicode = NULL; 1469 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 1470 Py_ssize_t requiredsize; 1471 Py_ssize_t newpos; 1472 Py_UNICODE *repptr; 1473 Py_ssize_t repsize; 1474 int res = -1; 1475 1476 if (*errorHandler == NULL) { 1477 *errorHandler = PyCodec_LookupError(errors); 1478 if (*errorHandler == NULL) 1479 goto onError; 1480 } 1481 1482 if (*exceptionObject == NULL) { 1483 *exceptionObject = PyUnicodeDecodeError_Create( 1484 encoding, input, insize, *startinpos, *endinpos, reason); 1485 if (*exceptionObject == NULL) 1486 goto onError; 1487 } 1488 else { 1489 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) 1490 goto onError; 1491 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) 1492 goto onError; 1493 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 1494 goto onError; 1495 } 1496 1497 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 1498 if (restuple == NULL) 1499 goto onError; 1500 if (!PyTuple_Check(restuple)) { 1501 PyErr_SetString(PyExc_TypeError, &argparse[4]); 1502 goto onError; 1503 } 1504 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 1505 goto onError; 1506 if (newpos<0) 1507 newpos = insize+newpos; 1508 if (newpos<0 || newpos>insize) { 1509 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 1510 goto onError; 1511 } 1512 1513 /* need more space? 
(at least enough for what we 1514 have+the replacement+the rest of the string (starting 1515 at the new input position), so we won't have to check space 1516 when there are no errors in the rest of the string) */ 1517 repptr = PyUnicode_AS_UNICODE(repunicode); 1518 repsize = PyUnicode_GET_SIZE(repunicode); 1519 requiredsize = *outpos; 1520 if (requiredsize > PY_SSIZE_T_MAX - repsize) 1521 goto overflow; 1522 requiredsize += repsize; 1523 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos)) 1524 goto overflow; 1525 requiredsize += insize - newpos; 1526 if (requiredsize > outsize) { 1527 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize) 1528 requiredsize = 2*outsize; 1529 if (_PyUnicode_Resize(output, requiredsize) < 0) 1530 goto onError; 1531 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 1532 } 1533 *endinpos = newpos; 1534 *inptr = input + newpos; 1535 Py_UNICODE_COPY(*outptr, repptr, repsize); 1536 *outptr += repsize; 1537 *outpos += repsize; 1538 /* we made it! */ 1539 res = 0; 1540 1541 onError: 1542 Py_XDECREF(restuple); 1543 return res; 1544 1545 overflow: 1546 PyErr_SetString(PyExc_OverflowError, 1547 "decoded result is too long for a Python string"); 1548 goto onError; 1549 } 1550 1551 /* --- UTF-7 Codec -------------------------------------------------------- */ 1552 1553 /* See RFC2152 for details. We encode conservatively and decode liberally. */ 1554 1555 /* Three simple macros defining base-64. */ 1556 1557 /* Is c a base-64 character? */ 1558 1559 #define IS_BASE64(c) \ 1560 (((c) >= 'A' && (c) <= 'Z') || \ 1561 ((c) >= 'a' && (c) <= 'z') || \ 1562 ((c) >= '0' && (c) <= '9') || \ 1563 (c) == '+' || (c) == '/') 1564 1565 /* given that c is a base-64 character, what is its base-64 value? */ 1566 1567 #define FROM_BASE64(c) \ 1568 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 1569 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 1570 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 1571 (c) == '+' ? 
62 : 63) 1572 1573 /* What is the base-64 character of the bottom 6 bits of n? */ 1574 1575 #define TO_BASE64(n) \ 1576 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 1577 1578 /* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 1579 * decoded as itself. We are permissive on decoding; the only ASCII 1580 * byte not decoding to itself is the + which begins a base64 1581 * string. */ 1582 1583 #define DECODE_DIRECT(c) \ 1584 ((c) <= 127 && (c) != '+') 1585 1586 /* The UTF-7 encoder treats ASCII characters differently according to 1587 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 1588 * the above). See RFC2152. This array identifies these different 1589 * sets: 1590 * 0 : "Set D" 1591 * alphanumeric and '(),-./:? 1592 * 1 : "Set O" 1593 * !"#$%&*;<=>@[]^_`{|} 1594 * 2 : "whitespace" 1595 * ht nl cr sp 1596 * 3 : special (must be base64 encoded) 1597 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 1598 */ 1599 1600 static 1601 char utf7_category[128] = { 1602 /* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 1603 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 1604 /* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 1605 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1606 /* sp ! " # $ % & ' ( ) * + , - . / */ 1607 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 1608 /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 1609 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1610 /* @ A B C D E F G H I J K L M N O */ 1611 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1612 /* P Q R S T U V W X Y Z [ \ ] ^ _ */ 1613 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 1614 /* ` a b c d e f g h i j k l m n o */ 1615 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1616 /* p q r s t u v w x y z { | } ~ del */ 1617 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 1618 }; 1619 1620 /* ENCODE_DIRECT: this character should be encoded as itself. 
The 1621 * answer depends on whether we are encoding set O as itself, and also 1622 * on whether we are encoding whitespace as itself. RFC2152 makes it 1623 * clear that the answers to these questions vary between 1624 * applications, so this code needs to be flexible. */ 1625 1626 #define ENCODE_DIRECT(c, directO, directWS) \ 1627 ((c) < 128 && (c) > 0 && \ 1628 ((utf7_category[(c)] == 0) || \ 1629 (directWS && (utf7_category[(c)] == 2)) || \ 1630 (directO && (utf7_category[(c)] == 1)))) 1631 1632 PyObject *PyUnicode_DecodeUTF7(const char *s, 1633 Py_ssize_t size, 1634 const char *errors) 1635 { 1636 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 1637 } 1638 1639 /* The decoder. The only state we preserve is our read position, 1640 * i.e. how many characters we have consumed. So if we end in the 1641 * middle of a shift sequence we have to back off the read position 1642 * and the output to the beginning of the sequence, otherwise we lose 1643 * all the shift state (seen bits, number of bits seen, high 1644 * surrogate). 
*/ 1645 1646 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, 1647 Py_ssize_t size, 1648 const char *errors, 1649 Py_ssize_t *consumed) 1650 { 1651 const char *starts = s; 1652 Py_ssize_t startinpos; 1653 Py_ssize_t endinpos; 1654 Py_ssize_t outpos; 1655 const char *e; 1656 PyUnicodeObject *unicode; 1657 Py_UNICODE *p; 1658 const char *errmsg = ""; 1659 int inShift = 0; 1660 Py_UNICODE *shiftOutStart; 1661 unsigned int base64bits = 0; 1662 unsigned long base64buffer = 0; 1663 Py_UNICODE surrogate = 0; 1664 PyObject *errorHandler = NULL; 1665 PyObject *exc = NULL; 1666 1667 unicode = _PyUnicode_New(size); 1668 if (!unicode) 1669 return NULL; 1670 if (size == 0) { 1671 if (consumed) 1672 *consumed = 0; 1673 return (PyObject *)unicode; 1674 } 1675 1676 p = unicode->str; 1677 shiftOutStart = p; 1678 e = s + size; 1679 1680 while (s < e) { 1681 Py_UNICODE ch = (unsigned char) *s; 1682 1683 if (inShift) { /* in a base-64 section */ 1684 if (IS_BASE64(ch)) { /* consume a base-64 character */ 1685 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 1686 base64bits += 6; 1687 s++; 1688 if (base64bits >= 16) { 1689 /* we have enough bits for a UTF-16 value */ 1690 Py_UNICODE outCh = (Py_UNICODE) 1691 (base64buffer >> (base64bits-16)); 1692 base64bits -= 16; 1693 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 1694 assert(outCh <= 0xffff); 1695 if (surrogate) { 1696 /* expecting a second surrogate */ 1697 if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 1698 #ifdef Py_UNICODE_WIDE 1699 *p++ = (((surrogate & 0x3FF)<<10) 1700 | (outCh & 0x3FF)) + 0x10000; 1701 #else 1702 *p++ = surrogate; 1703 *p++ = outCh; 1704 #endif 1705 surrogate = 0; 1706 continue; 1707 } 1708 else { 1709 *p++ = surrogate; 1710 surrogate = 0; 1711 } 1712 } 1713 if (outCh >= 0xD800 && outCh <= 0xDBFF) { 1714 /* first surrogate */ 1715 surrogate = outCh; 1716 } 1717 else { 1718 *p++ = outCh; 1719 } 1720 } 1721 } 1722 else { /* now leaving a base-64 section */ 1723 inShift = 0; 1724 if 
(base64bits > 0) { /* left-over bits */ 1725 if (base64bits >= 6) { 1726 /* We've seen at least one base-64 character */ 1727 s++; 1728 errmsg = "partial character in shift sequence"; 1729 goto utf7Error; 1730 } 1731 else { 1732 /* Some bits remain; they should be zero */ 1733 if (base64buffer != 0) { 1734 s++; 1735 errmsg = "non-zero padding bits in shift sequence"; 1736 goto utf7Error; 1737 } 1738 } 1739 } 1740 if (surrogate && DECODE_DIRECT(ch)) 1741 *p++ = surrogate; 1742 surrogate = 0; 1743 if (ch == '-') { 1744 /* '-' is absorbed; other terminating 1745 characters are preserved */ 1746 s++; 1747 } 1748 } 1749 } 1750 else if ( ch == '+' ) { 1751 startinpos = s-starts; 1752 s++; /* consume '+' */ 1753 if (s < e && *s == '-') { /* '+-' encodes '+' */ 1754 s++; 1755 *p++ = '+'; 1756 } 1757 else { /* begin base64-encoded section */ 1758 inShift = 1; 1759 surrogate = 0; 1760 shiftOutStart = p; 1761 base64bits = 0; 1762 base64buffer = 0; 1763 } 1764 } 1765 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 1766 *p++ = ch; 1767 s++; 1768 } 1769 else { 1770 startinpos = s-starts; 1771 s++; 1772 errmsg = "unexpected special character"; 1773 goto utf7Error; 1774 } 1775 continue; 1776 utf7Error: 1777 outpos = p-PyUnicode_AS_UNICODE(unicode); 1778 endinpos = s-starts; 1779 if (unicode_decode_call_errorhandler( 1780 errors, &errorHandler, 1781 "utf7", errmsg, 1782 starts, size, &startinpos, &endinpos, &exc, &s, 1783 &unicode, &outpos, &p)) 1784 goto onError; 1785 } 1786 1787 /* end of string */ 1788 1789 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 1790 /* if we're in an inconsistent state, that's an error */ 1791 inShift = 0; 1792 if (surrogate || 1793 (base64bits >= 6) || 1794 (base64bits > 0 && base64buffer != 0)) { 1795 outpos = p-PyUnicode_AS_UNICODE(unicode); 1796 endinpos = size; 1797 if (unicode_decode_call_errorhandler( 1798 errors, &errorHandler, 1799 "utf7", "unterminated shift sequence", 1800 starts, size, &startinpos, 
&endinpos, &exc, &s, 1801 &unicode, &outpos, &p)) 1802 goto onError; 1803 } 1804 } 1805 1806 /* return state */ 1807 if (consumed) { 1808 if (inShift) { 1809 p = shiftOutStart; /* back off output */ 1810 *consumed = startinpos; 1811 } 1812 else { 1813 *consumed = s-starts; 1814 } 1815 } 1816 1817 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 1818 goto onError; 1819 1820 Py_XDECREF(errorHandler); 1821 Py_XDECREF(exc); 1822 return (PyObject *)unicode; 1823 1824 onError: 1825 Py_XDECREF(errorHandler); 1826 Py_XDECREF(exc); 1827 Py_DECREF(unicode); 1828 return NULL; 1829 } 1830 1831 1832 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 1833 Py_ssize_t size, 1834 int base64SetO, 1835 int base64WhiteSpace, 1836 const char *errors) 1837 { 1838 PyObject *v; 1839 /* It might be possible to tighten this worst case */ 1840 Py_ssize_t allocated = 8 * size; 1841 int inShift = 0; 1842 Py_ssize_t i = 0; 1843 unsigned int base64bits = 0; 1844 unsigned long base64buffer = 0; 1845 char * out; 1846 char * start; 1847 1848 if (allocated / 8 != size) 1849 return PyErr_NoMemory(); 1850 1851 if (size == 0) 1852 return PyString_FromStringAndSize(NULL, 0); 1853 1854 v = PyString_FromStringAndSize(NULL, allocated); 1855 if (v == NULL) 1856 return NULL; 1857 1858 start = out = PyString_AS_STRING(v); 1859 for (;i < size; ++i) { 1860 Py_UNICODE ch = s[i]; 1861 1862 if (inShift) { 1863 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 1864 /* shifting out */ 1865 if (base64bits) { /* output remaining bits */ 1866 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 1867 base64buffer = 0; 1868 base64bits = 0; 1869 } 1870 inShift = 0; 1871 /* Characters not in the BASE64 set implicitly unshift the sequence 1872 so no '-' is required, except if the character is itself a '-' */ 1873 if (IS_BASE64(ch) || ch == '-') { 1874 *out++ = '-'; 1875 } 1876 *out++ = (char) ch; 1877 } 1878 else { 1879 goto encode_char; 1880 } 1881 } 1882 else { /* not in a shift sequence */ 
1883 if (ch == '+') { 1884 *out++ = '+'; 1885 *out++ = '-'; 1886 } 1887 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 1888 *out++ = (char) ch; 1889 } 1890 else { 1891 *out++ = '+'; 1892 inShift = 1; 1893 goto encode_char; 1894 } 1895 } 1896 continue; 1897 encode_char: 1898 #ifdef Py_UNICODE_WIDE 1899 if (ch >= 0x10000) { 1900 /* code first surrogate */ 1901 base64bits += 16; 1902 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 1903 while (base64bits >= 6) { 1904 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 1905 base64bits -= 6; 1906 } 1907 /* prepare second surrogate */ 1908 ch = 0xDC00 | ((ch-0x10000) & 0x3FF); 1909 } 1910 #endif 1911 base64bits += 16; 1912 base64buffer = (base64buffer << 16) | ch; 1913 while (base64bits >= 6) { 1914 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 1915 base64bits -= 6; 1916 } 1917 } 1918 if (base64bits) 1919 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 1920 if (inShift) 1921 *out++ = '-'; 1922 1923 if (_PyString_Resize(&v, out - start)) 1924 return NULL; 1925 return v; 1926 } 1927 1928 #undef IS_BASE64 1929 #undef FROM_BASE64 1930 #undef TO_BASE64 1931 #undef DECODE_DIRECT 1932 #undef ENCODE_DIRECT 1933 1934 /* --- UTF-8 Codec -------------------------------------------------------- */ 1935 1936 static 1937 char utf8_code_length[256] = { 1938 /* Map UTF-8 encoded prefix byte to sequence length. Zero means 1939 illegal prefix. 
See RFC 3629 for details */ 1940 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 1941 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1942 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1943 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1944 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1945 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1946 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1947 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ 1948 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ 1949 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1950 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1951 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ 1952 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ 1953 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ 1954 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ 1955 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 1956 }; 1957 1958 PyObject *PyUnicode_DecodeUTF8(const char *s, 1959 Py_ssize_t size, 1960 const char *errors) 1961 { 1962 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 1963 } 1964 1965 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, 1966 Py_ssize_t size, 1967 const char *errors, 1968 Py_ssize_t *consumed) 1969 { 1970 const char *starts = s; 1971 int n; 1972 int k; 1973 Py_ssize_t startinpos; 1974 Py_ssize_t endinpos; 1975 Py_ssize_t outpos; 1976 const char *e; 1977 PyUnicodeObject *unicode; 1978 Py_UNICODE *p; 1979 const char *errmsg = ""; 1980 PyObject *errorHandler = NULL; 1981 PyObject *exc = NULL; 1982 1983 /* Note: size will always be longer than the resulting Unicode 1984 character count */ 1985 unicode = _PyUnicode_New(size); 1986 if (!unicode) 1987 return NULL; 1988 if (size == 0) { 1989 if (consumed) 1990 *consumed = 0; 1991 return (PyObject *)unicode; 1992 } 1993 1994 /* Unpack UTF-8 encoded data */ 1995 p = unicode->str; 1996 e = s + size; 1997 1998 while (s < e) { 
1999 Py_UCS4 ch = (unsigned char)*s; 2000 2001 if (ch < 0x80) { 2002 *p++ = (Py_UNICODE)ch; 2003 s++; 2004 continue; 2005 } 2006 2007 n = utf8_code_length[ch]; 2008 2009 if (s + n > e) { 2010 if (consumed) 2011 break; 2012 else { 2013 errmsg = "unexpected end of data"; 2014 startinpos = s-starts; 2015 endinpos = startinpos+1; 2016 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) 2017 endinpos++; 2018 goto utf8Error; 2019 } 2020 } 2021 2022 switch (n) { 2023 2024 case 0: 2025 errmsg = "invalid start byte"; 2026 startinpos = s-starts; 2027 endinpos = startinpos+1; 2028 goto utf8Error; 2029 2030 case 1: 2031 errmsg = "internal error"; 2032 startinpos = s-starts; 2033 endinpos = startinpos+1; 2034 goto utf8Error; 2035 2036 case 2: 2037 if ((s[1] & 0xc0) != 0x80) { 2038 errmsg = "invalid continuation byte"; 2039 startinpos = s-starts; 2040 endinpos = startinpos + 1; 2041 goto utf8Error; 2042 } 2043 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 2044 assert ((ch > 0x007F) && (ch <= 0x07FF)); 2045 *p++ = (Py_UNICODE)ch; 2046 break; 2047 2048 case 3: 2049 /* XXX: surrogates shouldn't be valid UTF-8! 2050 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 2051 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt 2052 Uncomment the 2 lines below to make them invalid, 2053 code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */ 2054 if ((s[1] & 0xc0) != 0x80 || 2055 (s[2] & 0xc0) != 0x80 || 2056 ((unsigned char)s[0] == 0xE0 && 2057 (unsigned char)s[1] < 0xA0)/* || 2058 ((unsigned char)s[0] == 0xED && 2059 (unsigned char)s[1] > 0x9F)*/) { 2060 errmsg = "invalid continuation byte"; 2061 startinpos = s-starts; 2062 endinpos = startinpos + 1; 2063 2064 /* if s[1] first two bits are 1 and 0, then the invalid 2065 continuation byte is s[2], so increment endinpos by 1, 2066 if not, s[1] is invalid and endinpos doesn't need to 2067 be incremented. 
*/ 2068 if ((s[1] & 0xC0) == 0x80) 2069 endinpos++; 2070 goto utf8Error; 2071 } 2072 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 2073 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 2074 *p++ = (Py_UNICODE)ch; 2075 break; 2076 2077 case 4: 2078 if ((s[1] & 0xc0) != 0x80 || 2079 (s[2] & 0xc0) != 0x80 || 2080 (s[3] & 0xc0) != 0x80 || 2081 ((unsigned char)s[0] == 0xF0 && 2082 (unsigned char)s[1] < 0x90) || 2083 ((unsigned char)s[0] == 0xF4 && 2084 (unsigned char)s[1] > 0x8F)) { 2085 errmsg = "invalid continuation byte"; 2086 startinpos = s-starts; 2087 endinpos = startinpos + 1; 2088 if ((s[1] & 0xC0) == 0x80) { 2089 endinpos++; 2090 if ((s[2] & 0xC0) == 0x80) 2091 endinpos++; 2092 } 2093 goto utf8Error; 2094 } 2095 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 2096 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 2097 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 2098 2099 #ifdef Py_UNICODE_WIDE 2100 *p++ = (Py_UNICODE)ch; 2101 #else 2102 /* compute and append the two surrogates: */ 2103 2104 /* translate from 10000..10FFFF to 0..FFFF */ 2105 ch -= 0x10000; 2106 2107 /* high surrogate = top 10 bits added to D800 */ 2108 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 2109 2110 /* low surrogate = bottom 10 bits added to DC00 */ 2111 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 2112 #endif 2113 break; 2114 } 2115 s += n; 2116 continue; 2117 2118 utf8Error: 2119 outpos = p-PyUnicode_AS_UNICODE(unicode); 2120 if (unicode_decode_call_errorhandler( 2121 errors, &errorHandler, 2122 "utf8", errmsg, 2123 starts, size, &startinpos, &endinpos, &exc, &s, 2124 &unicode, &outpos, &p)) 2125 goto onError; 2126 } 2127 if (consumed) 2128 *consumed = s-starts; 2129 2130 /* Adjust length */ 2131 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2132 goto onError; 2133 2134 Py_XDECREF(errorHandler); 2135 Py_XDECREF(exc); 2136 return (PyObject *)unicode; 2137 2138 onError: 2139 Py_XDECREF(errorHandler); 2140 Py_XDECREF(exc); 2141 Py_DECREF(unicode); 2142 return NULL; 2143 } 2144 
2145 /* Allocation strategy: if the string is short, convert into a stack buffer 2146 and allocate exactly as much space needed at the end. Else allocate the 2147 maximum possible needed (4 result bytes per Unicode character), and return 2148 the excess memory at the end. 2149 */ 2150 PyObject * 2151 PyUnicode_EncodeUTF8(const Py_UNICODE *s, 2152 Py_ssize_t size, 2153 const char *errors) 2154 { 2155 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 2156 2157 Py_ssize_t i; /* index into s of next input byte */ 2158 PyObject *v; /* result string object */ 2159 char *p; /* next free byte in output buffer */ 2160 Py_ssize_t nallocated; /* number of result bytes allocated */ 2161 Py_ssize_t nneeded; /* number of result bytes needed */ 2162 char stackbuf[MAX_SHORT_UNICHARS * 4]; 2163 2164 assert(s != NULL); 2165 assert(size >= 0); 2166 2167 if (size <= MAX_SHORT_UNICHARS) { 2168 /* Write into the stack buffer; nallocated can't overflow. 2169 * At the end, we'll allocate exactly as much heap space as it 2170 * turns out we need. 2171 */ 2172 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 2173 v = NULL; /* will allocate after we're done */ 2174 p = stackbuf; 2175 } 2176 else { 2177 /* Overallocate on the heap, and give the excess back at the end. */ 2178 nallocated = size * 4; 2179 if (nallocated / 4 != size) /* overflow! 
*/ 2180 return PyErr_NoMemory(); 2181 v = PyString_FromStringAndSize(NULL, nallocated); 2182 if (v == NULL) 2183 return NULL; 2184 p = PyString_AS_STRING(v); 2185 } 2186 2187 for (i = 0; i < size;) { 2188 Py_UCS4 ch = s[i++]; 2189 2190 if (ch < 0x80) 2191 /* Encode ASCII */ 2192 *p++ = (char) ch; 2193 2194 else if (ch < 0x0800) { 2195 /* Encode Latin-1 */ 2196 *p++ = (char)(0xc0 | (ch >> 6)); 2197 *p++ = (char)(0x80 | (ch & 0x3f)); 2198 } 2199 else { 2200 /* Encode UCS2 Unicode ordinals */ 2201 if (ch < 0x10000) { 2202 /* Special case: check for high surrogate */ 2203 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { 2204 Py_UCS4 ch2 = s[i]; 2205 /* Check for low surrogate and combine the two to 2206 form a UCS4 value */ 2207 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2208 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 2209 i++; 2210 goto encodeUCS4; 2211 } 2212 /* Fall through: handles isolated high surrogates */ 2213 } 2214 *p++ = (char)(0xe0 | (ch >> 12)); 2215 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2216 *p++ = (char)(0x80 | (ch & 0x3f)); 2217 continue; 2218 } 2219 encodeUCS4: 2220 /* Encode UCS4 Unicode ordinals */ 2221 *p++ = (char)(0xf0 | (ch >> 18)); 2222 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2223 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2224 *p++ = (char)(0x80 | (ch & 0x3f)); 2225 } 2226 } 2227 2228 if (v == NULL) { 2229 /* This was stack allocated. */ 2230 nneeded = p - stackbuf; 2231 assert(nneeded <= nallocated); 2232 v = PyString_FromStringAndSize(stackbuf, nneeded); 2233 } 2234 else { 2235 /* Cut back to size actually needed. 
*/ 2236 nneeded = p - PyString_AS_STRING(v); 2237 assert(nneeded <= nallocated); 2238 if (_PyString_Resize(&v, nneeded)) 2239 return NULL; 2240 } 2241 return v; 2242 2243 #undef MAX_SHORT_UNICHARS 2244 } 2245 2246 PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 2247 { 2248 if (!PyUnicode_Check(unicode)) { 2249 PyErr_BadArgument(); 2250 return NULL; 2251 } 2252 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 2253 PyUnicode_GET_SIZE(unicode), 2254 NULL); 2255 } 2256 2257 /* --- UTF-32 Codec ------------------------------------------------------- */ 2258 2259 PyObject * 2260 PyUnicode_DecodeUTF32(const char *s, 2261 Py_ssize_t size, 2262 const char *errors, 2263 int *byteorder) 2264 { 2265 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 2266 } 2267 2268 PyObject * 2269 PyUnicode_DecodeUTF32Stateful(const char *s, 2270 Py_ssize_t size, 2271 const char *errors, 2272 int *byteorder, 2273 Py_ssize_t *consumed) 2274 { 2275 const char *starts = s; 2276 Py_ssize_t startinpos; 2277 Py_ssize_t endinpos; 2278 Py_ssize_t outpos; 2279 PyUnicodeObject *unicode; 2280 Py_UNICODE *p; 2281 #ifndef Py_UNICODE_WIDE 2282 int pairs = 0; 2283 const unsigned char *qq; 2284 #else 2285 const int pairs = 0; 2286 #endif 2287 const unsigned char *q, *e; 2288 int bo = 0; /* assume native ordering by default */ 2289 const char *errmsg = ""; 2290 /* Offsets from q for retrieving bytes in the right order. */ 2291 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 2292 int iorder[] = {0, 1, 2, 3}; 2293 #else 2294 int iorder[] = {3, 2, 1, 0}; 2295 #endif 2296 PyObject *errorHandler = NULL; 2297 PyObject *exc = NULL; 2298 2299 q = (unsigned char *)s; 2300 e = q + size; 2301 2302 if (byteorder) 2303 bo = *byteorder; 2304 2305 /* Check for BOM marks (U+FEFF) in the input and adjust current 2306 byte order setting accordingly. In native mode, the leading BOM 2307 mark is skipped, in all other modes, it is copied to the output 2308 stream as-is (giving a ZWNBSP character). 
*/ 2309 if (bo == 0) { 2310 if (size >= 4) { 2311 const Py_UCS4 bom = ((unsigned int)q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2312 (q[iorder[1]] << 8) | q[iorder[0]]; 2313 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 2314 if (bom == 0x0000FEFF) { 2315 q += 4; 2316 bo = -1; 2317 } 2318 else if (bom == 0xFFFE0000) { 2319 q += 4; 2320 bo = 1; 2321 } 2322 #else 2323 if (bom == 0x0000FEFF) { 2324 q += 4; 2325 bo = 1; 2326 } 2327 else if (bom == 0xFFFE0000) { 2328 q += 4; 2329 bo = -1; 2330 } 2331 #endif 2332 } 2333 } 2334 2335 if (bo == -1) { 2336 /* force LE */ 2337 iorder[0] = 0; 2338 iorder[1] = 1; 2339 iorder[2] = 2; 2340 iorder[3] = 3; 2341 } 2342 else if (bo == 1) { 2343 /* force BE */ 2344 iorder[0] = 3; 2345 iorder[1] = 2; 2346 iorder[2] = 1; 2347 iorder[3] = 0; 2348 } 2349 2350 /* On narrow builds we split characters outside the BMP into two 2351 code points => count how much extra space we need. */ 2352 #ifndef Py_UNICODE_WIDE 2353 for (qq = q; e - qq >= 4; qq += 4) 2354 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0) 2355 pairs++; 2356 #endif 2357 2358 /* This might be one to much, because of a BOM */ 2359 unicode = _PyUnicode_New((size+3)/4+pairs); 2360 if (!unicode) 2361 return NULL; 2362 if (size == 0) 2363 return (PyObject *)unicode; 2364 2365 /* Unpack UTF-32 encoded data */ 2366 p = unicode->str; 2367 2368 while (q < e) { 2369 Py_UCS4 ch; 2370 /* remaining bytes at the end? 
(size should be divisible by 4) */ 2371 if (e-q<4) { 2372 if (consumed) 2373 break; 2374 errmsg = "truncated data"; 2375 startinpos = ((const char *)q)-starts; 2376 endinpos = ((const char *)e)-starts; 2377 goto utf32Error; 2378 /* The remaining input chars are ignored if the callback 2379 chooses to skip the input */ 2380 } 2381 ch = ((unsigned int)q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2382 (q[iorder[1]] << 8) | q[iorder[0]]; 2383 2384 if (ch >= 0x110000) 2385 { 2386 errmsg = "code point not in range(0x110000)"; 2387 startinpos = ((const char *)q)-starts; 2388 endinpos = startinpos+4; 2389 goto utf32Error; 2390 } 2391 #ifndef Py_UNICODE_WIDE 2392 if (ch >= 0x10000) 2393 { 2394 *p++ = 0xD800 | ((ch-0x10000) >> 10); 2395 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 2396 } 2397 else 2398 #endif 2399 *p++ = ch; 2400 q += 4; 2401 continue; 2402 utf32Error: 2403 outpos = p-PyUnicode_AS_UNICODE(unicode); 2404 if (unicode_decode_call_errorhandler( 2405 errors, &errorHandler, 2406 "utf32", errmsg, 2407 starts, size, &startinpos, &endinpos, &exc, (const char **)&q, 2408 &unicode, &outpos, &p)) 2409 goto onError; 2410 } 2411 2412 if (byteorder) 2413 *byteorder = bo; 2414 2415 if (consumed) 2416 *consumed = (const char *)q-starts; 2417 2418 /* Adjust length */ 2419 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2420 goto onError; 2421 2422 Py_XDECREF(errorHandler); 2423 Py_XDECREF(exc); 2424 return (PyObject *)unicode; 2425 2426 onError: 2427 Py_DECREF(unicode); 2428 Py_XDECREF(errorHandler); 2429 Py_XDECREF(exc); 2430 return NULL; 2431 } 2432 2433 PyObject * 2434 PyUnicode_EncodeUTF32(const Py_UNICODE *s, 2435 Py_ssize_t size, 2436 const char *errors, 2437 int byteorder) 2438 { 2439 PyObject *v; 2440 unsigned char *p; 2441 Py_ssize_t nsize, bytesize; 2442 #ifndef Py_UNICODE_WIDE 2443 Py_ssize_t i, pairs; 2444 #else 2445 const int pairs = 0; 2446 #endif 2447 /* Offsets from p for storing byte pairs in the right order. 
*/ 2448 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 2449 int iorder[] = {0, 1, 2, 3}; 2450 #else 2451 int iorder[] = {3, 2, 1, 0}; 2452 #endif 2453 2454 #define STORECHAR(CH) \ 2455 do { \ 2456 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 2457 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 2458 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 2459 p[iorder[0]] = (CH) & 0xff; \ 2460 p += 4; \ 2461 } while(0) 2462 2463 /* In narrow builds we can output surrogate pairs as one code point, 2464 so we need less space. */ 2465 #ifndef Py_UNICODE_WIDE 2466 for (i = pairs = 0; i < size-1; i++) 2467 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 2468 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 2469 pairs++; 2470 #endif 2471 nsize = (size - pairs + (byteorder == 0)); 2472 bytesize = nsize * 4; 2473 if (bytesize / 4 != nsize) 2474 return PyErr_NoMemory(); 2475 v = PyString_FromStringAndSize(NULL, bytesize); 2476 if (v == NULL) 2477 return NULL; 2478 2479 p = (unsigned char *)PyString_AS_STRING(v); 2480 if (byteorder == 0) 2481 STORECHAR(0xFEFF); 2482 if (size == 0) 2483 return v; 2484 2485 if (byteorder == -1) { 2486 /* force LE */ 2487 iorder[0] = 0; 2488 iorder[1] = 1; 2489 iorder[2] = 2; 2490 iorder[3] = 3; 2491 } 2492 else if (byteorder == 1) { 2493 /* force BE */ 2494 iorder[0] = 3; 2495 iorder[1] = 2; 2496 iorder[2] = 1; 2497 iorder[3] = 0; 2498 } 2499 2500 while (size-- > 0) { 2501 Py_UCS4 ch = *s++; 2502 #ifndef Py_UNICODE_WIDE 2503 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 2504 Py_UCS4 ch2 = *s; 2505 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2506 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2507 s++; 2508 size--; 2509 } 2510 } 2511 #endif 2512 STORECHAR(ch); 2513 } 2514 return v; 2515 #undef STORECHAR 2516 } 2517 2518 PyObject *PyUnicode_AsUTF32String(PyObject *unicode) 2519 { 2520 if (!PyUnicode_Check(unicode)) { 2521 PyErr_BadArgument(); 2522 return NULL; 2523 } 2524 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 2525 PyUnicode_GET_SIZE(unicode), 2526 NULL, 2527 0); 2528 } 2529 2530 /* --- 
UTF-16 Codec ------------------------------------------------------- */ 2531 2532 PyObject * 2533 PyUnicode_DecodeUTF16(const char *s, 2534 Py_ssize_t size, 2535 const char *errors, 2536 int *byteorder) 2537 { 2538 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 2539 } 2540 2541 PyObject * 2542 PyUnicode_DecodeUTF16Stateful(const char *s, 2543 Py_ssize_t size, 2544 const char *errors, 2545 int *byteorder, 2546 Py_ssize_t *consumed) 2547 { 2548 const char *starts = s; 2549 Py_ssize_t startinpos; 2550 Py_ssize_t endinpos; 2551 Py_ssize_t outpos; 2552 PyUnicodeObject *unicode; 2553 Py_UNICODE *p; 2554 const unsigned char *q, *e; 2555 int bo = 0; /* assume native ordering by default */ 2556 const char *errmsg = ""; 2557 /* Offsets from q for retrieving byte pairs in the right order. */ 2558 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 2559 int ihi = 1, ilo = 0; 2560 #else 2561 int ihi = 0, ilo = 1; 2562 #endif 2563 PyObject *errorHandler = NULL; 2564 PyObject *exc = NULL; 2565 2566 /* Note: size will always be longer than the resulting Unicode 2567 character count */ 2568 unicode = _PyUnicode_New(size); 2569 if (!unicode) 2570 return NULL; 2571 if (size == 0) 2572 return (PyObject *)unicode; 2573 2574 /* Unpack UTF-16 encoded data */ 2575 p = unicode->str; 2576 q = (unsigned char *)s; 2577 e = q + size; 2578 2579 if (byteorder) 2580 bo = *byteorder; 2581 2582 /* Check for BOM marks (U+FEFF) in the input and adjust current 2583 byte order setting accordingly. In native mode, the leading BOM 2584 mark is skipped, in all other modes, it is copied to the output 2585 stream as-is (giving a ZWNBSP character). 
*/ 2586 if (bo == 0) { 2587 if (size >= 2) { 2588 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 2589 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 2590 if (bom == 0xFEFF) { 2591 q += 2; 2592 bo = -1; 2593 } 2594 else if (bom == 0xFFFE) { 2595 q += 2; 2596 bo = 1; 2597 } 2598 #else 2599 if (bom == 0xFEFF) { 2600 q += 2; 2601 bo = 1; 2602 } 2603 else if (bom == 0xFFFE) { 2604 q += 2; 2605 bo = -1; 2606 } 2607 #endif 2608 } 2609 } 2610 2611 if (bo == -1) { 2612 /* force LE */ 2613 ihi = 1; 2614 ilo = 0; 2615 } 2616 else if (bo == 1) { 2617 /* force BE */ 2618 ihi = 0; 2619 ilo = 1; 2620 } 2621 2622 while (q < e) { 2623 Py_UNICODE ch; 2624 /* remaining bytes at the end? (size should be even) */ 2625 if (e-q<2) { 2626 if (consumed) 2627 break; 2628 errmsg = "truncated data"; 2629 startinpos = ((const char *)q)-starts; 2630 endinpos = ((const char *)e)-starts; 2631 goto utf16Error; 2632 /* The remaining input chars are ignored if the callback 2633 chooses to skip the input */ 2634 } 2635 ch = (q[ihi] << 8) | q[ilo]; 2636 2637 q += 2; 2638 2639 if (ch < 0xD800 || ch > 0xDFFF) { 2640 *p++ = ch; 2641 continue; 2642 } 2643 2644 /* UTF-16 code pair: */ 2645 if (e - q < 2) { 2646 q -= 2; 2647 if (consumed) 2648 break; 2649 errmsg = "unexpected end of data"; 2650 startinpos = ((const char *)q)-starts; 2651 endinpos = ((const char *)e)-starts; 2652 goto utf16Error; 2653 } 2654 if (0xD800 <= ch && ch <= 0xDBFF) { 2655 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 2656 q += 2; 2657 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2658 #ifndef Py_UNICODE_WIDE 2659 *p++ = ch; 2660 *p++ = ch2; 2661 #else 2662 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2663 #endif 2664 continue; 2665 } 2666 else { 2667 errmsg = "illegal UTF-16 surrogate"; 2668 startinpos = (((const char *)q)-4)-starts; 2669 endinpos = startinpos+2; 2670 goto utf16Error; 2671 } 2672 2673 } 2674 errmsg = "illegal encoding"; 2675 startinpos = (((const char *)q)-2)-starts; 2676 endinpos = startinpos+2; 2677 /* Fall through to report the 
error */ 2678 2679 utf16Error: 2680 outpos = p-PyUnicode_AS_UNICODE(unicode); 2681 if (unicode_decode_call_errorhandler( 2682 errors, &errorHandler, 2683 "utf16", errmsg, 2684 starts, size, &startinpos, &endinpos, &exc, (const char **)&q, 2685 &unicode, &outpos, &p)) 2686 goto onError; 2687 } 2688 2689 if (byteorder) 2690 *byteorder = bo; 2691 2692 if (consumed) 2693 *consumed = (const char *)q-starts; 2694 2695 /* Adjust length */ 2696 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2697 goto onError; 2698 2699 Py_XDECREF(errorHandler); 2700 Py_XDECREF(exc); 2701 return (PyObject *)unicode; 2702 2703 onError: 2704 Py_DECREF(unicode); 2705 Py_XDECREF(errorHandler); 2706 Py_XDECREF(exc); 2707 return NULL; 2708 } 2709 2710 PyObject * 2711 PyUnicode_EncodeUTF16(const Py_UNICODE *s, 2712 Py_ssize_t size, 2713 const char *errors, 2714 int byteorder) 2715 { 2716 PyObject *v; 2717 unsigned char *p; 2718 Py_ssize_t nsize, bytesize; 2719 #ifdef Py_UNICODE_WIDE 2720 Py_ssize_t i, pairs; 2721 #else 2722 const int pairs = 0; 2723 #endif 2724 /* Offsets from p for storing byte pairs in the right order. 
*/ 2725 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 2726 int ihi = 1, ilo = 0; 2727 #else 2728 int ihi = 0, ilo = 1; 2729 #endif 2730 2731 #define STORECHAR(CH) \ 2732 do { \ 2733 p[ihi] = ((CH) >> 8) & 0xff; \ 2734 p[ilo] = (CH) & 0xff; \ 2735 p += 2; \ 2736 } while(0) 2737 2738 #ifdef Py_UNICODE_WIDE 2739 for (i = pairs = 0; i < size; i++) 2740 if (s[i] >= 0x10000) 2741 pairs++; 2742 #endif 2743 /* 2 * (size + pairs + (byteorder == 0)) */ 2744 if (size > PY_SSIZE_T_MAX || 2745 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 2746 return PyErr_NoMemory(); 2747 nsize = size + pairs + (byteorder == 0); 2748 bytesize = nsize * 2; 2749 if (bytesize / 2 != nsize) 2750 return PyErr_NoMemory(); 2751 v = PyString_FromStringAndSize(NULL, bytesize); 2752 if (v == NULL) 2753 return NULL; 2754 2755 p = (unsigned char *)PyString_AS_STRING(v); 2756 if (byteorder == 0) 2757 STORECHAR(0xFEFF); 2758 if (size == 0) 2759 return v; 2760 2761 if (byteorder == -1) { 2762 /* force LE */ 2763 ihi = 1; 2764 ilo = 0; 2765 } 2766 else if (byteorder == 1) { 2767 /* force BE */ 2768 ihi = 0; 2769 ilo = 1; 2770 } 2771 2772 while (size-- > 0) { 2773 Py_UNICODE ch = *s++; 2774 Py_UNICODE ch2 = 0; 2775 #ifdef Py_UNICODE_WIDE 2776 if (ch >= 0x10000) { 2777 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 2778 ch = 0xD800 | ((ch-0x10000) >> 10); 2779 } 2780 #endif 2781 STORECHAR(ch); 2782 if (ch2) 2783 STORECHAR(ch2); 2784 } 2785 return v; 2786 #undef STORECHAR 2787 } 2788 2789 PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 2790 { 2791 if (!PyUnicode_Check(unicode)) { 2792 PyErr_BadArgument(); 2793 return NULL; 2794 } 2795 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 2796 PyUnicode_GET_SIZE(unicode), 2797 NULL, 2798 0); 2799 } 2800 2801 /* --- Unicode Escape Codec ----------------------------------------------- */ 2802 2803 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 2804 2805 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 2806 Py_ssize_t size, 2807 const char *errors) 2808 { 2809 
const char *starts = s; 2810 Py_ssize_t startinpos; 2811 Py_ssize_t endinpos; 2812 Py_ssize_t outpos; 2813 PyUnicodeObject *v; 2814 Py_UNICODE *p; 2815 const char *end; 2816 char* message; 2817 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 2818 PyObject *errorHandler = NULL; 2819 PyObject *exc = NULL; 2820 2821 /* Escaped strings will always be longer than the resulting 2822 Unicode string, so we start with size here and then reduce the 2823 length after conversion to the true value. 2824 (but if the error callback returns a long replacement string 2825 we'll have to allocate more space) */ 2826 v = _PyUnicode_New(size); 2827 if (v == NULL) 2828 goto onError; 2829 if (size == 0) 2830 return (PyObject *)v; 2831 2832 p = PyUnicode_AS_UNICODE(v); 2833 end = s + size; 2834 2835 while (s < end) { 2836 unsigned char c; 2837 Py_UNICODE x; 2838 int digits; 2839 2840 /* Non-escape characters are interpreted as Unicode ordinals */ 2841 if (*s != '\\') { 2842 *p++ = (unsigned char) *s++; 2843 continue; 2844 } 2845 2846 startinpos = s-starts; 2847 /* \ - Escapes */ 2848 s++; 2849 c = *s++; 2850 if (s > end) 2851 c = '\0'; /* Invalid after \ */ 2852 switch (c) { 2853 2854 /* \x escapes */ 2855 case '\n': break; 2856 case '\\': *p++ = '\\'; break; 2857 case '\'': *p++ = '\''; break; 2858 case '\"': *p++ = '\"'; break; 2859 case 'b': *p++ = '\b'; break; 2860 case 'f': *p++ = '\014'; break; /* FF */ 2861 case 't': *p++ = '\t'; break; 2862 case 'n': *p++ = '\n'; break; 2863 case 'r': *p++ = '\r'; break; 2864 case 'v': *p++ = '\013'; break; /* VT */ 2865 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 2866 2867 /* \OOO (octal) escapes */ 2868 case '0': case '1': case '2': case '3': 2869 case '4': case '5': case '6': case '7': 2870 x = s[-1] - '0'; 2871 if (s < end && '0' <= *s && *s <= '7') { 2872 x = (x<<3) + *s++ - '0'; 2873 if (s < end && '0' <= *s && *s <= '7') 2874 x = (x<<3) + *s++ - '0'; 2875 } 2876 *p++ = x; 2877 break; 2878 2879 /* hex escapes */ 2880 
/* \xXX */ 2881 case 'x': 2882 digits = 2; 2883 message = "truncated \\xXX escape"; 2884 goto hexescape; 2885 2886 /* \uXXXX */ 2887 case 'u': 2888 digits = 4; 2889 message = "truncated \\uXXXX escape"; 2890 goto hexescape; 2891 2892 /* \UXXXXXXXX */ 2893 case 'U': 2894 digits = 8; 2895 message = "truncated \\UXXXXXXXX escape"; 2896 hexescape: 2897 chr = 0; 2898 if (end - s < digits) { 2899 /* count only hex digits */ 2900 for (; s < end; ++s) { 2901 c = (unsigned char)*s; 2902 if (!Py_ISXDIGIT(c)) 2903 goto error; 2904 } 2905 goto error; 2906 } 2907 for (; digits--; ++s) { 2908 c = (unsigned char)*s; 2909 if (!Py_ISXDIGIT(c)) 2910 goto error; 2911 chr = (chr<<4) & ~0xF; 2912 if (c >= '0' && c <= '9') 2913 chr += c - '0'; 2914 else if (c >= 'a' && c <= 'f') 2915 chr += 10 + c - 'a'; 2916 else 2917 chr += 10 + c - 'A'; 2918 } 2919 if (chr == 0xffffffff && PyErr_Occurred()) 2920 /* _decoding_error will have already written into the 2921 target buffer. */ 2922 break; 2923 store: 2924 /* when we get here, chr is a 32-bit unicode character */ 2925 if (chr <= 0xffff) 2926 /* UCS-2 character */ 2927 *p++ = (Py_UNICODE) chr; 2928 else if (chr <= 0x10ffff) { 2929 /* UCS-4 character. Either store directly, or as 2930 surrogate pair. 
*/ 2931 #ifdef Py_UNICODE_WIDE 2932 *p++ = chr; 2933 #else 2934 chr -= 0x10000L; 2935 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 2936 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 2937 #endif 2938 } else { 2939 message = "illegal Unicode character"; 2940 goto error; 2941 } 2942 break; 2943 2944 /* \N{name} */ 2945 case 'N': 2946 message = "malformed \\N character escape"; 2947 if (ucnhash_CAPI == NULL) { 2948 /* load the unicode data module */ 2949 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1); 2950 if (ucnhash_CAPI == NULL) 2951 goto ucnhashError; 2952 } 2953 if (*s == '{') { 2954 const char *start = s+1; 2955 /* look for the closing brace */ 2956 while (*s != '}' && s < end) 2957 s++; 2958 if (s > start && s < end && *s == '}') { 2959 /* found a name. look it up in the unicode database */ 2960 message = "unknown Unicode character name"; 2961 s++; 2962 if (s - start - 1 <= INT_MAX && 2963 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 2964 goto store; 2965 } 2966 } 2967 goto error; 2968 2969 default: 2970 if (s > end) { 2971 message = "\\ at end of string"; 2972 s--; 2973 goto error; 2974 } 2975 else { 2976 *p++ = '\\'; 2977 *p++ = (unsigned char)s[-1]; 2978 } 2979 break; 2980 } 2981 continue; 2982 2983 error: 2984 endinpos = s-starts; 2985 outpos = p-PyUnicode_AS_UNICODE(v); 2986 if (unicode_decode_call_errorhandler( 2987 errors, &errorHandler, 2988 "unicodeescape", message, 2989 starts, size, &startinpos, &endinpos, &exc, &s, 2990 &v, &outpos, &p)) 2991 goto onError; 2992 continue; 2993 } 2994 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 2995 goto onError; 2996 Py_XDECREF(errorHandler); 2997 Py_XDECREF(exc); 2998 return (PyObject *)v; 2999 3000 ucnhashError: 3001 PyErr_SetString( 3002 PyExc_UnicodeError, 3003 "\\N escapes not supported (can't load unicodedata module)" 3004 ); 3005 Py_XDECREF(v); 3006 Py_XDECREF(errorHandler); 3007 Py_XDECREF(exc); 3008 return NULL; 3009 3010 onError: 3011 
Py_XDECREF(v); 3012 Py_XDECREF(errorHandler); 3013 Py_XDECREF(exc); 3014 return NULL; 3015 } 3016 3017 /* Return a Unicode-Escape string version of the Unicode object. 3018 3019 If quotes is true, the string is enclosed in u"" or u'' quotes as 3020 appropriate. 3021 3022 */ 3023 3024 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, 3025 Py_ssize_t size, 3026 Py_UNICODE ch) 3027 { 3028 /* like wcschr, but doesn't stop at NULL characters */ 3029 3030 while (size-- > 0) { 3031 if (*s == ch) 3032 return s; 3033 s++; 3034 } 3035 3036 return NULL; 3037 } 3038 3039 static 3040 PyObject *unicodeescape_string(const Py_UNICODE *s, 3041 Py_ssize_t size, 3042 int quotes) 3043 { 3044 PyObject *repr; 3045 char *p; 3046 3047 static const char *hexdigit = "0123456789abcdef"; 3048 #ifdef Py_UNICODE_WIDE 3049 const Py_ssize_t expandsize = 10; 3050 #else 3051 const Py_ssize_t expandsize = 6; 3052 #endif 3053 3054 /* XXX(nnorwitz): rather than over-allocating, it would be 3055 better to choose a different scheme. Perhaps scan the 3056 first N-chars of the string and allocate based on that size. 3057 */ 3058 /* Initial allocation is based on the longest-possible unichr 3059 escape. 3060 3061 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 3062 unichr, so in this case it's the longest unichr escape. In 3063 narrow (UTF-16) builds this is five chars per source unichr 3064 since there are two unichrs in the surrogate pair, so in narrow 3065 (UTF-16) builds it's not the longest unichr escape. 3066 3067 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 3068 so in the narrow (UTF-16) build case it's the longest unichr 3069 escape. 
3070 */ 3071 3072 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 3073 return PyErr_NoMemory(); 3074 3075 repr = PyString_FromStringAndSize(NULL, 3076 2 3077 + expandsize*size 3078 + 1); 3079 if (repr == NULL) 3080 return NULL; 3081 3082 p = PyString_AS_STRING(repr); 3083 3084 if (quotes) { 3085 *p++ = 'u'; 3086 *p++ = (findchar(s, size, '\'') && 3087 !findchar(s, size, '"')) ? '"' : '\''; 3088 } 3089 while (size-- > 0) { 3090 Py_UNICODE ch = *s++; 3091 3092 /* Escape quotes and backslashes */ 3093 if ((quotes && 3094 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') { 3095 *p++ = '\\'; 3096 *p++ = (char) ch; 3097 continue; 3098 } 3099 3100 #ifdef Py_UNICODE_WIDE 3101 /* Map 21-bit characters to '\U00xxxxxx' */ 3102 else if (ch >= 0x10000) { 3103 *p++ = '\\'; 3104 *p++ = 'U'; 3105 *p++ = hexdigit[(ch >> 28) & 0x0000000F]; 3106 *p++ = hexdigit[(ch >> 24) & 0x0000000F]; 3107 *p++ = hexdigit[(ch >> 20) & 0x0000000F]; 3108 *p++ = hexdigit[(ch >> 16) & 0x0000000F]; 3109 *p++ = hexdigit[(ch >> 12) & 0x0000000F]; 3110 *p++ = hexdigit[(ch >> 8) & 0x0000000F]; 3111 *p++ = hexdigit[(ch >> 4) & 0x0000000F]; 3112 *p++ = hexdigit[ch & 0x0000000F]; 3113 continue; 3114 } 3115 #else 3116 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 3117 else if (ch >= 0xD800 && ch < 0xDC00) { 3118 Py_UNICODE ch2; 3119 Py_UCS4 ucs; 3120 3121 ch2 = *s++; 3122 size--; 3123 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 3124 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3125 *p++ = '\\'; 3126 *p++ = 'U'; 3127 *p++ = hexdigit[(ucs >> 28) & 0x0000000F]; 3128 *p++ = hexdigit[(ucs >> 24) & 0x0000000F]; 3129 *p++ = hexdigit[(ucs >> 20) & 0x0000000F]; 3130 *p++ = hexdigit[(ucs >> 16) & 0x0000000F]; 3131 *p++ = hexdigit[(ucs >> 12) & 0x0000000F]; 3132 *p++ = hexdigit[(ucs >> 8) & 0x0000000F]; 3133 *p++ = hexdigit[(ucs >> 4) & 0x0000000F]; 3134 *p++ = hexdigit[ucs & 0x0000000F]; 3135 continue; 3136 } 3137 /* Fall through: isolated surrogates are copied as-is */ 3138 s--; 3139 size++; 
3140 } 3141 #endif 3142 3143 /* Map 16-bit characters to '\uxxxx' */ 3144 if (ch >= 256) { 3145 *p++ = '\\'; 3146 *p++ = 'u'; 3147 *p++ = hexdigit[(ch >> 12) & 0x000F]; 3148 *p++ = hexdigit[(ch >> 8) & 0x000F]; 3149 *p++ = hexdigit[(ch >> 4) & 0x000F]; 3150 *p++ = hexdigit[ch & 0x000F]; 3151 } 3152 3153 /* Map special whitespace to '\t', \n', '\r' */ 3154 else if (ch == '\t') { 3155 *p++ = '\\'; 3156 *p++ = 't'; 3157 } 3158 else if (ch == '\n') { 3159 *p++ = '\\'; 3160 *p++ = 'n'; 3161 } 3162 else if (ch == '\r') { 3163 *p++ = '\\'; 3164 *p++ = 'r'; 3165 } 3166 3167 /* Map non-printable US ASCII to '\xhh' */ 3168 else if (ch < ' ' || ch >= 0x7F) { 3169 *p++ = '\\'; 3170 *p++ = 'x'; 3171 *p++ = hexdigit[(ch >> 4) & 0x000F]; 3172 *p++ = hexdigit[ch & 0x000F]; 3173 } 3174 3175 /* Copy everything else as-is */ 3176 else 3177 *p++ = (char) ch; 3178 } 3179 if (quotes) 3180 *p++ = PyString_AS_STRING(repr)[1]; 3181 3182 *p = '\0'; 3183 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr))) 3184 return NULL; 3185 return repr; 3186 } 3187 3188 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 3189 Py_ssize_t size) 3190 { 3191 return unicodeescape_string(s, size, 0); 3192 } 3193 3194 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 3195 { 3196 if (!PyUnicode_Check(unicode)) { 3197 PyErr_BadArgument(); 3198 return NULL; 3199 } 3200 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3201 PyUnicode_GET_SIZE(unicode)); 3202 } 3203 3204 /* --- Raw Unicode Escape Codec ------------------------------------------- */ 3205 3206 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 3207 Py_ssize_t size, 3208 const char *errors) 3209 { 3210 const char *starts = s; 3211 Py_ssize_t startinpos; 3212 Py_ssize_t endinpos; 3213 Py_ssize_t outpos; 3214 PyUnicodeObject *v; 3215 Py_UNICODE *p; 3216 const char *end; 3217 const char *bs; 3218 PyObject *errorHandler = NULL; 3219 PyObject *exc = NULL; 3220 3221 /* Escaped strings will always be longer 
than the resulting 3222 Unicode string, so we start with size here and then reduce the 3223 length after conversion to the true value. (But decoding error 3224 handler might have to resize the string) */ 3225 v = _PyUnicode_New(size); 3226 if (v == NULL) 3227 goto onError; 3228 if (size == 0) 3229 return (PyObject *)v; 3230 p = PyUnicode_AS_UNICODE(v); 3231 end = s + size; 3232 while (s < end) { 3233 unsigned char c; 3234 Py_UCS4 x; 3235 int i; 3236 int count; 3237 3238 /* Non-escape characters are interpreted as Unicode ordinals */ 3239 if (*s != '\\') { 3240 *p++ = (unsigned char)*s++; 3241 continue; 3242 } 3243 startinpos = s-starts; 3244 3245 /* \u-escapes are only interpreted iff the number of leading 3246 backslashes if odd */ 3247 bs = s; 3248 for (;s < end;) { 3249 if (*s != '\\') 3250 break; 3251 *p++ = (unsigned char)*s++; 3252 } 3253 if (((s - bs) & 1) == 0 || 3254 s >= end || 3255 (*s != 'u' && *s != 'U')) { 3256 continue; 3257 } 3258 p--; 3259 count = *s=='u' ? 4 : 8; 3260 s++; 3261 3262 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 3263 outpos = p-PyUnicode_AS_UNICODE(v); 3264 for (x = 0, i = 0; i < count; ++i, ++s) { 3265 c = (unsigned char)*s; 3266 if (!isxdigit(c)) { 3267 endinpos = s-starts; 3268 if (unicode_decode_call_errorhandler( 3269 errors, &errorHandler, 3270 "rawunicodeescape", "truncated \\uXXXX", 3271 starts, size, &startinpos, &endinpos, &exc, &s, 3272 &v, &outpos, &p)) 3273 goto onError; 3274 goto nextByte; 3275 } 3276 x = (x<<4) & ~0xF; 3277 if (c >= '0' && c <= '9') 3278 x += c - '0'; 3279 else if (c >= 'a' && c <= 'f') 3280 x += 10 + c - 'a'; 3281 else 3282 x += 10 + c - 'A'; 3283 } 3284 if (x <= 0xffff) 3285 /* UCS-2 character */ 3286 *p++ = (Py_UNICODE) x; 3287 else if (x <= 0x10ffff) { 3288 /* UCS-4 character. Either store directly, or as 3289 surrogate pair. 
*/ 3290 #ifdef Py_UNICODE_WIDE 3291 *p++ = (Py_UNICODE) x; 3292 #else 3293 x -= 0x10000L; 3294 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 3295 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 3296 #endif 3297 } else { 3298 endinpos = s-starts; 3299 outpos = p-PyUnicode_AS_UNICODE(v); 3300 if (unicode_decode_call_errorhandler( 3301 errors, &errorHandler, 3302 "rawunicodeescape", "\\Uxxxxxxxx out of range", 3303 starts, size, &startinpos, &endinpos, &exc, &s, 3304 &v, &outpos, &p)) 3305 goto onError; 3306 } 3307 nextByte: 3308 ; 3309 } 3310 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3311 goto onError; 3312 Py_XDECREF(errorHandler); 3313 Py_XDECREF(exc); 3314 return (PyObject *)v; 3315 3316 onError: 3317 Py_XDECREF(v); 3318 Py_XDECREF(errorHandler); 3319 Py_XDECREF(exc); 3320 return NULL; 3321 } 3322 3323 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 3324 Py_ssize_t size) 3325 { 3326 PyObject *repr; 3327 char *p; 3328 char *q; 3329 3330 static const char *hexdigit = "0123456789abcdef"; 3331 #ifdef Py_UNICODE_WIDE 3332 const Py_ssize_t expandsize = 10; 3333 #else 3334 const Py_ssize_t expandsize = 6; 3335 #endif 3336 3337 if (size > PY_SSIZE_T_MAX / expandsize) 3338 return PyErr_NoMemory(); 3339 3340 repr = PyString_FromStringAndSize(NULL, expandsize * size); 3341 if (repr == NULL) 3342 return NULL; 3343 if (size == 0) 3344 return repr; 3345 3346 p = q = PyString_AS_STRING(repr); 3347 while (size-- > 0) { 3348 Py_UNICODE ch = *s++; 3349 #ifdef Py_UNICODE_WIDE 3350 /* Map 32-bit characters to '\Uxxxxxxxx' */ 3351 if (ch >= 0x10000) { 3352 *p++ = '\\'; 3353 *p++ = 'U'; 3354 *p++ = hexdigit[(ch >> 28) & 0xf]; 3355 *p++ = hexdigit[(ch >> 24) & 0xf]; 3356 *p++ = hexdigit[(ch >> 20) & 0xf]; 3357 *p++ = hexdigit[(ch >> 16) & 0xf]; 3358 *p++ = hexdigit[(ch >> 12) & 0xf]; 3359 *p++ = hexdigit[(ch >> 8) & 0xf]; 3360 *p++ = hexdigit[(ch >> 4) & 0xf]; 3361 *p++ = hexdigit[ch & 15]; 3362 } 3363 else 3364 #else 3365 /* Map UTF-16 surrogate pairs to 
'\U00xxxxxx' */ 3366 if (ch >= 0xD800 && ch < 0xDC00) { 3367 Py_UNICODE ch2; 3368 Py_UCS4 ucs; 3369 3370 ch2 = *s++; 3371 size--; 3372 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 3373 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3374 *p++ = '\\'; 3375 *p++ = 'U'; 3376 *p++ = hexdigit[(ucs >> 28) & 0xf]; 3377 *p++ = hexdigit[(ucs >> 24) & 0xf]; 3378 *p++ = hexdigit[(ucs >> 20) & 0xf]; 3379 *p++ = hexdigit[(ucs >> 16) & 0xf]; 3380 *p++ = hexdigit[(ucs >> 12) & 0xf]; 3381 *p++ = hexdigit[(ucs >> 8) & 0xf]; 3382 *p++ = hexdigit[(ucs >> 4) & 0xf]; 3383 *p++ = hexdigit[ucs & 0xf]; 3384 continue; 3385 } 3386 /* Fall through: isolated surrogates are copied as-is */ 3387 s--; 3388 size++; 3389 } 3390 #endif 3391 /* Map 16-bit characters to '\uxxxx' */ 3392 if (ch >= 256) { 3393 *p++ = '\\'; 3394 *p++ = 'u'; 3395 *p++ = hexdigit[(ch >> 12) & 0xf]; 3396 *p++ = hexdigit[(ch >> 8) & 0xf]; 3397 *p++ = hexdigit[(ch >> 4) & 0xf]; 3398 *p++ = hexdigit[ch & 15]; 3399 } 3400 /* Copy everything else as-is */ 3401 else 3402 *p++ = (char) ch; 3403 } 3404 *p = '\0'; 3405 if (_PyString_Resize(&repr, p - q)) 3406 return NULL; 3407 return repr; 3408 } 3409 3410 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 3411 { 3412 if (!PyUnicode_Check(unicode)) { 3413 PyErr_BadArgument(); 3414 return NULL; 3415 } 3416 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3417 PyUnicode_GET_SIZE(unicode)); 3418 } 3419 3420 /* --- Unicode Internal Codec ------------------------------------------- */ 3421 3422 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, 3423 Py_ssize_t size, 3424 const char *errors) 3425 { 3426 const char *starts = s; 3427 Py_ssize_t startinpos; 3428 Py_ssize_t endinpos; 3429 Py_ssize_t outpos; 3430 PyUnicodeObject *v; 3431 Py_UNICODE *p; 3432 const char *end; 3433 const char *reason; 3434 PyObject *errorHandler = NULL; 3435 PyObject *exc = NULL; 3436 3437 #ifdef Py_UNICODE_WIDE 3438 Py_UNICODE unimax = PyUnicode_GetMax(); 3439 
#endif 3440 3441 /* XXX overflow detection missing */ 3442 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 3443 if (v == NULL) 3444 goto onError; 3445 if (PyUnicode_GetSize((PyObject *)v) == 0) 3446 return (PyObject *)v; 3447 p = PyUnicode_AS_UNICODE(v); 3448 end = s + size; 3449 3450 while (s < end) { 3451 if (end-s < Py_UNICODE_SIZE) { 3452 endinpos = end-starts; 3453 reason = "truncated input"; 3454 goto error; 3455 } 3456 memcpy(p, s, sizeof(Py_UNICODE)); 3457 #ifdef Py_UNICODE_WIDE 3458 /* We have to sanity check the raw data, otherwise doom looms for 3459 some malformed UCS-4 data. */ 3460 if (*p > unimax || *p < 0) { 3461 endinpos = s - starts + Py_UNICODE_SIZE; 3462 reason = "illegal code point (> 0x10FFFF)"; 3463 goto error; 3464 } 3465 #endif 3466 p++; 3467 s += Py_UNICODE_SIZE; 3468 continue; 3469 3470 error: 3471 startinpos = s - starts; 3472 outpos = p - PyUnicode_AS_UNICODE(v); 3473 if (unicode_decode_call_errorhandler( 3474 errors, &errorHandler, 3475 "unicode_internal", reason, 3476 starts, size, &startinpos, &endinpos, &exc, &s, 3477 &v, &outpos, &p)) { 3478 goto onError; 3479 } 3480 } 3481 3482 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3483 goto onError; 3484 Py_XDECREF(errorHandler); 3485 Py_XDECREF(exc); 3486 return (PyObject *)v; 3487 3488 onError: 3489 Py_XDECREF(v); 3490 Py_XDECREF(errorHandler); 3491 Py_XDECREF(exc); 3492 return NULL; 3493 } 3494 3495 /* --- Latin-1 Codec ------------------------------------------------------ */ 3496 3497 PyObject *PyUnicode_DecodeLatin1(const char *s, 3498 Py_ssize_t size, 3499 const char *errors) 3500 { 3501 PyUnicodeObject *v; 3502 Py_UNICODE *p; 3503 3504 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 3505 if (size == 1) { 3506 Py_UNICODE r = *(unsigned char*)s; 3507 return PyUnicode_FromUnicode(&r, 1); 3508 } 3509 3510 v = _PyUnicode_New(size); 3511 if (v == NULL) 3512 goto onError; 3513 if (size == 0) 3514 return (PyObject *)v; 3515 p = PyUnicode_AS_UNICODE(v); 3516 while (size-- > 0) 3517 *p++ = (unsigned char)*s++; 3518 return (PyObject *)v; 3519 3520 onError: 3521 Py_XDECREF(v); 3522 return NULL; 3523 } 3524 3525 /* create or adjust a UnicodeEncodeError */ 3526 static void make_encode_exception(PyObject **exceptionObject, 3527 const char *encoding, 3528 const Py_UNICODE *unicode, Py_ssize_t size, 3529 Py_ssize_t startpos, Py_ssize_t endpos, 3530 const char *reason) 3531 { 3532 if (*exceptionObject == NULL) { 3533 *exceptionObject = PyUnicodeEncodeError_Create( 3534 encoding, unicode, size, startpos, endpos, reason); 3535 } 3536 else { 3537 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 3538 goto onError; 3539 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 3540 goto onError; 3541 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 3542 goto onError; 3543 return; 3544 onError: 3545 Py_CLEAR(*exceptionObject); 3546 } 3547 } 3548 3549 /* raises a UnicodeEncodeError */ 3550 static void raise_encode_exception(PyObject **exceptionObject, 3551 const char *encoding, 3552 const Py_UNICODE *unicode, Py_ssize_t size, 3553 Py_ssize_t startpos, Py_ssize_t endpos, 3554 const char *reason) 3555 { 3556 make_encode_exception(exceptionObject, 3557 encoding, unicode, size, startpos, endpos, reason); 3558 if (*exceptionObject != NULL) 3559 PyCodec_StrictErrors(*exceptionObject); 3560 } 3561 3562 /* error handling callback helper: 3563 build arguments, call the callback and check the arguments, 3564 put the result into newpos and return the replacement string, which 3565 has to be freed by the caller */ 3566 static PyObject *unicode_encode_call_errorhandler(const char *errors, 3567 PyObject **errorHandler, 3568 const char *encoding, const 
char *reason, 3569 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 3570 Py_ssize_t startpos, Py_ssize_t endpos, 3571 Py_ssize_t *newpos) 3572 { 3573 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple"; 3574 3575 PyObject *restuple; 3576 PyObject *resunicode; 3577 3578 if (*errorHandler == NULL) { 3579 *errorHandler = PyCodec_LookupError(errors); 3580 if (*errorHandler == NULL) 3581 return NULL; 3582 } 3583 3584 make_encode_exception(exceptionObject, 3585 encoding, unicode, size, startpos, endpos, reason); 3586 if (*exceptionObject == NULL) 3587 return NULL; 3588 3589 restuple = PyObject_CallFunctionObjArgs( 3590 *errorHandler, *exceptionObject, NULL); 3591 if (restuple == NULL) 3592 return NULL; 3593 if (!PyTuple_Check(restuple)) { 3594 PyErr_SetString(PyExc_TypeError, &argparse[4]); 3595 Py_DECREF(restuple); 3596 return NULL; 3597 } 3598 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 3599 &resunicode, newpos)) { 3600 Py_DECREF(restuple); 3601 return NULL; 3602 } 3603 if (*newpos<0) 3604 *newpos = size+*newpos; 3605 if (*newpos<0 || *newpos>size) { 3606 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 3607 Py_DECREF(restuple); 3608 return NULL; 3609 } 3610 Py_INCREF(resunicode); 3611 Py_DECREF(restuple); 3612 return resunicode; 3613 } 3614 3615 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, 3616 Py_ssize_t size, 3617 const char *errors, 3618 int limit) 3619 { 3620 /* output object */ 3621 PyObject *res; 3622 /* pointers to the beginning and end+1 of input */ 3623 const Py_UNICODE *startp = p; 3624 const Py_UNICODE *endp = p + size; 3625 /* pointer to the beginning of the unencodable characters */ 3626 /* const Py_UNICODE *badp = NULL; */ 3627 /* pointer into the output */ 3628 char *str; 3629 /* current output position */ 3630 Py_ssize_t respos = 0; 3631 Py_ssize_t ressize; 3632 const char *encoding = (limit == 256) ? 
"latin-1" : "ascii"; 3633 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 3634 PyObject *errorHandler = NULL; 3635 PyObject *exc = NULL; 3636 /* the following variable is used for caching string comparisons 3637 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 3638 int known_errorHandler = -1; 3639 3640 /* allocate enough for a simple encoding without 3641 replacements, if we need more, we'll resize */ 3642 res = PyString_FromStringAndSize(NULL, size); 3643 if (res == NULL) 3644 goto onError; 3645 if (size == 0) 3646 return res; 3647 str = PyString_AS_STRING(res); 3648 ressize = size; 3649 3650 while (p<endp) { 3651 Py_UNICODE c = *p; 3652 3653 /* can we encode this? */ 3654 if (c<limit) { 3655 /* no overflow check, because we know that the space is enough */ 3656 *str++ = (char)c; 3657 ++p; 3658 } 3659 else { 3660 Py_ssize_t unicodepos = p-startp; 3661 Py_ssize_t requiredsize; 3662 PyObject *repunicode; 3663 Py_ssize_t repsize; 3664 Py_ssize_t newpos; 3665 Py_ssize_t respos; 3666 Py_UNICODE *uni2; 3667 /* startpos for collecting unencodable chars */ 3668 const Py_UNICODE *collstart = p; 3669 const Py_UNICODE *collend = p; 3670 /* find all unecodable characters */ 3671 while ((collend < endp) && ((*collend) >= limit)) 3672 ++collend; 3673 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 3674 if (known_errorHandler==-1) { 3675 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3676 known_errorHandler = 1; 3677 else if (!strcmp(errors, "replace")) 3678 known_errorHandler = 2; 3679 else if (!strcmp(errors, "ignore")) 3680 known_errorHandler = 3; 3681 else if (!strcmp(errors, "xmlcharrefreplace")) 3682 known_errorHandler = 4; 3683 else 3684 known_errorHandler = 0; 3685 } 3686 switch (known_errorHandler) { 3687 case 1: /* strict */ 3688 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 3689 goto onError; 3690 case 2: /* replace */ 3691 while (collstart++ < collend) 3692 *str++ = '?'; /* fall through */ 3693 case 3: /* ignore */ 3694 p = collend; 3695 break; 3696 case 4: /* xmlcharrefreplace */ 3697 respos = str - PyString_AS_STRING(res); 3698 /* determine replacement size (temporarily (mis)uses p) */ 3699 requiredsize = respos; 3700 for (p = collstart; p < collend;) { 3701 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend); 3702 Py_ssize_t incr; 3703 if (ch < 10) 3704 incr = 2+1+1; 3705 else if (ch < 100) 3706 incr = 2+2+1; 3707 else if (ch < 1000) 3708 incr = 2+3+1; 3709 else if (ch < 10000) 3710 incr = 2+4+1; 3711 else if (ch < 100000) 3712 incr = 2+5+1; 3713 else if (ch < 1000000) 3714 incr = 2+6+1; 3715 else 3716 incr = 2+7+1; 3717 if (requiredsize > PY_SSIZE_T_MAX - incr) 3718 goto overflow; 3719 requiredsize += incr; 3720 } 3721 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend)) 3722 goto overflow; 3723 requiredsize += endp - collend; 3724 if (requiredsize > ressize) { 3725 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) 3726 requiredsize = 2*ressize; 3727 if (_PyString_Resize(&res, requiredsize)) 3728 goto onError; 3729 str = PyString_AS_STRING(res) + respos; 3730 ressize = requiredsize; 3731 } 3732 /* generate replacement (temporarily (mis)uses p) */ 3733 for (p = collstart; p < collend;) { 3734 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend); 3735 str += sprintf(str, 
"&#%d;", (int)ch); 3736 } 3737 p = collend; 3738 break; 3739 default: 3740 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 3741 encoding, reason, startp, size, &exc, 3742 collstart-startp, collend-startp, &newpos); 3743 if (repunicode == NULL) 3744 goto onError; 3745 /* need more space? (at least enough for what we have+the 3746 replacement+the rest of the string, so we won't have to 3747 check space for encodable characters) */ 3748 respos = str - PyString_AS_STRING(res); 3749 repsize = PyUnicode_GET_SIZE(repunicode); 3750 if (respos > PY_SSIZE_T_MAX - repsize) 3751 goto overflow; 3752 requiredsize = respos + repsize; 3753 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend)) 3754 goto overflow; 3755 requiredsize += endp - collend; 3756 if (requiredsize > ressize) { 3757 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) 3758 requiredsize = 2*ressize; 3759 if (_PyString_Resize(&res, requiredsize)) { 3760 Py_DECREF(repunicode); 3761 goto onError; 3762 } 3763 str = PyString_AS_STRING(res) + respos; 3764 ressize = requiredsize; 3765 } 3766 /* check if there is anything unencodable in the replacement 3767 and copy it to the output */ 3768 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) { 3769 c = *uni2; 3770 if (c >= limit) { 3771 raise_encode_exception(&exc, encoding, startp, size, 3772 unicodepos, unicodepos+1, reason); 3773 Py_DECREF(repunicode); 3774 goto onError; 3775 } 3776 *str = (char)c; 3777 } 3778 p = startp + newpos; 3779 Py_DECREF(repunicode); 3780 } 3781 } 3782 } 3783 /* Resize if we allocated to much */ 3784 respos = str - PyString_AS_STRING(res); 3785 if (respos < ressize) 3786 /* If this falls res will be NULL */ 3787 _PyString_Resize(&res, respos); 3788 Py_XDECREF(errorHandler); 3789 Py_XDECREF(exc); 3790 return res; 3791 3792 overflow: 3793 PyErr_SetString(PyExc_OverflowError, 3794 "encoded result is too long for a Python string"); 3795 3796 onError: 3797 Py_XDECREF(res); 3798 
Py_XDECREF(errorHandler); 3799 Py_XDECREF(exc); 3800 return NULL; 3801 } 3802 3803 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 3804 Py_ssize_t size, 3805 const char *errors) 3806 { 3807 return unicode_encode_ucs1(p, size, errors, 256); 3808 } 3809 3810 PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 3811 { 3812 if (!PyUnicode_Check(unicode)) { 3813 PyErr_BadArgument(); 3814 return NULL; 3815 } 3816 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 3817 PyUnicode_GET_SIZE(unicode), 3818 NULL); 3819 } 3820 3821 /* --- 7-bit ASCII Codec -------------------------------------------------- */ 3822 3823 PyObject *PyUnicode_DecodeASCII(const char *s, 3824 Py_ssize_t size, 3825 const char *errors) 3826 { 3827 const char *starts = s; 3828 PyUnicodeObject *v; 3829 Py_UNICODE *p; 3830 Py_ssize_t startinpos; 3831 Py_ssize_t endinpos; 3832 Py_ssize_t outpos; 3833 const char *e; 3834 PyObject *errorHandler = NULL; 3835 PyObject *exc = NULL; 3836 3837 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 3838 if (size == 1 && *(unsigned char*)s < 128) { 3839 Py_UNICODE r = *(unsigned char*)s; 3840 return PyUnicode_FromUnicode(&r, 1); 3841 } 3842 3843 v = _PyUnicode_New(size); 3844 if (v == NULL) 3845 goto onError; 3846 if (size == 0) 3847 return (PyObject *)v; 3848 p = PyUnicode_AS_UNICODE(v); 3849 e = s + size; 3850 while (s < e) { 3851 register unsigned char c = (unsigned char)*s; 3852 if (c < 128) { 3853 *p++ = c; 3854 ++s; 3855 } 3856 else { 3857 startinpos = s-starts; 3858 endinpos = startinpos + 1; 3859 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 3860 if (unicode_decode_call_errorhandler( 3861 errors, &errorHandler, 3862 "ascii", "ordinal not in range(128)", 3863 starts, size, &startinpos, &endinpos, &exc, &s, 3864 &v, &outpos, &p)) 3865 goto onError; 3866 } 3867 } 3868 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) 3869 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3870 goto onError; 3871 Py_XDECREF(errorHandler); 3872 Py_XDECREF(exc); 3873 return (PyObject *)v; 3874 3875 onError: 3876 Py_XDECREF(v); 3877 Py_XDECREF(errorHandler); 3878 Py_XDECREF(exc); 3879 return NULL; 3880 } 3881 3882 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 3883 Py_ssize_t size, 3884 const char *errors) 3885 { 3886 return unicode_encode_ucs1(p, size, errors, 128); 3887 } 3888 3889 PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 3890 { 3891 if (!PyUnicode_Check(unicode)) { 3892 PyErr_BadArgument(); 3893 return NULL; 3894 } 3895 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 3896 PyUnicode_GET_SIZE(unicode), 3897 NULL); 3898 } 3899 3900 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 3901 3902 /* --- MBCS codecs for Windows -------------------------------------------- */ 3903 3904 #if SIZEOF_INT < SIZEOF_SIZE_T 3905 #define NEED_RETRY 3906 #endif 3907 3908 /* XXX This code is limited to "true" double-byte encodings, as 3909 a) it assumes an incomplete character consists of a single byte, and 3910 b) IsDBCSLeadByte 
(probably) does not work for non-DBCS multi-byte
   encodings, see IsDBCSLeadByteEx documentation. */

/* Decide whether the byte at s[offset] should be treated as a lead byte.
   Returns 0 when IsDBCSLeadByte() rejects it; otherwise the result of
   the check against the preceding character found with CharPrev(). */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;

    if (IsDBCSLeadByte(*curr)) {
        const char *prev = CharPrev(s, curr);
        return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
    }
    return 0;
}

/*
 * Decode MBCS string into unicode object. If 'final' is set, converts
 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
 *
 * If *v is NULL a new unicode object is created; otherwise the existing
 * object is resized and the decoded characters are appended after its
 * current contents.
 */
static int decode_mbcs(PyUnicodeObject **v,
                       const char *s, /* MBCS string */
                       int size, /* sizeof MBCS string */
                       int final)
{
    Py_UNICODE *p;
    /* number of characters already in *v (0 for a fresh object) */
    Py_ssize_t n = 0;
    int usize = 0;

    assert(size >= 0);

    /* Skip trailing lead-byte unless 'final' is set */
    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
        --size;

    /* First get the size of the result */
    if (size > 0) {
        usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
        if (usize == 0) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
    }

    if (*v == NULL) {
        /* Create unicode object */
        *v = _PyUnicode_New(usize);
        if (*v == NULL)
            return -1;
    }
    else {
        /* Extend unicode object */
        n = PyUnicode_GET_SIZE(*v);
        if (_PyUnicode_Resize(v, n + usize) < 0)
            return -1;
    }

    /* Do the conversion */
    if (size > 0) {
        p = PyUnicode_AS_UNICODE(*v) + n;
        if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
    }

    return size;
}

/* Decode an MBCS byte string into a unicode object.

   When `consumed` is non-NULL it receives the number of bytes decoded
   and the decode is non-final (a trailing lead byte may be left
   unconsumed). When NEED_RETRY is defined, input longer than INT_MAX is
   processed in INT_MAX-sized chunks. `errors` is not referenced in this
   function. Returns NULL on failure. */
PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    PyUnicodeObject *v = NULL;
    int done;

    if (consumed)
        *consumed = 0;

#ifdef NEED_RETRY
  retry:
    if (size > INT_MAX)
        done = decode_mbcs(&v, s, INT_MAX, 0);
    else
#endif
        done = decode_mbcs(&v, s, (int)size, !consumed);

    if (done < 0) {
        Py_XDECREF(v);
        return NULL;
    }

    if (consumed)
        *consumed += done;

#ifdef NEED_RETRY
    if (size > INT_MAX) {
        /* advance by what was actually consumed, then do the next chunk */
        s += done;
        size -= done;
        goto retry;
    }
#endif

    return (PyObject *)v;
}

/* Decode a complete MBCS byte string (no `consumed` reporting). */
PyObject *PyUnicode_DecodeMBCS(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
}

/*
 * Convert unicode into string object (MBCS).
 * Returns 0 if succeed, -1 otherwise.
 *
 * If *repr is NULL a new string is created; otherwise the existing
 * string is resized and the encoded bytes are appended to it.
 */
static int encode_mbcs(PyObject **repr,
                       const Py_UNICODE *p, /* unicode */
                       int size) /* size of unicode */
{
    int mbcssize = 0;
    /* number of bytes already in *repr (0 for a fresh object) */
    Py_ssize_t n = 0;

    assert(size >= 0);

    /* First get the size of the result */
    if (size > 0) {
        mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
        if (mbcssize == 0) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
    }

    if (*repr == NULL) {
        /* Create string object */
        *repr = PyString_FromStringAndSize(NULL, mbcssize);
        if (*repr == NULL)
            return -1;
    }
    else {
        /* Extend string object */
        n = PyString_Size(*repr);
        if (_PyString_Resize(repr, n + mbcssize) < 0)
            return -1;
    }

    /* Do the conversion */
    if (size > 0) {
        char *s = PyString_AS_STRING(*repr) + n;
        if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
    }

    return 0;
}

/* Encode a Py_UNICODE buffer as an MBCS byte string.

   When NEED_RETRY is defined, input longer than INT_MAX is encoded in
   INT_MAX-sized chunks that are appended to the same result object.
   `errors` is not referenced in this function. Returns NULL on failure. */
PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
                               Py_ssize_t size,
                               const char *errors)
{
    PyObject *repr = NULL;
    int ret;

#ifdef NEED_RETRY
  retry:
    if (size > INT_MAX)
        ret = encode_mbcs(&repr, p, INT_MAX);
    else
#endif
        ret = encode_mbcs(&repr, p, (int)size);

    if (ret < 0) {
        Py_XDECREF(repr);
        return NULL;
    }

#ifdef NEED_RETRY
    if (size > INT_MAX) {
        p += INT_MAX;
        size -= INT_MAX;
        goto retry;
    }
#endif

    return repr;
}

/* Encode a unicode object as MBCS with default (NULL) error handling.
   Sets an exception via PyErr_BadArgument() and returns NULL if `unicode`
   is not a unicode object. */
PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
                                PyUnicode_GET_SIZE(unicode),
                                NULL);
}

#undef NEED_RETRY

#endif /* MS_WINDOWS */

/* --- Character Mapping Codec -------------------------------------------- */

/* Decode `size` bytes at `s` through `mapping`.

   - mapping == NULL: falls back to Latin-1 decoding.
   - mapping is exactly a unicode object: each byte indexes into the
     string; an index past its end, or the value 0xfffe, means
     "undefined".
   - any other object: looked up with PyObject_GetItem() using the byte
     value as an int key. The result may be None (undefined), an int in
     range(0x110000) (0xFFFE means undefined), or a unicode string of
     any length. A LookupError from the lookup also means undefined.

   Undefined bytes are passed to the decode error handler selected by
   `errors`. Returns a new unicode object, or NULL with an exception
   set. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* spare output slots added by the resizes below and not yet used */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* fast path: the mapping is a plain unicode decoding table */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    goto onError;
                }
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyInt_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    goto Undefined;
                } else
                    goto onError;
            }

            /* Apply mapping */
            if (x == Py_None)
                goto Undefined;
            if (PyInt_Check(x)) {
                long value = PyInt_AS_LONG(x);
                if (value == 0xFFFE)
                    goto Undefined;
                if (value < 0 || value > 0x10FFFF) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(0x110000)");
                    Py_DECREF(x);
                    goto onError;
                }

#ifndef Py_UNICODE_WIDE
                if (value > 0xFFFF) {
                    /* narrow build: the code point is written as a
                       surrogate pair and so needs two output slots */
                    /* see the code for 1-n mapping below */
                    if (extrachars < 2) {
                        /* resize first */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        Py_ssize_t needed = 10 - extrachars;
                        extrachars += needed;
                        /* XXX overflow detection missing */
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    value -= 0x10000;
                    *p++ = 0xD800 | (value >> 10);
                    *p++ = 0xDC00 | (value & 0x3FF);
                    extrachars -= 2;
                }
                else
#endif
                    *p++ = (Py_UNICODE)value;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1) {
                    /* 1-1 mapping */
                    Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
                    if (value == 0xFFFE)
                        goto Undefined;
                    *p++ = value;
                }
                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        Py_ssize_t needed = (targetsize - extrachars) + \
                            (targetsize << 2);
                        extrachars += needed;
                        /* XXX overflow detection missing */
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or unicode");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
            continue;
        Undefined:
            /* undefined mapping */
            /* x is NULL when we arrive here from the LookupError path */
            Py_XDECREF(x);
            outpos = p-PyUnicode_AS_UNICODE(v);
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    starts, size, &startinpos, &endinpos, &exc, &s,
                    &v, &outpos, &p)) {
                goto onError;
            }
        }
    }
    /* trim the result to the number of characters actually written */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}

/* Charmap encoding: the lookup table */

/* Three-level lookup table. level23 is declared with one element but the
   object is allocated with extra trailing space: 16 bytes per level-2
   block followed by 128 bytes per level-3 block (see the size
   computation in encoding_map_size). */
struct encoding_map{
    PyObject_HEAD
    unsigned char level1[32];
    int count2, count3;
    unsigned char level23[1];
};

static PyObject*
encoding_map_size(PyObject *obj, PyObject* args) 4317 { 4318 struct encoding_map *map = (struct encoding_map*)obj; 4319 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 + 4320 128*map->count3); 4321 } 4322 4323 static PyMethodDef encoding_map_methods[] = { 4324 {"size", encoding_map_size, METH_NOARGS, 4325 PyDoc_STR("Return the size (in bytes) of this object") }, 4326 { 0 } 4327 }; 4328 4329 static void 4330 encoding_map_dealloc(PyObject* o) 4331 { 4332 PyObject_FREE(o); 4333 } 4334 4335 static PyTypeObject EncodingMapType = { 4336 PyVarObject_HEAD_INIT(NULL, 0) 4337 "EncodingMap", /*tp_name*/ 4338 sizeof(struct encoding_map), /*tp_basicsize*/ 4339 0, /*tp_itemsize*/ 4340 /* methods */ 4341 encoding_map_dealloc, /*tp_dealloc*/ 4342 0, /*tp_print*/ 4343 0, /*tp_getattr*/ 4344 0, /*tp_setattr*/ 4345 0, /*tp_compare*/ 4346 0, /*tp_repr*/ 4347 0, /*tp_as_number*/ 4348 0, /*tp_as_sequence*/ 4349 0, /*tp_as_mapping*/ 4350 0, /*tp_hash*/ 4351 0, /*tp_call*/ 4352 0, /*tp_str*/ 4353 0, /*tp_getattro*/ 4354 0, /*tp_setattro*/ 4355 0, /*tp_as_buffer*/ 4356 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 4357 0, /*tp_doc*/ 4358 0, /*tp_traverse*/ 4359 0, /*tp_clear*/ 4360 0, /*tp_richcompare*/ 4361 0, /*tp_weaklistoffset*/ 4362 0, /*tp_iter*/ 4363 0, /*tp_iternext*/ 4364 encoding_map_methods, /*tp_methods*/ 4365 0, /*tp_members*/ 4366 0, /*tp_getset*/ 4367 0, /*tp_base*/ 4368 0, /*tp_dict*/ 4369 0, /*tp_descr_get*/ 4370 0, /*tp_descr_set*/ 4371 0, /*tp_dictoffset*/ 4372 0, /*tp_init*/ 4373 0, /*tp_alloc*/ 4374 0, /*tp_new*/ 4375 0, /*tp_free*/ 4376 0, /*tp_is_gc*/ 4377 }; 4378 4379 PyObject* 4380 PyUnicode_BuildEncodingMap(PyObject* string) 4381 { 4382 Py_UNICODE *decode; 4383 PyObject *result; 4384 struct encoding_map *mresult; 4385 int i; 4386 int need_dict = 0; 4387 unsigned char level1[32]; 4388 unsigned char level2[512]; 4389 unsigned char *mlevel1, *mlevel2, *mlevel3; 4390 int count2 = 0, count3 = 0; 4391 4392 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) { 
4393 PyErr_BadArgument(); 4394 return NULL; 4395 } 4396 decode = PyUnicode_AS_UNICODE(string); 4397 memset(level1, 0xFF, sizeof level1); 4398 memset(level2, 0xFF, sizeof level2); 4399 4400 /* If there isn't a one-to-one mapping of NULL to \0, 4401 or if there are non-BMP characters, we need to use 4402 a mapping dictionary. */ 4403 if (decode[0] != 0) 4404 need_dict = 1; 4405 for (i = 1; i < 256; i++) { 4406 int l1, l2; 4407 if (decode[i] == 0 4408 #ifdef Py_UNICODE_WIDE 4409 || decode[i] > 0xFFFF 4410 #endif 4411 ) { 4412 need_dict = 1; 4413 break; 4414 } 4415 if (decode[i] == 0xFFFE) 4416 /* unmapped character */ 4417 continue; 4418 l1 = decode[i] >> 11; 4419 l2 = decode[i] >> 7; 4420 if (level1[l1] == 0xFF) 4421 level1[l1] = count2++; 4422 if (level2[l2] == 0xFF) 4423 level2[l2] = count3++; 4424 } 4425 4426 if (count2 >= 0xFF || count3 >= 0xFF) 4427 need_dict = 1; 4428 4429 if (need_dict) { 4430 PyObject *result = PyDict_New(); 4431 PyObject *key, *value; 4432 if (!result) 4433 return NULL; 4434 for (i = 0; i < 256; i++) { 4435 value = NULL; 4436 key = PyInt_FromLong(decode[i]); 4437 value = PyInt_FromLong(i); 4438 if (!key || !value) 4439 goto failed1; 4440 if (PyDict_SetItem(result, key, value) == -1) 4441 goto failed1; 4442 Py_DECREF(key); 4443 Py_DECREF(value); 4444 } 4445 return result; 4446 failed1: 4447 Py_XDECREF(key); 4448 Py_XDECREF(value); 4449 Py_DECREF(result); 4450 return NULL; 4451 } 4452 4453 /* Create a three-level trie */ 4454 result = PyObject_MALLOC(sizeof(struct encoding_map) + 4455 16*count2 + 128*count3 - 1); 4456 if (!result) 4457 return PyErr_NoMemory(); 4458 PyObject_Init(result, &EncodingMapType); 4459 mresult = (struct encoding_map*)result; 4460 mresult->count2 = count2; 4461 mresult->count3 = count3; 4462 mlevel1 = mresult->level1; 4463 mlevel2 = mresult->level23; 4464 mlevel3 = mresult->level23 + 16*count2; 4465 memcpy(mlevel1, level1, 32); 4466 memset(mlevel2, 0xFF, 16*count2); 4467 memset(mlevel3, 0, 128*count3); 4468 count3 = 0; 
4469 for (i = 1; i < 256; i++) { 4470 int o1, o2, o3, i2, i3; 4471 if (decode[i] == 0xFFFE) 4472 /* unmapped character */ 4473 continue; 4474 o1 = decode[i]>>11; 4475 o2 = (decode[i]>>7) & 0xF; 4476 i2 = 16*mlevel1[o1] + o2; 4477 if (mlevel2[i2] == 0xFF) 4478 mlevel2[i2] = count3++; 4479 o3 = decode[i] & 0x7F; 4480 i3 = 128*mlevel2[i2] + o3; 4481 mlevel3[i3] = i; 4482 } 4483 return result; 4484 } 4485 4486 static int 4487 encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 4488 { 4489 struct encoding_map *map = (struct encoding_map*)mapping; 4490 int l1 = c>>11; 4491 int l2 = (c>>7) & 0xF; 4492 int l3 = c & 0x7F; 4493 int i; 4494 4495 #ifdef Py_UNICODE_WIDE 4496 if (c > 0xFFFF) { 4497 return -1; 4498 } 4499 #endif 4500 if (c == 0) 4501 return 0; 4502 /* level 1*/ 4503 i = map->level1[l1]; 4504 if (i == 0xFF) { 4505 return -1; 4506 } 4507 /* level 2*/ 4508 i = map->level23[16*i+l2]; 4509 if (i == 0xFF) { 4510 return -1; 4511 } 4512 /* level 3 */ 4513 i = map->level23[16*map->count2 + 128*i + l3]; 4514 if (i == 0) { 4515 return -1; 4516 } 4517 return i; 4518 } 4519 4520 /* Lookup the character ch in the mapping. If the character 4521 can't be found, Py_None is returned (or NULL, if another 4522 error occurred). */ 4523 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 4524 { 4525 PyObject *w = PyInt_FromLong((long)c); 4526 PyObject *x; 4527 4528 if (w == NULL) 4529 return NULL; 4530 x = PyObject_GetItem(mapping, w); 4531 Py_DECREF(w); 4532 if (x == NULL) { 4533 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4534 /* No mapping found means: mapping is undefined. 
*/ 4535 PyErr_Clear(); 4536 x = Py_None; 4537 Py_INCREF(x); 4538 return x; 4539 } else 4540 return NULL; 4541 } 4542 else if (x == Py_None) 4543 return x; 4544 else if (PyInt_Check(x)) { 4545 long value = PyInt_AS_LONG(x); 4546 if (value < 0 || value > 255) { 4547 PyErr_SetString(PyExc_TypeError, 4548 "character mapping must be in range(256)"); 4549 Py_DECREF(x); 4550 return NULL; 4551 } 4552 return x; 4553 } 4554 else if (PyString_Check(x)) 4555 return x; 4556 else { 4557 /* wrong return value */ 4558 PyErr_SetString(PyExc_TypeError, 4559 "character mapping must return integer, None or str"); 4560 Py_DECREF(x); 4561 return NULL; 4562 } 4563 } 4564 4565 static int 4566 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 4567 { 4568 Py_ssize_t outsize = PyString_GET_SIZE(*outobj); 4569 /* exponentially overallocate to minimize reallocations */ 4570 if (requiredsize < 2*outsize) 4571 requiredsize = 2*outsize; 4572 if (_PyString_Resize(outobj, requiredsize)) { 4573 return 0; 4574 } 4575 return 1; 4576 } 4577 4578 typedef enum charmapencode_result { 4579 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 4580 }charmapencode_result; 4581 /* lookup the character, put the result in the output string and adjust 4582 various state variables. Reallocate the output string if not enough 4583 space is available. Return a new reference to the object that 4584 was put in the output buffer, or Py_None, if the mapping was undefined 4585 (in which case no character was written) or NULL, if a 4586 reallocation error occurred. 
The caller must decref the result */ 4587 static 4588 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, 4589 PyObject **outobj, Py_ssize_t *outpos) 4590 { 4591 PyObject *rep; 4592 char *outstart; 4593 Py_ssize_t outsize = PyString_GET_SIZE(*outobj); 4594 4595 if (Py_TYPE(mapping) == &EncodingMapType) { 4596 int res = encoding_map_lookup(c, mapping); 4597 Py_ssize_t requiredsize = *outpos+1; 4598 if (res == -1) 4599 return enc_FAILED; 4600 if (outsize<requiredsize) 4601 if (!charmapencode_resize(outobj, outpos, requiredsize)) 4602 return enc_EXCEPTION; 4603 outstart = PyString_AS_STRING(*outobj); 4604 outstart[(*outpos)++] = (char)res; 4605 return enc_SUCCESS; 4606 } 4607 4608 rep = charmapencode_lookup(c, mapping); 4609 if (rep==NULL) 4610 return enc_EXCEPTION; 4611 else if (rep==Py_None) { 4612 Py_DECREF(rep); 4613 return enc_FAILED; 4614 } else { 4615 if (PyInt_Check(rep)) { 4616 Py_ssize_t requiredsize = *outpos+1; 4617 if (outsize<requiredsize) 4618 if (!charmapencode_resize(outobj, outpos, requiredsize)) { 4619 Py_DECREF(rep); 4620 return enc_EXCEPTION; 4621 } 4622 outstart = PyString_AS_STRING(*outobj); 4623 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep); 4624 } 4625 else { 4626 const char *repchars = PyString_AS_STRING(rep); 4627 Py_ssize_t repsize = PyString_GET_SIZE(rep); 4628 Py_ssize_t requiredsize = *outpos+repsize; 4629 if (outsize<requiredsize) 4630 if (!charmapencode_resize(outobj, outpos, requiredsize)) { 4631 Py_DECREF(rep); 4632 return enc_EXCEPTION; 4633 } 4634 outstart = PyString_AS_STRING(*outobj); 4635 memcpy(outstart + *outpos, repchars, repsize); 4636 *outpos += repsize; 4637 } 4638 } 4639 Py_DECREF(rep); 4640 return enc_SUCCESS; 4641 } 4642 4643 /* handle an error in PyUnicode_EncodeCharmap 4644 Return 0 on success, -1 on error */ 4645 static 4646 int charmap_encoding_error( 4647 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 4648 PyObject **exceptionObject, 4649 int 
*known_errorHandler, PyObject **errorHandler, const char *errors, 4650 PyObject **res, Py_ssize_t *respos) 4651 { 4652 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 4653 Py_ssize_t repsize; 4654 Py_ssize_t newpos; 4655 Py_UNICODE *uni2; 4656 /* startpos for collecting unencodable chars */ 4657 Py_ssize_t collstartpos = *inpos; 4658 Py_ssize_t collendpos = *inpos+1; 4659 Py_ssize_t collpos; 4660 char *encoding = "charmap"; 4661 char *reason = "character maps to <undefined>"; 4662 charmapencode_result x; 4663 4664 /* find all unencodable characters */ 4665 while (collendpos < size) { 4666 PyObject *rep; 4667 if (Py_TYPE(mapping) == &EncodingMapType) { 4668 int res = encoding_map_lookup(p[collendpos], mapping); 4669 if (res != -1) 4670 break; 4671 ++collendpos; 4672 continue; 4673 } 4674 4675 rep = charmapencode_lookup(p[collendpos], mapping); 4676 if (rep==NULL) 4677 return -1; 4678 else if (rep!=Py_None) { 4679 Py_DECREF(rep); 4680 break; 4681 } 4682 Py_DECREF(rep); 4683 ++collendpos; 4684 } 4685 /* cache callback name lookup 4686 * (if not done yet, i.e. 
it's the first error) */ 4687 if (*known_errorHandler==-1) { 4688 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4689 *known_errorHandler = 1; 4690 else if (!strcmp(errors, "replace")) 4691 *known_errorHandler = 2; 4692 else if (!strcmp(errors, "ignore")) 4693 *known_errorHandler = 3; 4694 else if (!strcmp(errors, "xmlcharrefreplace")) 4695 *known_errorHandler = 4; 4696 else 4697 *known_errorHandler = 0; 4698 } 4699 switch (*known_errorHandler) { 4700 case 1: /* strict */ 4701 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4702 return -1; 4703 case 2: /* replace */ 4704 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 4705 x = charmapencode_output('?', mapping, res, respos); 4706 if (x==enc_EXCEPTION) { 4707 return -1; 4708 } 4709 else if (x==enc_FAILED) { 4710 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4711 return -1; 4712 } 4713 } 4714 /* fall through */ 4715 case 3: /* ignore */ 4716 *inpos = collendpos; 4717 break; 4718 case 4: /* xmlcharrefreplace */ 4719 /* generate replacement */ 4720 for (collpos = collstartpos; collpos < collendpos;) { 4721 char buffer[2+29+1+1]; 4722 char *cp; 4723 Py_UCS4 ch = p[collpos++]; 4724 #ifndef Py_UNICODE_WIDE 4725 if ((0xD800 <= ch && ch <= 0xDBFF) && 4726 (collpos < collendpos) && 4727 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) { 4728 ch = ((((ch & 0x03FF) << 10) | 4729 ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000); 4730 } 4731 #endif 4732 sprintf(buffer, "&#%d;", (int)ch); 4733 for (cp = buffer; *cp; ++cp) { 4734 x = charmapencode_output(*cp, mapping, res, respos); 4735 if (x==enc_EXCEPTION) 4736 return -1; 4737 else if (x==enc_FAILED) { 4738 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4739 return -1; 4740 } 4741 } 4742 } 4743 *inpos = collendpos; 4744 break; 4745 default: 4746 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 4747 encoding, 
reason, p, size, exceptionObject, 4748 collstartpos, collendpos, &newpos); 4749 if (repunicode == NULL) 4750 return -1; 4751 /* generate replacement */ 4752 repsize = PyUnicode_GET_SIZE(repunicode); 4753 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 4754 x = charmapencode_output(*uni2, mapping, res, respos); 4755 if (x==enc_EXCEPTION) { 4756 return -1; 4757 } 4758 else if (x==enc_FAILED) { 4759 Py_DECREF(repunicode); 4760 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4761 return -1; 4762 } 4763 } 4764 *inpos = newpos; 4765 Py_DECREF(repunicode); 4766 } 4767 return 0; 4768 } 4769 4770 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 4771 Py_ssize_t size, 4772 PyObject *mapping, 4773 const char *errors) 4774 { 4775 /* output object */ 4776 PyObject *res = NULL; 4777 /* current input position */ 4778 Py_ssize_t inpos = 0; 4779 /* current output position */ 4780 Py_ssize_t respos = 0; 4781 PyObject *errorHandler = NULL; 4782 PyObject *exc = NULL; 4783 /* the following variable is used for caching string comparisons 4784 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 4785 * 3=ignore, 4=xmlcharrefreplace */ 4786 int known_errorHandler = -1; 4787 4788 /* Default to Latin-1 */ 4789 if (mapping == NULL) 4790 return PyUnicode_EncodeLatin1(p, size, errors); 4791 4792 /* allocate enough for a simple encoding without 4793 replacements, if we need more, we'll resize */ 4794 res = PyString_FromStringAndSize(NULL, size); 4795 if (res == NULL) 4796 goto onError; 4797 if (size == 0) 4798 return res; 4799 4800 while (inpos<size) { 4801 /* try to encode it */ 4802 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 4803 if (x==enc_EXCEPTION) /* error */ 4804 goto onError; 4805 if (x==enc_FAILED) { /* unencodable character */ 4806 if (charmap_encoding_error(p, size, &inpos, mapping, 4807 &exc, 4808 &known_errorHandler, &errorHandler, errors, 4809 &res, &respos)) { 4810 goto 
onError; 4811 } 4812 } 4813 else 4814 /* done with this character => adjust input position */ 4815 ++inpos; 4816 } 4817 4818 /* Resize if we allocated to much */ 4819 if (respos<PyString_GET_SIZE(res)) { 4820 if (_PyString_Resize(&res, respos)) 4821 goto onError; 4822 } 4823 Py_XDECREF(exc); 4824 Py_XDECREF(errorHandler); 4825 return res; 4826 4827 onError: 4828 Py_XDECREF(res); 4829 Py_XDECREF(exc); 4830 Py_XDECREF(errorHandler); 4831 return NULL; 4832 } 4833 4834 PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 4835 PyObject *mapping) 4836 { 4837 if (!PyUnicode_Check(unicode) || mapping == NULL) { 4838 PyErr_BadArgument(); 4839 return NULL; 4840 } 4841 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 4842 PyUnicode_GET_SIZE(unicode), 4843 mapping, 4844 NULL); 4845 } 4846 4847 /* create or adjust a UnicodeTranslateError */ 4848 static void make_translate_exception(PyObject **exceptionObject, 4849 const Py_UNICODE *unicode, Py_ssize_t size, 4850 Py_ssize_t startpos, Py_ssize_t endpos, 4851 const char *reason) 4852 { 4853 if (*exceptionObject == NULL) { 4854 *exceptionObject = PyUnicodeTranslateError_Create( 4855 unicode, size, startpos, endpos, reason); 4856 } 4857 else { 4858 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 4859 goto onError; 4860 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 4861 goto onError; 4862 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 4863 goto onError; 4864 return; 4865 onError: 4866 Py_CLEAR(*exceptionObject); 4867 } 4868 } 4869 4870 /* raises a UnicodeTranslateError */ 4871 static void raise_translate_exception(PyObject **exceptionObject, 4872 const Py_UNICODE *unicode, Py_ssize_t size, 4873 Py_ssize_t startpos, Py_ssize_t endpos, 4874 const char *reason) 4875 { 4876 make_translate_exception(exceptionObject, 4877 unicode, size, startpos, endpos, reason); 4878 if (*exceptionObject != NULL) 4879 PyCodec_StrictErrors(*exceptionObject); 4880 } 4881 4882 /* error 
handling callback helper: 4883 build arguments, call the callback and check the arguments, 4884 put the result into newpos and return the replacement string, which 4885 has to be freed by the caller */ 4886 static PyObject *unicode_translate_call_errorhandler(const char *errors, 4887 PyObject **errorHandler, 4888 const char *reason, 4889 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 4890 Py_ssize_t startpos, Py_ssize_t endpos, 4891 Py_ssize_t *newpos) 4892 { 4893 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple"; 4894 4895 Py_ssize_t i_newpos; 4896 PyObject *restuple; 4897 PyObject *resunicode; 4898 4899 if (*errorHandler == NULL) { 4900 *errorHandler = PyCodec_LookupError(errors); 4901 if (*errorHandler == NULL) 4902 return NULL; 4903 } 4904 4905 make_translate_exception(exceptionObject, 4906 unicode, size, startpos, endpos, reason); 4907 if (*exceptionObject == NULL) 4908 return NULL; 4909 4910 restuple = PyObject_CallFunctionObjArgs( 4911 *errorHandler, *exceptionObject, NULL); 4912 if (restuple == NULL) 4913 return NULL; 4914 if (!PyTuple_Check(restuple)) { 4915 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4916 Py_DECREF(restuple); 4917 return NULL; 4918 } 4919 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 4920 &resunicode, &i_newpos)) { 4921 Py_DECREF(restuple); 4922 return NULL; 4923 } 4924 if (i_newpos<0) 4925 *newpos = size+i_newpos; 4926 else 4927 *newpos = i_newpos; 4928 if (*newpos<0 || *newpos>size) { 4929 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 4930 Py_DECREF(restuple); 4931 return NULL; 4932 } 4933 Py_INCREF(resunicode); 4934 Py_DECREF(restuple); 4935 return resunicode; 4936 } 4937 4938 /* Lookup the character ch in the mapping and put the result in result, 4939 which must be decrefed by the caller. 
4940 Return 0 on success, -1 on error */ 4941 static 4942 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 4943 { 4944 PyObject *w = PyInt_FromLong((long)c); 4945 PyObject *x; 4946 4947 if (w == NULL) 4948 return -1; 4949 x = PyObject_GetItem(mapping, w); 4950 Py_DECREF(w); 4951 if (x == NULL) { 4952 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4953 /* No mapping found means: use 1:1 mapping. */ 4954 PyErr_Clear(); 4955 *result = NULL; 4956 return 0; 4957 } else 4958 return -1; 4959 } 4960 else if (x == Py_None) { 4961 *result = x; 4962 return 0; 4963 } 4964 else if (PyInt_Check(x)) { 4965 long value = PyInt_AS_LONG(x); 4966 long max = PyUnicode_GetMax(); 4967 if (value < 0 || value > max) { 4968 PyErr_Format(PyExc_TypeError, 4969 "character mapping must be in range(0x%lx)", max+1); 4970 Py_DECREF(x); 4971 return -1; 4972 } 4973 *result = x; 4974 return 0; 4975 } 4976 else if (PyUnicode_Check(x)) { 4977 *result = x; 4978 return 0; 4979 } 4980 else { 4981 /* wrong return value */ 4982 PyErr_SetString(PyExc_TypeError, 4983 "character mapping must return integer, None or unicode"); 4984 Py_DECREF(x); 4985 return -1; 4986 } 4987 } 4988 /* ensure that *outobj is at least requiredsize characters long, 4989 if not reallocate and adjust various state variables. 
4990 Return 0 on success, -1 on error */ 4991 static 4992 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, 4993 Py_ssize_t requiredsize) 4994 { 4995 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj); 4996 if (requiredsize > oldsize) { 4997 /* remember old output position */ 4998 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 4999 /* exponentially overallocate to minimize reallocations */ 5000 if (requiredsize < 2 * oldsize) 5001 requiredsize = 2 * oldsize; 5002 if (PyUnicode_Resize(outobj, requiredsize) < 0) 5003 return -1; 5004 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 5005 } 5006 return 0; 5007 } 5008 /* lookup the character, put the result in the output string and adjust 5009 various state variables. Return a new reference to the object that 5010 was put in the output buffer in *result, or Py_None, if the mapping was 5011 undefined (in which case no character was written). 5012 The called must decref result. 5013 Return 0 on success, -1 on error. */ 5014 static 5015 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp, 5016 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp, 5017 PyObject **res) 5018 { 5019 if (charmaptranslate_lookup(*curinp, mapping, res)) 5020 return -1; 5021 if (*res==NULL) { 5022 /* not found => default to 1:1 mapping */ 5023 *(*outp)++ = *curinp; 5024 } 5025 else if (*res==Py_None) 5026 ; 5027 else if (PyInt_Check(*res)) { 5028 /* no overflow check, because we know that the space is enough */ 5029 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res); 5030 } 5031 else if (PyUnicode_Check(*res)) { 5032 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); 5033 if (repsize==1) { 5034 /* no overflow check, because we know that the space is enough */ 5035 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 5036 } 5037 else if (repsize!=0) { 5038 /* more than one character */ 5039 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + 5040 (insize - (curinp-startinp)) + 
5041 repsize - 1; 5042 if (charmaptranslate_makespace(outobj, outp, requiredsize)) 5043 return -1; 5044 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 5045 *outp += repsize; 5046 } 5047 } 5048 else 5049 return -1; 5050 return 0; 5051 } 5052 5053 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, 5054 Py_ssize_t size, 5055 PyObject *mapping, 5056 const char *errors) 5057 { 5058 /* output object */ 5059 PyObject *res = NULL; 5060 /* pointers to the beginning and end+1 of input */ 5061 const Py_UNICODE *startp = p; 5062 const Py_UNICODE *endp = p + size; 5063 /* pointer into the output */ 5064 Py_UNICODE *str; 5065 /* current output position */ 5066 Py_ssize_t respos = 0; 5067 char *reason = "character maps to <undefined>"; 5068 PyObject *errorHandler = NULL; 5069 PyObject *exc = NULL; 5070 /* the following variable is used for caching string comparisons 5071 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 5072 * 3=ignore, 4=xmlcharrefreplace */ 5073 int known_errorHandler = -1; 5074 5075 if (mapping == NULL) { 5076 PyErr_BadArgument(); 5077 return NULL; 5078 } 5079 5080 /* allocate enough for a simple 1:1 translation without 5081 replacements, if we need more, we'll resize */ 5082 res = PyUnicode_FromUnicode(NULL, size); 5083 if (res == NULL) 5084 goto onError; 5085 if (size == 0) 5086 return res; 5087 str = PyUnicode_AS_UNICODE(res); 5088 5089 while (p<endp) { 5090 /* try to encode it */ 5091 PyObject *x = NULL; 5092 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) { 5093 Py_XDECREF(x); 5094 goto onError; 5095 } 5096 Py_XDECREF(x); 5097 if (x!=Py_None) /* it worked => adjust input pointer */ 5098 ++p; 5099 else { /* untranslatable character */ 5100 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 5101 Py_ssize_t repsize; 5102 Py_ssize_t newpos; 5103 Py_UNICODE *uni2; 5104 /* startpos for collecting untranslatable chars */ 5105 const Py_UNICODE *collstart = p; 5106 const Py_UNICODE 
*collend = p+1; 5107 const Py_UNICODE *coll; 5108 5109 /* find all untranslatable characters */ 5110 while (collend < endp) { 5111 if (charmaptranslate_lookup(*collend, mapping, &x)) 5112 goto onError; 5113 Py_XDECREF(x); 5114 if (x!=Py_None) 5115 break; 5116 ++collend; 5117 } 5118 /* cache callback name lookup 5119 * (if not done yet, i.e. it's the first error) */ 5120 if (known_errorHandler==-1) { 5121 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5122 known_errorHandler = 1; 5123 else if (!strcmp(errors, "replace")) 5124 known_errorHandler = 2; 5125 else if (!strcmp(errors, "ignore")) 5126 known_errorHandler = 3; 5127 else if (!strcmp(errors, "xmlcharrefreplace")) 5128 known_errorHandler = 4; 5129 else 5130 known_errorHandler = 0; 5131 } 5132 switch (known_errorHandler) { 5133 case 1: /* strict */ 5134 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 5135 goto onError; 5136 case 2: /* replace */ 5137 /* No need to check for space, this is a 1:1 replacement */ 5138 for (coll = collstart; coll<collend; ++coll) 5139 *str++ = '?'; 5140 /* fall through */ 5141 case 3: /* ignore */ 5142 p = collend; 5143 break; 5144 case 4: /* xmlcharrefreplace */ 5145 /* generate replacement (temporarily (mis)uses p) */ 5146 for (p = collstart; p < collend;) { 5147 char buffer[2+29+1+1]; 5148 char *cp; 5149 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend); 5150 sprintf(buffer, "&#%d;", (int)ch); 5151 if (charmaptranslate_makespace(&res, &str, 5152 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 5153 goto onError; 5154 for (cp = buffer; *cp; ++cp) 5155 *str++ = *cp; 5156 } 5157 p = collend; 5158 break; 5159 default: 5160 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 5161 reason, startp, size, &exc, 5162 collstart-startp, collend-startp, &newpos); 5163 if (repunicode == NULL) 5164 goto onError; 5165 /* generate replacement */ 5166 repsize = PyUnicode_GET_SIZE(repunicode); 5167 if 
(charmaptranslate_makespace(&res, &str, 5168 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 5169 Py_DECREF(repunicode); 5170 goto onError; 5171 } 5172 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 5173 *str++ = *uni2; 5174 p = startp + newpos; 5175 Py_DECREF(repunicode); 5176 } 5177 } 5178 } 5179 /* Resize if we allocated to much */ 5180 respos = str-PyUnicode_AS_UNICODE(res); 5181 if (respos<PyUnicode_GET_SIZE(res)) { 5182 if (PyUnicode_Resize(&res, respos) < 0) 5183 goto onError; 5184 } 5185 Py_XDECREF(exc); 5186 Py_XDECREF(errorHandler); 5187 return res; 5188 5189 onError: 5190 Py_XDECREF(res); 5191 Py_XDECREF(exc); 5192 Py_XDECREF(errorHandler); 5193 return NULL; 5194 } 5195 5196 PyObject *PyUnicode_Translate(PyObject *str, 5197 PyObject *mapping, 5198 const char *errors) 5199 { 5200 PyObject *result; 5201 5202 str = PyUnicode_FromObject(str); 5203 if (str == NULL) 5204 goto onError; 5205 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 5206 PyUnicode_GET_SIZE(str), 5207 mapping, 5208 errors); 5209 Py_DECREF(str); 5210 return result; 5211 5212 onError: 5213 Py_XDECREF(str); 5214 return NULL; 5215 } 5216 5217 /* --- Decimal Encoder ---------------------------------------------------- */ 5218 5219 int PyUnicode_EncodeDecimal(Py_UNICODE *s, 5220 Py_ssize_t length, 5221 char *output, 5222 const char *errors) 5223 { 5224 Py_UNICODE *p, *end; 5225 PyObject *errorHandler = NULL; 5226 PyObject *exc = NULL; 5227 const char *encoding = "decimal"; 5228 const char *reason = "invalid decimal Unicode string"; 5229 /* the following variable is used for caching string comparisons 5230 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 5231 int known_errorHandler = -1; 5232 5233 if (output == NULL) { 5234 PyErr_BadArgument(); 5235 return -1; 5236 } 5237 5238 p = s; 5239 end = s + length; 5240 while (p < end) { 5241 register Py_UNICODE ch = *p; 5242 int decimal; 5243 PyObject *repunicode; 5244 
Py_ssize_t repsize; 5245 Py_ssize_t newpos; 5246 Py_UNICODE *uni2; 5247 Py_UNICODE *collstart; 5248 Py_UNICODE *collend; 5249 5250 if (Py_UNICODE_ISSPACE(ch)) { 5251 *output++ = ' '; 5252 ++p; 5253 continue; 5254 } 5255 decimal = Py_UNICODE_TODECIMAL(ch); 5256 if (decimal >= 0) { 5257 *output++ = '0' + decimal; 5258 ++p; 5259 continue; 5260 } 5261 if (0 < ch && ch < 256) { 5262 *output++ = (char)ch; 5263 ++p; 5264 continue; 5265 } 5266 /* All other characters are considered unencodable */ 5267 collstart = p; 5268 for (collend = p+1; collend < end; collend++) { 5269 if ((0 < *collend && *collend < 256) || 5270 Py_UNICODE_ISSPACE(*collend) || 5271 0 <= Py_UNICODE_TODECIMAL(*collend)) 5272 break; 5273 } 5274 /* cache callback name lookup 5275 * (if not done yet, i.e. it's the first error) */ 5276 if (known_errorHandler==-1) { 5277 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5278 known_errorHandler = 1; 5279 else if (!strcmp(errors, "replace")) 5280 known_errorHandler = 2; 5281 else if (!strcmp(errors, "ignore")) 5282 known_errorHandler = 3; 5283 else if (!strcmp(errors, "xmlcharrefreplace")) 5284 known_errorHandler = 4; 5285 else 5286 known_errorHandler = 0; 5287 } 5288 switch (known_errorHandler) { 5289 case 1: /* strict */ 5290 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 5291 goto onError; 5292 case 2: /* replace */ 5293 for (p = collstart; p < collend; ++p) 5294 *output++ = '?'; 5295 /* fall through */ 5296 case 3: /* ignore */ 5297 p = collend; 5298 break; 5299 case 4: /* xmlcharrefreplace */ 5300 /* generate replacement (temporarily (mis)uses p) */ 5301 for (p = collstart; p < collend;) { 5302 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend); 5303 output += sprintf(output, "&#%d;", ch); 5304 } 5305 p = collend; 5306 break; 5307 default: 5308 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 5309 encoding, reason, s, length, &exc, 5310 collstart-s, collend-s, &newpos); 5311 if (repunicode == NULL) 5312 goto 
onError; 5313 /* generate replacement */ 5314 repsize = PyUnicode_GET_SIZE(repunicode); 5315 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 5316 Py_UNICODE ch = *uni2; 5317 if (Py_UNICODE_ISSPACE(ch)) 5318 *output++ = ' '; 5319 else { 5320 decimal = Py_UNICODE_TODECIMAL(ch); 5321 if (decimal >= 0) 5322 *output++ = '0' + decimal; 5323 else if (0 < ch && ch < 256) 5324 *output++ = (char)ch; 5325 else { 5326 Py_DECREF(repunicode); 5327 raise_encode_exception(&exc, encoding, 5328 s, length, collstart-s, collend-s, reason); 5329 goto onError; 5330 } 5331 } 5332 } 5333 p = s + newpos; 5334 Py_DECREF(repunicode); 5335 } 5336 } 5337 /* 0-terminate the output string */ 5338 *output++ = '\0'; 5339 Py_XDECREF(exc); 5340 Py_XDECREF(errorHandler); 5341 return 0; 5342 5343 onError: 5344 Py_XDECREF(exc); 5345 Py_XDECREF(errorHandler); 5346 return -1; 5347 } 5348 5349 /* --- Helpers ------------------------------------------------------------ */ 5350 5351 #include "stringlib/unicodedefs.h" 5352 #include "stringlib/fastsearch.h" 5353 5354 #include "stringlib/count.h" 5355 #include "stringlib/find.h" 5356 #include "stringlib/partition.h" 5357 #include "stringlib/split.h" 5358 5359 /* helper macro to fixup start/end slice values */ 5360 #define ADJUST_INDICES(start, end, len) \ 5361 if (end > len) \ 5362 end = len; \ 5363 else if (end < 0) { \ 5364 end += len; \ 5365 if (end < 0) \ 5366 end = 0; \ 5367 } \ 5368 if (start < 0) { \ 5369 start += len; \ 5370 if (start < 0) \ 5371 start = 0; \ 5372 } 5373 5374 Py_ssize_t PyUnicode_Count(PyObject *str, 5375 PyObject *substr, 5376 Py_ssize_t start, 5377 Py_ssize_t end) 5378 { 5379 Py_ssize_t result; 5380 PyUnicodeObject* str_obj; 5381 PyUnicodeObject* sub_obj; 5382 5383 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 5384 if (!str_obj) 5385 return -1; 5386 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 5387 if (!sub_obj) { 5388 Py_DECREF(str_obj); 5389 return -1; 5390 } 5391 5392 
ADJUST_INDICES(start, end, str_obj->length); 5393 result = stringlib_count( 5394 str_obj->str + start, end - start, sub_obj->str, sub_obj->length, 5395 PY_SSIZE_T_MAX 5396 ); 5397 5398 Py_DECREF(sub_obj); 5399 Py_DECREF(str_obj); 5400 5401 return result; 5402 } 5403 5404 Py_ssize_t PyUnicode_Find(PyObject *str, 5405 PyObject *sub, 5406 Py_ssize_t start, 5407 Py_ssize_t end, 5408 int direction) 5409 { 5410 Py_ssize_t result; 5411 5412 str = PyUnicode_FromObject(str); 5413 if (!str) 5414 return -2; 5415 sub = PyUnicode_FromObject(sub); 5416 if (!sub) { 5417 Py_DECREF(str); 5418 return -2; 5419 } 5420 5421 if (direction > 0) 5422 result = stringlib_find_slice( 5423 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 5424 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 5425 start, end 5426 ); 5427 else 5428 result = stringlib_rfind_slice( 5429 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 5430 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 5431 start, end 5432 ); 5433 5434 Py_DECREF(str); 5435 Py_DECREF(sub); 5436 5437 return result; 5438 } 5439 5440 static 5441 int tailmatch(PyUnicodeObject *self, 5442 PyUnicodeObject *substring, 5443 Py_ssize_t start, 5444 Py_ssize_t end, 5445 int direction) 5446 { 5447 if (substring->length == 0) 5448 return 1; 5449 5450 ADJUST_INDICES(start, end, self->length); 5451 end -= substring->length; 5452 if (end < start) 5453 return 0; 5454 5455 if (direction > 0) { 5456 if (Py_UNICODE_MATCH(self, end, substring)) 5457 return 1; 5458 } else { 5459 if (Py_UNICODE_MATCH(self, start, substring)) 5460 return 1; 5461 } 5462 5463 return 0; 5464 } 5465 5466 Py_ssize_t PyUnicode_Tailmatch(PyObject *str, 5467 PyObject *substr, 5468 Py_ssize_t start, 5469 Py_ssize_t end, 5470 int direction) 5471 { 5472 Py_ssize_t result; 5473 5474 str = PyUnicode_FromObject(str); 5475 if (str == NULL) 5476 return -1; 5477 substr = PyUnicode_FromObject(substr); 5478 if (substr == NULL) { 5479 Py_DECREF(str); 5480 return -1; 5481 } 5482 5483 result = 
tailmatch((PyUnicodeObject *)str, 5484 (PyUnicodeObject *)substr, 5485 start, end, direction); 5486 Py_DECREF(str); 5487 Py_DECREF(substr); 5488 return result; 5489 } 5490 5491 /* Apply fixfct filter to the Unicode object self and return a 5492 reference to the modified object */ 5493 5494 static 5495 PyObject *fixup(PyUnicodeObject *self, 5496 int (*fixfct)(PyUnicodeObject *s)) 5497 { 5498 5499 PyUnicodeObject *u; 5500 5501 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5502 if (u == NULL) 5503 return NULL; 5504 5505 Py_UNICODE_COPY(u->str, self->str, self->length); 5506 5507 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 5508 /* fixfct should return TRUE if it modified the buffer. If 5509 FALSE, return a reference to the original buffer instead 5510 (to save space, not time) */ 5511 Py_INCREF(self); 5512 Py_DECREF(u); 5513 return (PyObject*) self; 5514 } 5515 return (PyObject*) u; 5516 } 5517 5518 static 5519 int fixupper(PyUnicodeObject *self) 5520 { 5521 Py_ssize_t len = self->length; 5522 Py_UNICODE *s = self->str; 5523 int status = 0; 5524 5525 while (len-- > 0) { 5526 register Py_UNICODE ch; 5527 5528 ch = Py_UNICODE_TOUPPER(*s); 5529 if (ch != *s) { 5530 status = 1; 5531 *s = ch; 5532 } 5533 s++; 5534 } 5535 5536 return status; 5537 } 5538 5539 static 5540 int fixlower(PyUnicodeObject *self) 5541 { 5542 Py_ssize_t len = self->length; 5543 Py_UNICODE *s = self->str; 5544 int status = 0; 5545 5546 while (len-- > 0) { 5547 register Py_UNICODE ch; 5548 5549 ch = Py_UNICODE_TOLOWER(*s); 5550 if (ch != *s) { 5551 status = 1; 5552 *s = ch; 5553 } 5554 s++; 5555 } 5556 5557 return status; 5558 } 5559 5560 static 5561 int fixswapcase(PyUnicodeObject *self) 5562 { 5563 Py_ssize_t len = self->length; 5564 Py_UNICODE *s = self->str; 5565 int status = 0; 5566 5567 while (len-- > 0) { 5568 if (Py_UNICODE_ISUPPER(*s)) { 5569 *s = Py_UNICODE_TOLOWER(*s); 5570 status = 1; 5571 } else if (Py_UNICODE_ISLOWER(*s)) { 5572 *s = Py_UNICODE_TOUPPER(*s); 5573 
status = 1; 5574 } 5575 s++; 5576 } 5577 5578 return status; 5579 } 5580 5581 static 5582 int fixcapitalize(PyUnicodeObject *self) 5583 { 5584 Py_ssize_t len = self->length; 5585 Py_UNICODE *s = self->str; 5586 int status = 0; 5587 5588 if (len == 0) 5589 return 0; 5590 if (!Py_UNICODE_ISUPPER(*s)) { 5591 *s = Py_UNICODE_TOUPPER(*s); 5592 status = 1; 5593 } 5594 s++; 5595 while (--len > 0) { 5596 if (!Py_UNICODE_ISLOWER(*s)) { 5597 *s = Py_UNICODE_TOLOWER(*s); 5598 status = 1; 5599 } 5600 s++; 5601 } 5602 return status; 5603 } 5604 5605 static 5606 int fixtitle(PyUnicodeObject *self) 5607 { 5608 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5609 register Py_UNICODE *e; 5610 int previous_is_cased; 5611 5612 /* Shortcut for single character strings */ 5613 if (PyUnicode_GET_SIZE(self) == 1) { 5614 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 5615 if (*p != ch) { 5616 *p = ch; 5617 return 1; 5618 } 5619 else 5620 return 0; 5621 } 5622 5623 e = p + PyUnicode_GET_SIZE(self); 5624 previous_is_cased = 0; 5625 for (; p < e; p++) { 5626 register const Py_UNICODE ch = *p; 5627 5628 if (previous_is_cased) 5629 *p = Py_UNICODE_TOLOWER(ch); 5630 else 5631 *p = Py_UNICODE_TOTITLE(ch); 5632 5633 if (Py_UNICODE_ISLOWER(ch) || 5634 Py_UNICODE_ISUPPER(ch) || 5635 Py_UNICODE_ISTITLE(ch)) 5636 previous_is_cased = 1; 5637 else 5638 previous_is_cased = 0; 5639 } 5640 return 1; 5641 } 5642 5643 PyObject * 5644 PyUnicode_Join(PyObject *separator, PyObject *seq) 5645 { 5646 PyObject *internal_separator = NULL; 5647 const Py_UNICODE blank = ' '; 5648 const Py_UNICODE *sep = ␣ 5649 Py_ssize_t seplen = 1; 5650 PyUnicodeObject *res = NULL; /* the result */ 5651 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */ 5652 Py_ssize_t res_used; /* # used bytes */ 5653 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ 5654 PyObject *fseq; /* PySequence_Fast(seq) */ 5655 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 5656 PyObject *item; 5657 
Py_ssize_t i; 5658 5659 fseq = PySequence_Fast(seq, "can only join an iterable"); 5660 if (fseq == NULL) { 5661 return NULL; 5662 } 5663 5664 /* Grrrr. A codec may be invoked to convert str objects to 5665 * Unicode, and so it's possible to call back into Python code 5666 * during PyUnicode_FromObject(), and so it's possible for a sick 5667 * codec to change the size of fseq (if seq is a list). Therefore 5668 * we have to keep refetching the size -- can't assume seqlen 5669 * is invariant. 5670 */ 5671 seqlen = PySequence_Fast_GET_SIZE(fseq); 5672 /* If empty sequence, return u"". */ 5673 if (seqlen == 0) { 5674 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 5675 goto Done; 5676 } 5677 /* If singleton sequence with an exact Unicode, return that. */ 5678 if (seqlen == 1) { 5679 item = PySequence_Fast_GET_ITEM(fseq, 0); 5680 if (PyUnicode_CheckExact(item)) { 5681 Py_INCREF(item); 5682 res = (PyUnicodeObject *)item; 5683 goto Done; 5684 } 5685 } 5686 5687 /* At least two items to join, or one that isn't exact Unicode. */ 5688 if (seqlen > 1) { 5689 /* Set up sep and seplen -- they're needed. */ 5690 if (separator == NULL) { 5691 sep = ␣ 5692 seplen = 1; 5693 } 5694 else { 5695 internal_separator = PyUnicode_FromObject(separator); 5696 if (internal_separator == NULL) 5697 goto onError; 5698 sep = PyUnicode_AS_UNICODE(internal_separator); 5699 seplen = PyUnicode_GET_SIZE(internal_separator); 5700 /* In case PyUnicode_FromObject() mutated seq. */ 5701 seqlen = PySequence_Fast_GET_SIZE(fseq); 5702 } 5703 } 5704 5705 /* Get space. */ 5706 res = _PyUnicode_New(res_alloc); 5707 if (res == NULL) 5708 goto onError; 5709 res_p = PyUnicode_AS_UNICODE(res); 5710 res_used = 0; 5711 5712 for (i = 0; i < seqlen; ++i) { 5713 Py_ssize_t itemlen; 5714 Py_ssize_t new_res_used; 5715 5716 item = PySequence_Fast_GET_ITEM(fseq, i); 5717 /* Convert item to Unicode. */ 5718 if (! PyUnicode_Check(item) && ! 
PyString_Check(item)) { 5719 PyErr_Format(PyExc_TypeError, 5720 "sequence item %zd: expected string or Unicode," 5721 " %.80s found", 5722 i, Py_TYPE(item)->tp_name); 5723 goto onError; 5724 } 5725 item = PyUnicode_FromObject(item); 5726 if (item == NULL) 5727 goto onError; 5728 /* We own a reference to item from here on. */ 5729 5730 /* In case PyUnicode_FromObject() mutated seq. */ 5731 seqlen = PySequence_Fast_GET_SIZE(fseq); 5732 5733 /* Make sure we have enough space for the separator and the item. */ 5734 itemlen = PyUnicode_GET_SIZE(item); 5735 new_res_used = res_used + itemlen; 5736 if (new_res_used < 0) 5737 goto Overflow; 5738 if (i < seqlen - 1) { 5739 new_res_used += seplen; 5740 if (new_res_used < 0) 5741 goto Overflow; 5742 } 5743 if (new_res_used > res_alloc) { 5744 /* double allocated size until it's big enough */ 5745 do { 5746 res_alloc += res_alloc; 5747 if (res_alloc <= 0) 5748 goto Overflow; 5749 } while (new_res_used > res_alloc); 5750 if (_PyUnicode_Resize(&res, res_alloc) < 0) { 5751 Py_DECREF(item); 5752 goto onError; 5753 } 5754 res_p = PyUnicode_AS_UNICODE(res) + res_used; 5755 } 5756 5757 /* Copy item, and maybe the separator. */ 5758 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); 5759 res_p += itemlen; 5760 if (i < seqlen - 1) { 5761 Py_UNICODE_COPY(res_p, sep, seplen); 5762 res_p += seplen; 5763 } 5764 Py_DECREF(item); 5765 res_used = new_res_used; 5766 } 5767 5768 /* Shrink res to match the used area; this probably can't fail, 5769 * but it's cheap to check. 
5770 */ 5771 if (_PyUnicode_Resize(&res, res_used) < 0) 5772 goto onError; 5773 5774 Done: 5775 Py_XDECREF(internal_separator); 5776 Py_DECREF(fseq); 5777 return (PyObject *)res; 5778 5779 Overflow: 5780 PyErr_SetString(PyExc_OverflowError, 5781 "join() result is too long for a Python string"); 5782 Py_DECREF(item); 5783 /* fall through */ 5784 5785 onError: 5786 Py_XDECREF(internal_separator); 5787 Py_DECREF(fseq); 5788 Py_XDECREF(res); 5789 return NULL; 5790 } 5791 5792 static 5793 PyUnicodeObject *pad(PyUnicodeObject *self, 5794 Py_ssize_t left, 5795 Py_ssize_t right, 5796 Py_UNICODE fill) 5797 { 5798 PyUnicodeObject *u; 5799 5800 if (left < 0) 5801 left = 0; 5802 if (right < 0) 5803 right = 0; 5804 5805 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 5806 Py_INCREF(self); 5807 return self; 5808 } 5809 5810 if (left > PY_SSIZE_T_MAX - self->length || 5811 right > PY_SSIZE_T_MAX - (left + self->length)) { 5812 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 5813 return NULL; 5814 } 5815 u = _PyUnicode_New(left + self->length + right); 5816 if (u) { 5817 if (left) 5818 Py_UNICODE_FILL(u->str, fill, left); 5819 Py_UNICODE_COPY(u->str + left, self->str, self->length); 5820 if (right) 5821 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 5822 } 5823 5824 return u; 5825 } 5826 5827 PyObject *PyUnicode_Splitlines(PyObject *string, int keepends) 5828 { 5829 PyObject *list; 5830 5831 string = PyUnicode_FromObject(string); 5832 if (string == NULL) 5833 return NULL; 5834 5835 list = stringlib_splitlines( 5836 (PyObject*) string, PyUnicode_AS_UNICODE(string), 5837 PyUnicode_GET_SIZE(string), keepends); 5838 5839 Py_DECREF(string); 5840 return list; 5841 } 5842 5843 static 5844 PyObject *split(PyUnicodeObject *self, 5845 PyUnicodeObject *substring, 5846 Py_ssize_t maxcount) 5847 { 5848 if (maxcount < 0) 5849 maxcount = PY_SSIZE_T_MAX; 5850 5851 if (substring == NULL) 5852 return stringlib_split_whitespace( 5853 (PyObject*) self, 
self->str, self->length, maxcount 5854 ); 5855 5856 return stringlib_split( 5857 (PyObject*) self, self->str, self->length, 5858 substring->str, substring->length, 5859 maxcount 5860 ); 5861 } 5862 5863 static 5864 PyObject *rsplit(PyUnicodeObject *self, 5865 PyUnicodeObject *substring, 5866 Py_ssize_t maxcount) 5867 { 5868 if (maxcount < 0) 5869 maxcount = PY_SSIZE_T_MAX; 5870 5871 if (substring == NULL) 5872 return stringlib_rsplit_whitespace( 5873 (PyObject*) self, self->str, self->length, maxcount 5874 ); 5875 5876 return stringlib_rsplit( 5877 (PyObject*) self, self->str, self->length, 5878 substring->str, substring->length, 5879 maxcount 5880 ); 5881 } 5882 5883 static 5884 PyObject *replace(PyUnicodeObject *self, 5885 PyUnicodeObject *str1, 5886 PyUnicodeObject *str2, 5887 Py_ssize_t maxcount) 5888 { 5889 PyUnicodeObject *u; 5890 5891 if (maxcount < 0) 5892 maxcount = PY_SSIZE_T_MAX; 5893 else if (maxcount == 0 || self->length == 0) 5894 goto nothing; 5895 5896 if (str1->length == str2->length) { 5897 Py_ssize_t i; 5898 /* same length */ 5899 if (str1->length == 0) 5900 goto nothing; 5901 if (str1->length == 1) { 5902 /* replace characters */ 5903 Py_UNICODE u1, u2; 5904 if (!findchar(self->str, self->length, str1->str[0])) 5905 goto nothing; 5906 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5907 if (!u) 5908 return NULL; 5909 Py_UNICODE_COPY(u->str, self->str, self->length); 5910 u1 = str1->str[0]; 5911 u2 = str2->str[0]; 5912 for (i = 0; i < u->length; i++) 5913 if (u->str[i] == u1) { 5914 if (--maxcount < 0) 5915 break; 5916 u->str[i] = u2; 5917 } 5918 } else { 5919 i = stringlib_find( 5920 self->str, self->length, str1->str, str1->length, 0 5921 ); 5922 if (i < 0) 5923 goto nothing; 5924 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5925 if (!u) 5926 return NULL; 5927 Py_UNICODE_COPY(u->str, self->str, self->length); 5928 5929 /* change everything in-place, starting with this one */ 5930 Py_UNICODE_COPY(u->str+i, 
str2->str, str2->length); 5931 i += str1->length; 5932 5933 while ( --maxcount > 0) { 5934 i = stringlib_find(self->str+i, self->length-i, 5935 str1->str, str1->length, 5936 i); 5937 if (i == -1) 5938 break; 5939 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 5940 i += str1->length; 5941 } 5942 } 5943 } else { 5944 5945 Py_ssize_t n, i, j; 5946 Py_ssize_t product, new_size, delta; 5947 Py_UNICODE *p; 5948 5949 /* replace strings */ 5950 n = stringlib_count(self->str, self->length, str1->str, str1->length, 5951 maxcount); 5952 if (n == 0) 5953 goto nothing; 5954 /* new_size = self->length + n * (str2->length - str1->length)); */ 5955 delta = (str2->length - str1->length); 5956 if (delta == 0) { 5957 new_size = self->length; 5958 } else { 5959 product = n * (str2->length - str1->length); 5960 if ((product / (str2->length - str1->length)) != n) { 5961 PyErr_SetString(PyExc_OverflowError, 5962 "replace string is too long"); 5963 return NULL; 5964 } 5965 new_size = self->length + product; 5966 if (new_size < 0) { 5967 PyErr_SetString(PyExc_OverflowError, 5968 "replace string is too long"); 5969 return NULL; 5970 } 5971 } 5972 u = _PyUnicode_New(new_size); 5973 if (!u) 5974 return NULL; 5975 i = 0; 5976 p = u->str; 5977 if (str1->length > 0) { 5978 while (n-- > 0) { 5979 /* look for next match */ 5980 j = stringlib_find(self->str+i, self->length-i, 5981 str1->str, str1->length, 5982 i); 5983 if (j == -1) 5984 break; 5985 else if (j > i) { 5986 /* copy unchanged part [i:j] */ 5987 Py_UNICODE_COPY(p, self->str+i, j-i); 5988 p += j - i; 5989 } 5990 /* copy substitution string */ 5991 if (str2->length > 0) { 5992 Py_UNICODE_COPY(p, str2->str, str2->length); 5993 p += str2->length; 5994 } 5995 i = j + str1->length; 5996 } 5997 if (i < self->length) 5998 /* copy tail [i:] */ 5999 Py_UNICODE_COPY(p, self->str+i, self->length-i); 6000 } else { 6001 /* interleave */ 6002 while (n > 0) { 6003 Py_UNICODE_COPY(p, str2->str, str2->length); 6004 p += str2->length; 6005 if (--n <= 
0) 6006 break; 6007 *p++ = self->str[i++]; 6008 } 6009 Py_UNICODE_COPY(p, self->str+i, self->length-i); 6010 } 6011 } 6012 return (PyObject *) u; 6013 6014 nothing: 6015 /* nothing to replace; return original string (when possible) */ 6016 if (PyUnicode_CheckExact(self)) { 6017 Py_INCREF(self); 6018 return (PyObject *) self; 6019 } 6020 return PyUnicode_FromUnicode(self->str, self->length); 6021 } 6022 6023 /* --- Unicode Object Methods --------------------------------------------- */ 6024 6025 PyDoc_STRVAR(title__doc__, 6026 "S.title() -> unicode\n\ 6027 \n\ 6028 Return a titlecased version of S, i.e. words start with title case\n\ 6029 characters, all remaining cased characters have lower case."); 6030 6031 static PyObject* 6032 unicode_title(PyUnicodeObject *self) 6033 { 6034 return fixup(self, fixtitle); 6035 } 6036 6037 PyDoc_STRVAR(capitalize__doc__, 6038 "S.capitalize() -> unicode\n\ 6039 \n\ 6040 Return a capitalized version of S, i.e. make the first character\n\ 6041 have upper case and the rest lower case."); 6042 6043 static PyObject* 6044 unicode_capitalize(PyUnicodeObject *self) 6045 { 6046 return fixup(self, fixcapitalize); 6047 } 6048 6049 #if 0 6050 PyDoc_STRVAR(capwords__doc__, 6051 "S.capwords() -> unicode\n\ 6052 \n\ 6053 Apply .capitalize() to all words in S and return the result with\n\ 6054 normalized whitespace (all whitespace strings are replaced by ' ')."); 6055 6056 static PyObject* 6057 unicode_capwords(PyUnicodeObject *self) 6058 { 6059 PyObject *list; 6060 PyObject *item; 6061 Py_ssize_t i; 6062 6063 /* Split into words */ 6064 list = split(self, NULL, -1); 6065 if (!list) 6066 return NULL; 6067 6068 /* Capitalize each word */ 6069 for (i = 0; i < PyList_GET_SIZE(list); i++) { 6070 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 6071 fixcapitalize); 6072 if (item == NULL) 6073 goto onError; 6074 Py_DECREF(PyList_GET_ITEM(list, i)); 6075 PyList_SET_ITEM(list, i, item); 6076 } 6077 6078 /* Join the words to form a new string */ 
6079 item = PyUnicode_Join(NULL, list); 6080 6081 onError: 6082 Py_DECREF(list); 6083 return (PyObject *)item; 6084 } 6085 #endif 6086 6087 /* Argument converter. Coerces to a single unicode character */ 6088 6089 static int 6090 convert_uc(PyObject *obj, void *addr) 6091 { 6092 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; 6093 PyObject *uniobj; 6094 Py_UNICODE *unistr; 6095 6096 uniobj = PyUnicode_FromObject(obj); 6097 if (uniobj == NULL) { 6098 PyErr_SetString(PyExc_TypeError, 6099 "The fill character cannot be converted to Unicode"); 6100 return 0; 6101 } 6102 if (PyUnicode_GET_SIZE(uniobj) != 1) { 6103 PyErr_SetString(PyExc_TypeError, 6104 "The fill character must be exactly one character long"); 6105 Py_DECREF(uniobj); 6106 return 0; 6107 } 6108 unistr = PyUnicode_AS_UNICODE(uniobj); 6109 *fillcharloc = unistr[0]; 6110 Py_DECREF(uniobj); 6111 return 1; 6112 } 6113 6114 PyDoc_STRVAR(center__doc__, 6115 "S.center(width[, fillchar]) -> unicode\n\ 6116 \n\ 6117 Return S centered in a Unicode string of length width. Padding is\n\ 6118 done using the specified fill character (default is a space)"); 6119 6120 static PyObject * 6121 unicode_center(PyUnicodeObject *self, PyObject *args) 6122 { 6123 Py_ssize_t marg, left; 6124 Py_ssize_t width; 6125 Py_UNICODE fillchar = ' '; 6126 6127 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 6128 return NULL; 6129 6130 if (self->length >= width && PyUnicode_CheckExact(self)) { 6131 Py_INCREF(self); 6132 return (PyObject*) self; 6133 } 6134 6135 marg = width - self->length; 6136 left = marg / 2 + (marg & width & 1); 6137 6138 return (PyObject*) pad(self, left, marg - left, fillchar); 6139 } 6140 6141 #if 0 6142 6143 /* This code should go into some future Unicode collation support 6144 module. The basic comparison should compare ordinals on a naive 6145 basis (this is what Java does and thus Jython too). 
*/ 6146 6147 /* speedy UTF-16 code point order comparison */ 6148 /* gleaned from: */ 6149 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 6150 6151 static short utf16Fixup[32] = 6152 { 6153 0, 0, 0, 0, 0, 0, 0, 0, 6154 0, 0, 0, 0, 0, 0, 0, 0, 6155 0, 0, 0, 0, 0, 0, 0, 0, 6156 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 6157 }; 6158 6159 static int 6160 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 6161 { 6162 Py_ssize_t len1, len2; 6163 6164 Py_UNICODE *s1 = str1->str; 6165 Py_UNICODE *s2 = str2->str; 6166 6167 len1 = str1->length; 6168 len2 = str2->length; 6169 6170 while (len1 > 0 && len2 > 0) { 6171 Py_UNICODE c1, c2; 6172 6173 c1 = *s1++; 6174 c2 = *s2++; 6175 6176 if (c1 > (1<<11) * 26) 6177 c1 += utf16Fixup[c1>>11]; 6178 if (c2 > (1<<11) * 26) 6179 c2 += utf16Fixup[c2>>11]; 6180 /* now c1 and c2 are in UTF-32-compatible order */ 6181 6182 if (c1 != c2) 6183 return (c1 < c2) ? -1 : 1; 6184 6185 len1--; len2--; 6186 } 6187 6188 return (len1 < len2) ? -1 : (len1 != len2); 6189 } 6190 6191 #else 6192 6193 static int 6194 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 6195 { 6196 register Py_ssize_t len1, len2; 6197 6198 Py_UNICODE *s1 = str1->str; 6199 Py_UNICODE *s2 = str2->str; 6200 6201 len1 = str1->length; 6202 len2 = str2->length; 6203 6204 while (len1 > 0 && len2 > 0) { 6205 Py_UNICODE c1, c2; 6206 6207 c1 = *s1++; 6208 c2 = *s2++; 6209 6210 if (c1 != c2) 6211 return (c1 < c2) ? -1 : 1; 6212 6213 len1--; len2--; 6214 } 6215 6216 return (len1 < len2) ? 
-1 : (len1 != len2); 6217 } 6218 6219 #endif 6220 6221 int PyUnicode_Compare(PyObject *left, 6222 PyObject *right) 6223 { 6224 PyUnicodeObject *u = NULL, *v = NULL; 6225 int result; 6226 6227 /* Coerce the two arguments */ 6228 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 6229 if (u == NULL) 6230 goto onError; 6231 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 6232 if (v == NULL) 6233 goto onError; 6234 6235 /* Shortcut for empty or interned objects */ 6236 if (v == u) { 6237 Py_DECREF(u); 6238 Py_DECREF(v); 6239 return 0; 6240 } 6241 6242 result = unicode_compare(u, v); 6243 6244 Py_DECREF(u); 6245 Py_DECREF(v); 6246 return result; 6247 6248 onError: 6249 Py_XDECREF(u); 6250 Py_XDECREF(v); 6251 return -1; 6252 } 6253 6254 PyObject *PyUnicode_RichCompare(PyObject *left, 6255 PyObject *right, 6256 int op) 6257 { 6258 int result; 6259 6260 result = PyUnicode_Compare(left, right); 6261 if (result == -1 && PyErr_Occurred()) 6262 goto onError; 6263 6264 /* Convert the return value to a Boolean */ 6265 switch (op) { 6266 case Py_EQ: 6267 result = (result == 0); 6268 break; 6269 case Py_NE: 6270 result = (result != 0); 6271 break; 6272 case Py_LE: 6273 result = (result <= 0); 6274 break; 6275 case Py_GE: 6276 result = (result >= 0); 6277 break; 6278 case Py_LT: 6279 result = (result == -1); 6280 break; 6281 case Py_GT: 6282 result = (result == 1); 6283 break; 6284 } 6285 return PyBool_FromLong(result); 6286 6287 onError: 6288 6289 /* Standard case 6290 6291 Type errors mean that PyUnicode_FromObject() could not convert 6292 one of the arguments (usually the right hand side) to Unicode, 6293 ie. we can't handle the comparison request. However, it is 6294 possible that the other object knows a comparison method, which 6295 is why we return Py_NotImplemented to give the other object a 6296 chance. 
6297 6298 */ 6299 if (PyErr_ExceptionMatches(PyExc_TypeError)) { 6300 PyErr_Clear(); 6301 Py_INCREF(Py_NotImplemented); 6302 return Py_NotImplemented; 6303 } 6304 if (op != Py_EQ && op != Py_NE) 6305 return NULL; 6306 6307 /* Equality comparison. 6308 6309 This is a special case: we silence any PyExc_UnicodeDecodeError 6310 and instead turn it into a PyErr_UnicodeWarning. 6311 6312 */ 6313 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) 6314 return NULL; 6315 PyErr_Clear(); 6316 if (PyErr_Warn(PyExc_UnicodeWarning, 6317 (op == Py_EQ) ? 6318 "Unicode equal comparison " 6319 "failed to convert both arguments to Unicode - " 6320 "interpreting them as being unequal" : 6321 "Unicode unequal comparison " 6322 "failed to convert both arguments to Unicode - " 6323 "interpreting them as being unequal" 6324 ) < 0) 6325 return NULL; 6326 result = (op == Py_NE); 6327 return PyBool_FromLong(result); 6328 } 6329 6330 int PyUnicode_Contains(PyObject *container, 6331 PyObject *element) 6332 { 6333 PyObject *str, *sub; 6334 int result; 6335 6336 /* Coerce the two arguments */ 6337 sub = PyUnicode_FromObject(element); 6338 if (!sub) { 6339 return -1; 6340 } 6341 6342 str = PyUnicode_FromObject(container); 6343 if (!str) { 6344 Py_DECREF(sub); 6345 return -1; 6346 } 6347 6348 result = stringlib_contains_obj(str, sub); 6349 6350 Py_DECREF(str); 6351 Py_DECREF(sub); 6352 6353 return result; 6354 } 6355 6356 /* Concat to string or Unicode object giving a new Unicode object. 
*/ 6357 6358 PyObject *PyUnicode_Concat(PyObject *left, 6359 PyObject *right) 6360 { 6361 PyUnicodeObject *u = NULL, *v = NULL, *w; 6362 6363 /* Coerce the two arguments */ 6364 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 6365 if (u == NULL) 6366 goto onError; 6367 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 6368 if (v == NULL) 6369 goto onError; 6370 6371 /* Shortcuts */ 6372 if (v == unicode_empty) { 6373 Py_DECREF(v); 6374 return (PyObject *)u; 6375 } 6376 if (u == unicode_empty) { 6377 Py_DECREF(u); 6378 return (PyObject *)v; 6379 } 6380 6381 if (u->length > PY_SSIZE_T_MAX - v->length) { 6382 PyErr_SetString(PyExc_OverflowError, 6383 "strings are too large to concat"); 6384 goto onError; 6385 } 6386 6387 /* Concat the two Unicode strings */ 6388 w = _PyUnicode_New(u->length + v->length); 6389 if (w == NULL) 6390 goto onError; 6391 Py_UNICODE_COPY(w->str, u->str, u->length); 6392 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 6393 6394 Py_DECREF(u); 6395 Py_DECREF(v); 6396 return (PyObject *)w; 6397 6398 onError: 6399 Py_XDECREF(u); 6400 Py_XDECREF(v); 6401 return NULL; 6402 } 6403 6404 PyDoc_STRVAR(count__doc__, 6405 "S.count(sub[, start[, end]]) -> int\n\ 6406 \n\ 6407 Return the number of non-overlapping occurrences of substring sub in\n\ 6408 Unicode string S[start:end]. 
Optional arguments start and end are\n\ 6409 interpreted as in slice notation."); 6410 6411 static PyObject * 6412 unicode_count(PyUnicodeObject *self, PyObject *args) 6413 { 6414 PyUnicodeObject *substring; 6415 Py_ssize_t start = 0; 6416 Py_ssize_t end = PY_SSIZE_T_MAX; 6417 PyObject *result; 6418 6419 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 6420 &start, &end)) 6421 return NULL; 6422 6423 ADJUST_INDICES(start, end, self->length); 6424 result = PyInt_FromSsize_t( 6425 stringlib_count(self->str + start, end - start, 6426 substring->str, substring->length, 6427 PY_SSIZE_T_MAX) 6428 ); 6429 6430 Py_DECREF(substring); 6431 6432 return result; 6433 } 6434 6435 PyDoc_STRVAR(encode__doc__, 6436 "S.encode([encoding[,errors]]) -> string or unicode\n\ 6437 \n\ 6438 Encodes S using the codec registered for encoding. encoding defaults\n\ 6439 to the default encoding. errors may be given to set a different error\n\ 6440 handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 6441 a UnicodeEncodeError. 
Other possible values are 'ignore', 'replace' and\n\ 6442 'xmlcharrefreplace' as well as any other name registered with\n\ 6443 codecs.register_error that can handle UnicodeEncodeErrors."); 6444 6445 static PyObject * 6446 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) 6447 { 6448 static char *kwlist[] = {"encoding", "errors", 0}; 6449 char *encoding = NULL; 6450 char *errors = NULL; 6451 PyObject *v; 6452 6453 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 6454 kwlist, &encoding, &errors)) 6455 return NULL; 6456 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors); 6457 if (v == NULL) 6458 goto onError; 6459 if (!PyString_Check(v) && !PyUnicode_Check(v)) { 6460 PyErr_Format(PyExc_TypeError, 6461 "encoder did not return a string/unicode object " 6462 "(type=%.400s)", 6463 Py_TYPE(v)->tp_name); 6464 Py_DECREF(v); 6465 return NULL; 6466 } 6467 return v; 6468 6469 onError: 6470 return NULL; 6471 } 6472 6473 PyDoc_STRVAR(decode__doc__, 6474 "S.decode([encoding[,errors]]) -> string or unicode\n\ 6475 \n\ 6476 Decodes S using the codec registered for encoding. encoding defaults\n\ 6477 to the default encoding. errors may be given to set a different error\n\ 6478 handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 6479 a UnicodeDecodeError. 
Other possible values are 'ignore' and 'replace'\n\ 6480 as well as any other name registered with codecs.register_error that is\n\ 6481 able to handle UnicodeDecodeErrors."); 6482 6483 static PyObject * 6484 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) 6485 { 6486 static char *kwlist[] = {"encoding", "errors", 0}; 6487 char *encoding = NULL; 6488 char *errors = NULL; 6489 PyObject *v; 6490 6491 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode", 6492 kwlist, &encoding, &errors)) 6493 return NULL; 6494 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors); 6495 if (v == NULL) 6496 goto onError; 6497 if (!PyString_Check(v) && !PyUnicode_Check(v)) { 6498 PyErr_Format(PyExc_TypeError, 6499 "decoder did not return a string/unicode object " 6500 "(type=%.400s)", 6501 Py_TYPE(v)->tp_name); 6502 Py_DECREF(v); 6503 return NULL; 6504 } 6505 return v; 6506 6507 onError: 6508 return NULL; 6509 } 6510 6511 PyDoc_STRVAR(expandtabs__doc__, 6512 "S.expandtabs([tabsize]) -> unicode\n\ 6513 \n\ 6514 Return a copy of S where all tab characters are expanded using spaces.\n\ 6515 If tabsize is not given, a tab size of 8 characters is assumed."); 6516 6517 static PyObject* 6518 unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 6519 { 6520 Py_UNICODE *e; 6521 Py_UNICODE *p; 6522 Py_UNICODE *q; 6523 Py_UNICODE *qe; 6524 Py_ssize_t i, j, incr; 6525 PyUnicodeObject *u; 6526 int tabsize = 8; 6527 6528 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 6529 return NULL; 6530 6531 /* First pass: determine size of output string */ 6532 i = 0; /* chars up to and including most recent \n or \r */ 6533 j = 0; /* chars since most recent \n or \r (use in tab calculations) */ 6534 e = self->str + self->length; /* end of input */ 6535 for (p = self->str; p < e; p++) 6536 if (*p == '\t') { 6537 if (tabsize > 0) { 6538 incr = tabsize - (j % tabsize); /* cannot overflow */ 6539 if (j > PY_SSIZE_T_MAX - incr) 6540 goto overflow1; 6541 j += 
incr; 6542 } 6543 } 6544 else { 6545 if (j > PY_SSIZE_T_MAX - 1) 6546 goto overflow1; 6547 j++; 6548 if (*p == '\n' || *p == '\r') { 6549 if (i > PY_SSIZE_T_MAX - j) 6550 goto overflow1; 6551 i += j; 6552 j = 0; 6553 } 6554 } 6555 6556 if (i > PY_SSIZE_T_MAX - j) 6557 goto overflow1; 6558 6559 /* Second pass: create output string and fill it */ 6560 u = _PyUnicode_New(i + j); 6561 if (!u) 6562 return NULL; 6563 6564 j = 0; /* same as in first pass */ 6565 q = u->str; /* next output char */ 6566 qe = u->str + u->length; /* end of output */ 6567 6568 for (p = self->str; p < e; p++) 6569 if (*p == '\t') { 6570 if (tabsize > 0) { 6571 i = tabsize - (j % tabsize); 6572 j += i; 6573 while (i--) { 6574 if (q >= qe) 6575 goto overflow2; 6576 *q++ = ' '; 6577 } 6578 } 6579 } 6580 else { 6581 if (q >= qe) 6582 goto overflow2; 6583 *q++ = *p; 6584 j++; 6585 if (*p == '\n' || *p == '\r') 6586 j = 0; 6587 } 6588 6589 return (PyObject*) u; 6590 6591 overflow2: 6592 Py_DECREF(u); 6593 overflow1: 6594 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 6595 return NULL; 6596 } 6597 6598 PyDoc_STRVAR(find__doc__, 6599 "S.find(sub [,start [,end]]) -> int\n\ 6600 \n\ 6601 Return the lowest index in S where substring sub is found,\n\ 6602 such that sub is contained within S[start:end]. 
Optional\n\ 6603 arguments start and end are interpreted as in slice notation.\n\ 6604 \n\ 6605 Return -1 on failure."); 6606 6607 static PyObject * 6608 unicode_find(PyUnicodeObject *self, PyObject *args) 6609 { 6610 PyUnicodeObject *substring; 6611 Py_ssize_t start; 6612 Py_ssize_t end; 6613 Py_ssize_t result; 6614 6615 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 6616 &start, &end)) 6617 return NULL; 6618 6619 result = stringlib_find_slice( 6620 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6621 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6622 start, end 6623 ); 6624 6625 Py_DECREF(substring); 6626 6627 return PyInt_FromSsize_t(result); 6628 } 6629 6630 static PyObject * 6631 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) 6632 { 6633 if (index < 0 || index >= self->length) { 6634 PyErr_SetString(PyExc_IndexError, "string index out of range"); 6635 return NULL; 6636 } 6637 6638 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 6639 } 6640 6641 static long 6642 unicode_hash(PyUnicodeObject *self) 6643 { 6644 /* Since Unicode objects compare equal to their ASCII string 6645 counterparts, they should use the individual character values 6646 as basis for their hash value. This is needed to assure that 6647 strings and Unicode objects behave in the same way as 6648 dictionary keys. 
*/ 6649 6650 register Py_ssize_t len; 6651 register Py_UNICODE *p; 6652 register long x; 6653 6654 #ifdef Py_DEBUG 6655 assert(_Py_HashSecret_Initialized); 6656 #endif 6657 if (self->hash != -1) 6658 return self->hash; 6659 len = PyUnicode_GET_SIZE(self); 6660 /* 6661 We make the hash of the empty string be 0, rather than using 6662 (prefix ^ suffix), since this slightly obfuscates the hash secret 6663 */ 6664 if (len == 0) { 6665 self->hash = 0; 6666 return 0; 6667 } 6668 p = PyUnicode_AS_UNICODE(self); 6669 x = _Py_HashSecret.prefix; 6670 x ^= *p << 7; 6671 while (--len >= 0) 6672 x = (1000003*x) ^ *p++; 6673 x ^= PyUnicode_GET_SIZE(self); 6674 x ^= _Py_HashSecret.suffix; 6675 if (x == -1) 6676 x = -2; 6677 self->hash = x; 6678 return x; 6679 } 6680 6681 PyDoc_STRVAR(index__doc__, 6682 "S.index(sub [,start [,end]]) -> int\n\ 6683 \n\ 6684 Like S.find() but raise ValueError when the substring is not found."); 6685 6686 static PyObject * 6687 unicode_index(PyUnicodeObject *self, PyObject *args) 6688 { 6689 Py_ssize_t result; 6690 PyUnicodeObject *substring; 6691 Py_ssize_t start; 6692 Py_ssize_t end; 6693 6694 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 6695 &start, &end)) 6696 return NULL; 6697 6698 result = stringlib_find_slice( 6699 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6700 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6701 start, end 6702 ); 6703 6704 Py_DECREF(substring); 6705 6706 if (result < 0) { 6707 PyErr_SetString(PyExc_ValueError, "substring not found"); 6708 return NULL; 6709 } 6710 6711 return PyInt_FromSsize_t(result); 6712 } 6713 6714 PyDoc_STRVAR(islower__doc__, 6715 "S.islower() -> bool\n\ 6716 \n\ 6717 Return True if all cased characters in S are lowercase and there is\n\ 6718 at least one cased character in S, False otherwise."); 6719 6720 static PyObject* 6721 unicode_islower(PyUnicodeObject *self) 6722 { 6723 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6724 register const 
Py_UNICODE *e; 6725 int cased; 6726 6727 /* Shortcut for single character strings */ 6728 if (PyUnicode_GET_SIZE(self) == 1) 6729 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 6730 6731 /* Special case for empty strings */ 6732 if (PyUnicode_GET_SIZE(self) == 0) 6733 return PyBool_FromLong(0); 6734 6735 e = p + PyUnicode_GET_SIZE(self); 6736 cased = 0; 6737 for (; p < e; p++) { 6738 register const Py_UNICODE ch = *p; 6739 6740 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 6741 return PyBool_FromLong(0); 6742 else if (!cased && Py_UNICODE_ISLOWER(ch)) 6743 cased = 1; 6744 } 6745 return PyBool_FromLong(cased); 6746 } 6747 6748 PyDoc_STRVAR(isupper__doc__, 6749 "S.isupper() -> bool\n\ 6750 \n\ 6751 Return True if all cased characters in S are uppercase and there is\n\ 6752 at least one cased character in S, False otherwise."); 6753 6754 static PyObject* 6755 unicode_isupper(PyUnicodeObject *self) 6756 { 6757 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6758 register const Py_UNICODE *e; 6759 int cased; 6760 6761 /* Shortcut for single character strings */ 6762 if (PyUnicode_GET_SIZE(self) == 1) 6763 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 6764 6765 /* Special case for empty strings */ 6766 if (PyUnicode_GET_SIZE(self) == 0) 6767 return PyBool_FromLong(0); 6768 6769 e = p + PyUnicode_GET_SIZE(self); 6770 cased = 0; 6771 for (; p < e; p++) { 6772 register const Py_UNICODE ch = *p; 6773 6774 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 6775 return PyBool_FromLong(0); 6776 else if (!cased && Py_UNICODE_ISUPPER(ch)) 6777 cased = 1; 6778 } 6779 return PyBool_FromLong(cased); 6780 } 6781 6782 PyDoc_STRVAR(istitle__doc__, 6783 "S.istitle() -> bool\n\ 6784 \n\ 6785 Return True if S is a titlecased string and there is at least one\n\ 6786 character in S, i.e. 
upper- and titlecase characters may only\n\ 6787 follow uncased characters and lowercase characters only cased ones.\n\ 6788 Return False otherwise."); 6789 6790 static PyObject* 6791 unicode_istitle(PyUnicodeObject *self) 6792 { 6793 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6794 register const Py_UNICODE *e; 6795 int cased, previous_is_cased; 6796 6797 /* Shortcut for single character strings */ 6798 if (PyUnicode_GET_SIZE(self) == 1) 6799 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 6800 (Py_UNICODE_ISUPPER(*p) != 0)); 6801 6802 /* Special case for empty strings */ 6803 if (PyUnicode_GET_SIZE(self) == 0) 6804 return PyBool_FromLong(0); 6805 6806 e = p + PyUnicode_GET_SIZE(self); 6807 cased = 0; 6808 previous_is_cased = 0; 6809 for (; p < e; p++) { 6810 register const Py_UNICODE ch = *p; 6811 6812 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 6813 if (previous_is_cased) 6814 return PyBool_FromLong(0); 6815 previous_is_cased = 1; 6816 cased = 1; 6817 } 6818 else if (Py_UNICODE_ISLOWER(ch)) { 6819 if (!previous_is_cased) 6820 return PyBool_FromLong(0); 6821 previous_is_cased = 1; 6822 cased = 1; 6823 } 6824 else 6825 previous_is_cased = 0; 6826 } 6827 return PyBool_FromLong(cased); 6828 } 6829 6830 PyDoc_STRVAR(isspace__doc__, 6831 "S.isspace() -> bool\n\ 6832 \n\ 6833 Return True if all characters in S are whitespace\n\ 6834 and there is at least one character in S, False otherwise."); 6835 6836 static PyObject* 6837 unicode_isspace(PyUnicodeObject *self) 6838 { 6839 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6840 register const Py_UNICODE *e; 6841 6842 /* Shortcut for single character strings */ 6843 if (PyUnicode_GET_SIZE(self) == 1 && 6844 Py_UNICODE_ISSPACE(*p)) 6845 return PyBool_FromLong(1); 6846 6847 /* Special case for empty strings */ 6848 if (PyUnicode_GET_SIZE(self) == 0) 6849 return PyBool_FromLong(0); 6850 6851 e = p + PyUnicode_GET_SIZE(self); 6852 for (; p < e; p++) { 6853 if 
(!Py_UNICODE_ISSPACE(*p)) 6854 return PyBool_FromLong(0); 6855 } 6856 return PyBool_FromLong(1); 6857 } 6858 6859 PyDoc_STRVAR(isalpha__doc__, 6860 "S.isalpha() -> bool\n\ 6861 \n\ 6862 Return True if all characters in S are alphabetic\n\ 6863 and there is at least one character in S, False otherwise."); 6864 6865 static PyObject* 6866 unicode_isalpha(PyUnicodeObject *self) 6867 { 6868 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6869 register const Py_UNICODE *e; 6870 6871 /* Shortcut for single character strings */ 6872 if (PyUnicode_GET_SIZE(self) == 1 && 6873 Py_UNICODE_ISALPHA(*p)) 6874 return PyBool_FromLong(1); 6875 6876 /* Special case for empty strings */ 6877 if (PyUnicode_GET_SIZE(self) == 0) 6878 return PyBool_FromLong(0); 6879 6880 e = p + PyUnicode_GET_SIZE(self); 6881 for (; p < e; p++) { 6882 if (!Py_UNICODE_ISALPHA(*p)) 6883 return PyBool_FromLong(0); 6884 } 6885 return PyBool_FromLong(1); 6886 } 6887 6888 PyDoc_STRVAR(isalnum__doc__, 6889 "S.isalnum() -> bool\n\ 6890 \n\ 6891 Return True if all characters in S are alphanumeric\n\ 6892 and there is at least one character in S, False otherwise."); 6893 6894 static PyObject* 6895 unicode_isalnum(PyUnicodeObject *self) 6896 { 6897 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6898 register const Py_UNICODE *e; 6899 6900 /* Shortcut for single character strings */ 6901 if (PyUnicode_GET_SIZE(self) == 1 && 6902 Py_UNICODE_ISALNUM(*p)) 6903 return PyBool_FromLong(1); 6904 6905 /* Special case for empty strings */ 6906 if (PyUnicode_GET_SIZE(self) == 0) 6907 return PyBool_FromLong(0); 6908 6909 e = p + PyUnicode_GET_SIZE(self); 6910 for (; p < e; p++) { 6911 if (!Py_UNICODE_ISALNUM(*p)) 6912 return PyBool_FromLong(0); 6913 } 6914 return PyBool_FromLong(1); 6915 } 6916 6917 PyDoc_STRVAR(isdecimal__doc__, 6918 "S.isdecimal() -> bool\n\ 6919 \n\ 6920 Return True if there are only decimal characters in S,\n\ 6921 False otherwise."); 6922 6923 static PyObject* 6924 
unicode_isdecimal(PyUnicodeObject *self) 6925 { 6926 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6927 register const Py_UNICODE *e; 6928 6929 /* Shortcut for single character strings */ 6930 if (PyUnicode_GET_SIZE(self) == 1 && 6931 Py_UNICODE_ISDECIMAL(*p)) 6932 return PyBool_FromLong(1); 6933 6934 /* Special case for empty strings */ 6935 if (PyUnicode_GET_SIZE(self) == 0) 6936 return PyBool_FromLong(0); 6937 6938 e = p + PyUnicode_GET_SIZE(self); 6939 for (; p < e; p++) { 6940 if (!Py_UNICODE_ISDECIMAL(*p)) 6941 return PyBool_FromLong(0); 6942 } 6943 return PyBool_FromLong(1); 6944 } 6945 6946 PyDoc_STRVAR(isdigit__doc__, 6947 "S.isdigit() -> bool\n\ 6948 \n\ 6949 Return True if all characters in S are digits\n\ 6950 and there is at least one character in S, False otherwise."); 6951 6952 static PyObject* 6953 unicode_isdigit(PyUnicodeObject *self) 6954 { 6955 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6956 register const Py_UNICODE *e; 6957 6958 /* Shortcut for single character strings */ 6959 if (PyUnicode_GET_SIZE(self) == 1 && 6960 Py_UNICODE_ISDIGIT(*p)) 6961 return PyBool_FromLong(1); 6962 6963 /* Special case for empty strings */ 6964 if (PyUnicode_GET_SIZE(self) == 0) 6965 return PyBool_FromLong(0); 6966 6967 e = p + PyUnicode_GET_SIZE(self); 6968 for (; p < e; p++) { 6969 if (!Py_UNICODE_ISDIGIT(*p)) 6970 return PyBool_FromLong(0); 6971 } 6972 return PyBool_FromLong(1); 6973 } 6974 6975 PyDoc_STRVAR(isnumeric__doc__, 6976 "S.isnumeric() -> bool\n\ 6977 \n\ 6978 Return True if there are only numeric characters in S,\n\ 6979 False otherwise."); 6980 6981 static PyObject* 6982 unicode_isnumeric(PyUnicodeObject *self) 6983 { 6984 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6985 register const Py_UNICODE *e; 6986 6987 /* Shortcut for single character strings */ 6988 if (PyUnicode_GET_SIZE(self) == 1 && 6989 Py_UNICODE_ISNUMERIC(*p)) 6990 return PyBool_FromLong(1); 6991 6992 /* Special case for empty strings */ 
6993 if (PyUnicode_GET_SIZE(self) == 0) 6994 return PyBool_FromLong(0); 6995 6996 e = p + PyUnicode_GET_SIZE(self); 6997 for (; p < e; p++) { 6998 if (!Py_UNICODE_ISNUMERIC(*p)) 6999 return PyBool_FromLong(0); 7000 } 7001 return PyBool_FromLong(1); 7002 } 7003 7004 PyDoc_STRVAR(join__doc__, 7005 "S.join(iterable) -> unicode\n\ 7006 \n\ 7007 Return a string which is the concatenation of the strings in the\n\ 7008 iterable. The separator between elements is S."); 7009 7010 static PyObject* 7011 unicode_join(PyObject *self, PyObject *data) 7012 { 7013 return PyUnicode_Join(self, data); 7014 } 7015 7016 static Py_ssize_t 7017 unicode_length(PyUnicodeObject *self) 7018 { 7019 return self->length; 7020 } 7021 7022 PyDoc_STRVAR(ljust__doc__, 7023 "S.ljust(width[, fillchar]) -> int\n\ 7024 \n\ 7025 Return S left-justified in a Unicode string of length width. Padding is\n\ 7026 done using the specified fill character (default is a space)."); 7027 7028 static PyObject * 7029 unicode_ljust(PyUnicodeObject *self, PyObject *args) 7030 { 7031 Py_ssize_t width; 7032 Py_UNICODE fillchar = ' '; 7033 7034 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 7035 return NULL; 7036 7037 if (self->length >= width && PyUnicode_CheckExact(self)) { 7038 Py_INCREF(self); 7039 return (PyObject*) self; 7040 } 7041 7042 return (PyObject*) pad(self, 0, width - self->length, fillchar); 7043 } 7044 7045 PyDoc_STRVAR(lower__doc__, 7046 "S.lower() -> unicode\n\ 7047 \n\ 7048 Return a copy of the string S converted to lowercase."); 7049 7050 static PyObject* 7051 unicode_lower(PyUnicodeObject *self) 7052 { 7053 return fixup(self, fixlower); 7054 } 7055 7056 #define LEFTSTRIP 0 7057 #define RIGHTSTRIP 1 7058 #define BOTHSTRIP 2 7059 7060 /* Arrays indexed by above */ 7061 static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 7062 7063 #define STRIPNAME(i) (stripformat[i]+3) 7064 7065 /* externally visible for str.strip(unicode) */ 7066 PyObject * 7067 
_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 7068 { 7069 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 7070 Py_ssize_t len = PyUnicode_GET_SIZE(self); 7071 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 7072 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); 7073 Py_ssize_t i, j; 7074 7075 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); 7076 7077 i = 0; 7078 if (striptype != RIGHTSTRIP) { 7079 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) { 7080 i++; 7081 } 7082 } 7083 7084 j = len; 7085 if (striptype != LEFTSTRIP) { 7086 do { 7087 j--; 7088 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); 7089 j++; 7090 } 7091 7092 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 7093 Py_INCREF(self); 7094 return (PyObject*)self; 7095 } 7096 else 7097 return PyUnicode_FromUnicode(s+i, j-i); 7098 } 7099 7100 7101 static PyObject * 7102 do_strip(PyUnicodeObject *self, int striptype) 7103 { 7104 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 7105 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; 7106 7107 i = 0; 7108 if (striptype != RIGHTSTRIP) { 7109 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 7110 i++; 7111 } 7112 } 7113 7114 j = len; 7115 if (striptype != LEFTSTRIP) { 7116 do { 7117 j--; 7118 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 7119 j++; 7120 } 7121 7122 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 7123 Py_INCREF(self); 7124 return (PyObject*)self; 7125 } 7126 else 7127 return PyUnicode_FromUnicode(s+i, j-i); 7128 } 7129 7130 7131 static PyObject * 7132 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 7133 { 7134 PyObject *sep = NULL; 7135 7136 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 7137 return NULL; 7138 7139 if (sep != NULL && sep != Py_None) { 7140 if (PyUnicode_Check(sep)) 7141 return _PyUnicode_XStrip(self, striptype, sep); 7142 else if (PyString_Check(sep)) { 7143 PyObject *res; 7144 sep = PyUnicode_FromObject(sep); 7145 if (sep==NULL) 7146 return 
NULL; 7147 res = _PyUnicode_XStrip(self, striptype, sep); 7148 Py_DECREF(sep); 7149 return res; 7150 } 7151 else { 7152 PyErr_Format(PyExc_TypeError, 7153 "%s arg must be None, unicode or str", 7154 STRIPNAME(striptype)); 7155 return NULL; 7156 } 7157 } 7158 7159 return do_strip(self, striptype); 7160 } 7161 7162 7163 PyDoc_STRVAR(strip__doc__, 7164 "S.strip([chars]) -> unicode\n\ 7165 \n\ 7166 Return a copy of the string S with leading and trailing\n\ 7167 whitespace removed.\n\ 7168 If chars is given and not None, remove characters in chars instead.\n\ 7169 If chars is a str, it will be converted to unicode before stripping"); 7170 7171 static PyObject * 7172 unicode_strip(PyUnicodeObject *self, PyObject *args) 7173 { 7174 if (PyTuple_GET_SIZE(args) == 0) 7175 return do_strip(self, BOTHSTRIP); /* Common case */ 7176 else 7177 return do_argstrip(self, BOTHSTRIP, args); 7178 } 7179 7180 7181 PyDoc_STRVAR(lstrip__doc__, 7182 "S.lstrip([chars]) -> unicode\n\ 7183 \n\ 7184 Return a copy of the string S with leading whitespace removed.\n\ 7185 If chars is given and not None, remove characters in chars instead.\n\ 7186 If chars is a str, it will be converted to unicode before stripping"); 7187 7188 static PyObject * 7189 unicode_lstrip(PyUnicodeObject *self, PyObject *args) 7190 { 7191 if (PyTuple_GET_SIZE(args) == 0) 7192 return do_strip(self, LEFTSTRIP); /* Common case */ 7193 else 7194 return do_argstrip(self, LEFTSTRIP, args); 7195 } 7196 7197 7198 PyDoc_STRVAR(rstrip__doc__, 7199 "S.rstrip([chars]) -> unicode\n\ 7200 \n\ 7201 Return a copy of the string S with trailing whitespace removed.\n\ 7202 If chars is given and not None, remove characters in chars instead.\n\ 7203 If chars is a str, it will be converted to unicode before stripping"); 7204 7205 static PyObject * 7206 unicode_rstrip(PyUnicodeObject *self, PyObject *args) 7207 { 7208 if (PyTuple_GET_SIZE(args) == 0) 7209 return do_strip(self, RIGHTSTRIP); /* Common case */ 7210 else 7211 return 
do_argstrip(self, RIGHTSTRIP, args); 7212 } 7213 7214 7215 static PyObject* 7216 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 7217 { 7218 PyUnicodeObject *u; 7219 Py_UNICODE *p; 7220 Py_ssize_t nchars; 7221 size_t nbytes; 7222 7223 if (len < 0) 7224 len = 0; 7225 7226 if (len == 1 && PyUnicode_CheckExact(str)) { 7227 /* no repeat, return original string */ 7228 Py_INCREF(str); 7229 return (PyObject*) str; 7230 } 7231 7232 /* ensure # of chars needed doesn't overflow Py_ssize_t and # of bytes 7233 * needed doesn't overflow size_t 7234 */ 7235 if (len && str->length > PY_SSIZE_T_MAX / len) { 7236 PyErr_SetString(PyExc_OverflowError, 7237 "repeated string is too long"); 7238 return NULL; 7239 } 7240 nchars = len * str->length; 7241 nbytes = ((size_t)nchars + 1u) * sizeof(Py_UNICODE); 7242 if (nbytes / sizeof(Py_UNICODE) != ((size_t)nchars + 1u)) { 7243 PyErr_SetString(PyExc_OverflowError, 7244 "repeated string is too long"); 7245 return NULL; 7246 } 7247 u = _PyUnicode_New(nchars); 7248 if (!u) 7249 return NULL; 7250 7251 p = u->str; 7252 7253 if (str->length == 1 && len > 0) { 7254 Py_UNICODE_FILL(p, str->str[0], len); 7255 } else { 7256 Py_ssize_t done = 0; /* number of characters copied this far */ 7257 if (done < nchars) { 7258 Py_UNICODE_COPY(p, str->str, str->length); 7259 done = str->length; 7260 } 7261 while (done < nchars) { 7262 Py_ssize_t n = (done <= nchars-done) ? 
done : nchars-done; 7263 Py_UNICODE_COPY(p+done, p, n); 7264 done += n; 7265 } 7266 } 7267 7268 return (PyObject*) u; 7269 } 7270 7271 PyObject *PyUnicode_Replace(PyObject *obj, 7272 PyObject *subobj, 7273 PyObject *replobj, 7274 Py_ssize_t maxcount) 7275 { 7276 PyObject *self; 7277 PyObject *str1; 7278 PyObject *str2; 7279 PyObject *result; 7280 7281 self = PyUnicode_FromObject(obj); 7282 if (self == NULL) 7283 return NULL; 7284 str1 = PyUnicode_FromObject(subobj); 7285 if (str1 == NULL) { 7286 Py_DECREF(self); 7287 return NULL; 7288 } 7289 str2 = PyUnicode_FromObject(replobj); 7290 if (str2 == NULL) { 7291 Py_DECREF(self); 7292 Py_DECREF(str1); 7293 return NULL; 7294 } 7295 result = replace((PyUnicodeObject *)self, 7296 (PyUnicodeObject *)str1, 7297 (PyUnicodeObject *)str2, 7298 maxcount); 7299 Py_DECREF(self); 7300 Py_DECREF(str1); 7301 Py_DECREF(str2); 7302 return result; 7303 } 7304 7305 PyDoc_STRVAR(replace__doc__, 7306 "S.replace(old, new[, count]) -> unicode\n\ 7307 \n\ 7308 Return a copy of S with all occurrences of substring\n\ 7309 old replaced by new. 
If the optional argument count is\n\ 7310 given, only the first count occurrences are replaced."); 7311 7312 static PyObject* 7313 unicode_replace(PyUnicodeObject *self, PyObject *args) 7314 { 7315 PyUnicodeObject *str1; 7316 PyUnicodeObject *str2; 7317 Py_ssize_t maxcount = -1; 7318 PyObject *result; 7319 7320 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 7321 return NULL; 7322 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 7323 if (str1 == NULL) 7324 return NULL; 7325 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 7326 if (str2 == NULL) { 7327 Py_DECREF(str1); 7328 return NULL; 7329 } 7330 7331 result = replace(self, str1, str2, maxcount); 7332 7333 Py_DECREF(str1); 7334 Py_DECREF(str2); 7335 return result; 7336 } 7337 7338 static 7339 PyObject *unicode_repr(PyObject *unicode) 7340 { 7341 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode), 7342 PyUnicode_GET_SIZE(unicode), 7343 1); 7344 } 7345 7346 PyDoc_STRVAR(rfind__doc__, 7347 "S.rfind(sub [,start [,end]]) -> int\n\ 7348 \n\ 7349 Return the highest index in S where substring sub is found,\n\ 7350 such that sub is contained within S[start:end]. 
Optional\n\ 7351 arguments start and end are interpreted as in slice notation.\n\ 7352 \n\ 7353 Return -1 on failure."); 7354 7355 static PyObject * 7356 unicode_rfind(PyUnicodeObject *self, PyObject *args) 7357 { 7358 PyUnicodeObject *substring; 7359 Py_ssize_t start; 7360 Py_ssize_t end; 7361 Py_ssize_t result; 7362 7363 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 7364 &start, &end)) 7365 return NULL; 7366 7367 result = stringlib_rfind_slice( 7368 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7369 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7370 start, end 7371 ); 7372 7373 Py_DECREF(substring); 7374 7375 return PyInt_FromSsize_t(result); 7376 } 7377 7378 PyDoc_STRVAR(rindex__doc__, 7379 "S.rindex(sub [,start [,end]]) -> int\n\ 7380 \n\ 7381 Like S.rfind() but raise ValueError when the substring is not found."); 7382 7383 static PyObject * 7384 unicode_rindex(PyUnicodeObject *self, PyObject *args) 7385 { 7386 PyUnicodeObject *substring; 7387 Py_ssize_t start; 7388 Py_ssize_t end; 7389 Py_ssize_t result; 7390 7391 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 7392 &start, &end)) 7393 return NULL; 7394 7395 result = stringlib_rfind_slice( 7396 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7397 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7398 start, end 7399 ); 7400 7401 Py_DECREF(substring); 7402 7403 if (result < 0) { 7404 PyErr_SetString(PyExc_ValueError, "substring not found"); 7405 return NULL; 7406 } 7407 return PyInt_FromSsize_t(result); 7408 } 7409 7410 PyDoc_STRVAR(rjust__doc__, 7411 "S.rjust(width[, fillchar]) -> unicode\n\ 7412 \n\ 7413 Return S right-justified in a Unicode string of length width. 
Padding is\n\ 7414 done using the specified fill character (default is a space)."); 7415 7416 static PyObject * 7417 unicode_rjust(PyUnicodeObject *self, PyObject *args) 7418 { 7419 Py_ssize_t width; 7420 Py_UNICODE fillchar = ' '; 7421 7422 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 7423 return NULL; 7424 7425 if (self->length >= width && PyUnicode_CheckExact(self)) { 7426 Py_INCREF(self); 7427 return (PyObject*) self; 7428 } 7429 7430 return (PyObject*) pad(self, width - self->length, 0, fillchar); 7431 } 7432 7433 static PyObject* 7434 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end) 7435 { 7436 /* standard clamping */ 7437 if (start < 0) 7438 start = 0; 7439 if (end < 0) 7440 end = 0; 7441 if (end > self->length) 7442 end = self->length; 7443 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) { 7444 /* full slice, return original string */ 7445 Py_INCREF(self); 7446 return (PyObject*) self; 7447 } 7448 if (start > end) 7449 start = end; 7450 /* copy slice */ 7451 return (PyObject*) PyUnicode_FromUnicode(self->str + start, 7452 end - start); 7453 } 7454 7455 PyObject *PyUnicode_Split(PyObject *s, 7456 PyObject *sep, 7457 Py_ssize_t maxsplit) 7458 { 7459 PyObject *result; 7460 7461 s = PyUnicode_FromObject(s); 7462 if (s == NULL) 7463 return NULL; 7464 if (sep != NULL) { 7465 sep = PyUnicode_FromObject(sep); 7466 if (sep == NULL) { 7467 Py_DECREF(s); 7468 return NULL; 7469 } 7470 } 7471 7472 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 7473 7474 Py_DECREF(s); 7475 Py_XDECREF(sep); 7476 return result; 7477 } 7478 7479 PyDoc_STRVAR(split__doc__, 7480 "S.split([sep [,maxsplit]]) -> list of strings\n\ 7481 \n\ 7482 Return a list of the words in S, using sep as the\n\ 7483 delimiter string. If maxsplit is given, at most maxsplit\n\ 7484 splits are done. 
If sep is not specified or is None, any\n\ 7485 whitespace string is a separator and empty strings are\n\ 7486 removed from the result."); 7487 7488 static PyObject* 7489 unicode_split(PyUnicodeObject *self, PyObject *args) 7490 { 7491 PyObject *substring = Py_None; 7492 Py_ssize_t maxcount = -1; 7493 7494 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 7495 return NULL; 7496 7497 if (substring == Py_None) 7498 return split(self, NULL, maxcount); 7499 else if (PyUnicode_Check(substring)) 7500 return split(self, (PyUnicodeObject *)substring, maxcount); 7501 else 7502 return PyUnicode_Split((PyObject *)self, substring, maxcount); 7503 } 7504 7505 PyObject * 7506 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 7507 { 7508 PyObject* str_obj; 7509 PyObject* sep_obj; 7510 PyObject* out; 7511 7512 str_obj = PyUnicode_FromObject(str_in); 7513 if (!str_obj) 7514 return NULL; 7515 sep_obj = PyUnicode_FromObject(sep_in); 7516 if (!sep_obj) { 7517 Py_DECREF(str_obj); 7518 return NULL; 7519 } 7520 7521 out = stringlib_partition( 7522 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 7523 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 7524 ); 7525 7526 Py_DECREF(sep_obj); 7527 Py_DECREF(str_obj); 7528 7529 return out; 7530 } 7531 7532 7533 PyObject * 7534 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 7535 { 7536 PyObject* str_obj; 7537 PyObject* sep_obj; 7538 PyObject* out; 7539 7540 str_obj = PyUnicode_FromObject(str_in); 7541 if (!str_obj) 7542 return NULL; 7543 sep_obj = PyUnicode_FromObject(sep_in); 7544 if (!sep_obj) { 7545 Py_DECREF(str_obj); 7546 return NULL; 7547 } 7548 7549 out = stringlib_rpartition( 7550 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 7551 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 7552 ); 7553 7554 Py_DECREF(sep_obj); 7555 Py_DECREF(str_obj); 7556 7557 return out; 7558 } 7559 7560 PyDoc_STRVAR(partition__doc__, 7561 
"S.partition(sep) -> (head, sep, tail)\n\ 7562 \n\ 7563 Search for the separator sep in S, and return the part before it,\n\ 7564 the separator itself, and the part after it. If the separator is not\n\ 7565 found, return S and two empty strings."); 7566 7567 static PyObject* 7568 unicode_partition(PyUnicodeObject *self, PyObject *separator) 7569 { 7570 return PyUnicode_Partition((PyObject *)self, separator); 7571 } 7572 7573 PyDoc_STRVAR(rpartition__doc__, 7574 "S.rpartition(sep) -> (head, sep, tail)\n\ 7575 \n\ 7576 Search for the separator sep in S, starting at the end of S, and return\n\ 7577 the part before it, the separator itself, and the part after it. If the\n\ 7578 separator is not found, return two empty strings and S."); 7579 7580 static PyObject* 7581 unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 7582 { 7583 return PyUnicode_RPartition((PyObject *)self, separator); 7584 } 7585 7586 PyObject *PyUnicode_RSplit(PyObject *s, 7587 PyObject *sep, 7588 Py_ssize_t maxsplit) 7589 { 7590 PyObject *result; 7591 7592 s = PyUnicode_FromObject(s); 7593 if (s == NULL) 7594 return NULL; 7595 if (sep != NULL) { 7596 sep = PyUnicode_FromObject(sep); 7597 if (sep == NULL) { 7598 Py_DECREF(s); 7599 return NULL; 7600 } 7601 } 7602 7603 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 7604 7605 Py_DECREF(s); 7606 Py_XDECREF(sep); 7607 return result; 7608 } 7609 7610 PyDoc_STRVAR(rsplit__doc__, 7611 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\ 7612 \n\ 7613 Return a list of the words in S, using sep as the\n\ 7614 delimiter string, starting at the end of the string and\n\ 7615 working to the front. If maxsplit is given, at most maxsplit\n\ 7616 splits are done. 
If sep is not specified, any whitespace string\n\ 7617 is a separator."); 7618 7619 static PyObject* 7620 unicode_rsplit(PyUnicodeObject *self, PyObject *args) 7621 { 7622 PyObject *substring = Py_None; 7623 Py_ssize_t maxcount = -1; 7624 7625 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 7626 return NULL; 7627 7628 if (substring == Py_None) 7629 return rsplit(self, NULL, maxcount); 7630 else if (PyUnicode_Check(substring)) 7631 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 7632 else 7633 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 7634 } 7635 7636 PyDoc_STRVAR(splitlines__doc__, 7637 "S.splitlines(keepends=False) -> list of strings\n\ 7638 \n\ 7639 Return a list of the lines in S, breaking at line boundaries.\n\ 7640 Line breaks are not included in the resulting list unless keepends\n\ 7641 is given and true."); 7642 7643 static PyObject* 7644 unicode_splitlines(PyUnicodeObject *self, PyObject *args) 7645 { 7646 int keepends = 0; 7647 7648 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 7649 return NULL; 7650 7651 return PyUnicode_Splitlines((PyObject *)self, keepends); 7652 } 7653 7654 static 7655 PyObject *unicode_str(PyUnicodeObject *self) 7656 { 7657 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL); 7658 } 7659 7660 PyDoc_STRVAR(swapcase__doc__, 7661 "S.swapcase() -> unicode\n\ 7662 \n\ 7663 Return a copy of S with uppercase characters converted to lowercase\n\ 7664 and vice versa."); 7665 7666 static PyObject* 7667 unicode_swapcase(PyUnicodeObject *self) 7668 { 7669 return fixup(self, fixswapcase); 7670 } 7671 7672 PyDoc_STRVAR(translate__doc__, 7673 "S.translate(table) -> unicode\n\ 7674 \n\ 7675 Return a copy of the string S, where all characters have been mapped\n\ 7676 through the given translation table, which must be a mapping of\n\ 7677 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\ 7678 Unmapped characters are left untouched. 
Characters mapped to None\n\ 7679 are deleted."); 7680 7681 static PyObject* 7682 unicode_translate(PyUnicodeObject *self, PyObject *table) 7683 { 7684 return PyUnicode_TranslateCharmap(self->str, 7685 self->length, 7686 table, 7687 "ignore"); 7688 } 7689 7690 PyDoc_STRVAR(upper__doc__, 7691 "S.upper() -> unicode\n\ 7692 \n\ 7693 Return a copy of S converted to uppercase."); 7694 7695 static PyObject* 7696 unicode_upper(PyUnicodeObject *self) 7697 { 7698 return fixup(self, fixupper); 7699 } 7700 7701 PyDoc_STRVAR(zfill__doc__, 7702 "S.zfill(width) -> unicode\n\ 7703 \n\ 7704 Pad a numeric string S with zeros on the left, to fill a field\n\ 7705 of the specified width. The string S is never truncated."); 7706 7707 static PyObject * 7708 unicode_zfill(PyUnicodeObject *self, PyObject *args) 7709 { 7710 Py_ssize_t fill; 7711 PyUnicodeObject *u; 7712 7713 Py_ssize_t width; 7714 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 7715 return NULL; 7716 7717 if (self->length >= width) { 7718 if (PyUnicode_CheckExact(self)) { 7719 Py_INCREF(self); 7720 return (PyObject*) self; 7721 } 7722 else 7723 return PyUnicode_FromUnicode( 7724 PyUnicode_AS_UNICODE(self), 7725 PyUnicode_GET_SIZE(self) 7726 ); 7727 } 7728 7729 fill = width - self->length; 7730 7731 u = pad(self, fill, 0, '0'); 7732 7733 if (u == NULL) 7734 return NULL; 7735 7736 if (u->str[fill] == '+' || u->str[fill] == '-') { 7737 /* move sign to beginning of string */ 7738 u->str[0] = u->str[fill]; 7739 u->str[fill] = '0'; 7740 } 7741 7742 return (PyObject*) u; 7743 } 7744 7745 #if 0 7746 static PyObject* 7747 free_listsize(PyUnicodeObject *self) 7748 { 7749 return PyInt_FromLong(numfree); 7750 } 7751 #endif 7752 7753 PyDoc_STRVAR(startswith__doc__, 7754 "S.startswith(prefix[, start[, end]]) -> bool\n\ 7755 \n\ 7756 Return True if S starts with the specified prefix, False otherwise.\n\ 7757 With optional start, test S beginning at that position.\n\ 7758 With optional end, stop comparing S at that position.\n\ 7759 
prefix can also be a tuple of strings to try."); 7760 7761 static PyObject * 7762 unicode_startswith(PyUnicodeObject *self, 7763 PyObject *args) 7764 { 7765 PyObject *subobj; 7766 PyUnicodeObject *substring; 7767 Py_ssize_t start = 0; 7768 Py_ssize_t end = PY_SSIZE_T_MAX; 7769 int result; 7770 7771 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 7772 return NULL; 7773 if (PyTuple_Check(subobj)) { 7774 Py_ssize_t i; 7775 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 7776 substring = (PyUnicodeObject *)PyUnicode_FromObject( 7777 PyTuple_GET_ITEM(subobj, i)); 7778 if (substring == NULL) 7779 return NULL; 7780 result = tailmatch(self, substring, start, end, -1); 7781 Py_DECREF(substring); 7782 if (result) { 7783 Py_RETURN_TRUE; 7784 } 7785 } 7786 /* nothing matched */ 7787 Py_RETURN_FALSE; 7788 } 7789 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 7790 if (substring == NULL) { 7791 if (PyErr_ExceptionMatches(PyExc_TypeError)) 7792 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, " 7793 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name); 7794 return NULL; 7795 } 7796 result = tailmatch(self, substring, start, end, -1); 7797 Py_DECREF(substring); 7798 return PyBool_FromLong(result); 7799 } 7800 7801 7802 PyDoc_STRVAR(endswith__doc__, 7803 "S.endswith(suffix[, start[, end]]) -> bool\n\ 7804 \n\ 7805 Return True if S ends with the specified suffix, False otherwise.\n\ 7806 With optional start, test S beginning at that position.\n\ 7807 With optional end, stop comparing S at that position.\n\ 7808 suffix can also be a tuple of strings to try."); 7809 7810 static PyObject * 7811 unicode_endswith(PyUnicodeObject *self, 7812 PyObject *args) 7813 { 7814 PyObject *subobj; 7815 PyUnicodeObject *substring; 7816 Py_ssize_t start = 0; 7817 Py_ssize_t end = PY_SSIZE_T_MAX; 7818 int result; 7819 7820 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 7821 return NULL; 7822 if 
(PyTuple_Check(subobj)) { 7823 Py_ssize_t i; 7824 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 7825 substring = (PyUnicodeObject *)PyUnicode_FromObject( 7826 PyTuple_GET_ITEM(subobj, i)); 7827 if (substring == NULL) 7828 return NULL; 7829 result = tailmatch(self, substring, start, end, +1); 7830 Py_DECREF(substring); 7831 if (result) { 7832 Py_RETURN_TRUE; 7833 } 7834 } 7835 Py_RETURN_FALSE; 7836 } 7837 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 7838 if (substring == NULL) { 7839 if (PyErr_ExceptionMatches(PyExc_TypeError)) 7840 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, " 7841 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name); 7842 return NULL; 7843 } 7844 result = tailmatch(self, substring, start, end, +1); 7845 Py_DECREF(substring); 7846 return PyBool_FromLong(result); 7847 } 7848 7849 7850 /* Implements do_string_format, which is unicode because of stringlib */ 7851 #include "stringlib/string_format.h" 7852 7853 PyDoc_STRVAR(format__doc__, 7854 "S.format(*args, **kwargs) -> unicode\n\ 7855 \n\ 7856 Return a formatted version of S, using substitutions from args and kwargs.\n\ 7857 The substitutions are identified by braces ('{' and '}')."); 7858 7859 static PyObject * 7860 unicode__format__(PyObject *self, PyObject *args) 7861 { 7862 PyObject *format_spec; 7863 PyObject *result = NULL; 7864 PyObject *tmp = NULL; 7865 7866 /* If 2.x, convert format_spec to the same type as value */ 7867 /* This is to allow things like u''.format('') */ 7868 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec)) 7869 goto done; 7870 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) { 7871 PyErr_Format(PyExc_TypeError, "__format__ arg must be str " 7872 "or unicode, not %s", Py_TYPE(format_spec)->tp_name); 7873 goto done; 7874 } 7875 tmp = PyObject_Unicode(format_spec); 7876 if (tmp == NULL) 7877 goto done; 7878 format_spec = tmp; 7879 7880 result = _PyUnicode_FormatAdvanced(self, 7881 
PyUnicode_AS_UNICODE(format_spec), 7882 PyUnicode_GET_SIZE(format_spec)); 7883 done: 7884 Py_XDECREF(tmp); 7885 return result; 7886 } 7887 7888 PyDoc_STRVAR(p_format__doc__, 7889 "S.__format__(format_spec) -> unicode\n\ 7890 \n\ 7891 Return a formatted version of S as described by format_spec."); 7892 7893 static PyObject * 7894 unicode__sizeof__(PyUnicodeObject *v) 7895 { 7896 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) + 7897 sizeof(Py_UNICODE) * (v->length + 1)); 7898 } 7899 7900 PyDoc_STRVAR(sizeof__doc__, 7901 "S.__sizeof__() -> size of S in memory, in bytes\n\ 7902 \n\ 7903 "); 7904 7905 static PyObject * 7906 unicode_getnewargs(PyUnicodeObject *v) 7907 { 7908 return Py_BuildValue("(u#)", v->str, v->length); 7909 } 7910 7911 7912 static PyMethodDef unicode_methods[] = { 7913 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 7914 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 7915 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 7916 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 7917 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 7918 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 7919 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 7920 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 7921 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 7922 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 7923 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 7924 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 7925 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 7926 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 7927 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 7928 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, 
lstrip__doc__}, 7929 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__}, 7930 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */ 7931 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 7932 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 7933 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 7934 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 7935 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 7936 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 7937 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 7938 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 7939 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 7940 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 7941 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 7942 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 7943 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 7944 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 7945 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 7946 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 7947 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 7948 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 7949 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 7950 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 7951 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 7952 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 7953 {"format", (PyCFunction) do_string_format, METH_VARARGS | 
METH_KEYWORDS, format__doc__}, 7954 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 7955 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS}, 7956 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS}, 7957 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 7958 #if 0 7959 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 7960 #endif 7961 7962 #if 0 7963 /* This one is just used for debugging the implementation. */ 7964 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS}, 7965 #endif 7966 7967 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 7968 {NULL, NULL} 7969 }; 7970 7971 static PyObject * 7972 unicode_mod(PyObject *v, PyObject *w) 7973 { 7974 if (!PyUnicode_Check(v)) { 7975 Py_INCREF(Py_NotImplemented); 7976 return Py_NotImplemented; 7977 } 7978 return PyUnicode_Format(v, w); 7979 } 7980 7981 static PyNumberMethods unicode_as_number = { 7982 0, /*nb_add*/ 7983 0, /*nb_subtract*/ 7984 0, /*nb_multiply*/ 7985 0, /*nb_divide*/ 7986 unicode_mod, /*nb_remainder*/ 7987 }; 7988 7989 static PySequenceMethods unicode_as_sequence = { 7990 (lenfunc) unicode_length, /* sq_length */ 7991 PyUnicode_Concat, /* sq_concat */ 7992 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 7993 (ssizeargfunc) unicode_getitem, /* sq_item */ 7994 (ssizessizeargfunc) unicode_slice, /* sq_slice */ 7995 0, /* sq_ass_item */ 7996 0, /* sq_ass_slice */ 7997 PyUnicode_Contains, /* sq_contains */ 7998 }; 7999 8000 static PyObject* 8001 unicode_subscript(PyUnicodeObject* self, PyObject* item) 8002 { 8003 if (PyIndex_Check(item)) { 8004 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 8005 if (i == -1 && PyErr_Occurred()) 8006 return NULL; 8007 if (i < 0) 8008 i += PyUnicode_GET_SIZE(self); 8009 return unicode_getitem(self, i); 8010 } else if (PySlice_Check(item)) { 8011 Py_ssize_t start, stop, step, slicelength, cur, i; 8012 Py_UNICODE* 
source_buf; 8013 Py_UNICODE* result_buf; 8014 PyObject* result; 8015 8016 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self), 8017 &start, &stop, &step, &slicelength) < 0) { 8018 return NULL; 8019 } 8020 8021 if (slicelength <= 0) { 8022 return PyUnicode_FromUnicode(NULL, 0); 8023 } else if (start == 0 && step == 1 && slicelength == self->length && 8024 PyUnicode_CheckExact(self)) { 8025 Py_INCREF(self); 8026 return (PyObject *)self; 8027 } else if (step == 1) { 8028 return PyUnicode_FromUnicode(self->str + start, slicelength); 8029 } else { 8030 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); 8031 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* 8032 sizeof(Py_UNICODE)); 8033 8034 if (result_buf == NULL) 8035 return PyErr_NoMemory(); 8036 8037 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 8038 result_buf[i] = source_buf[cur]; 8039 } 8040 8041 result = PyUnicode_FromUnicode(result_buf, slicelength); 8042 PyObject_FREE(result_buf); 8043 return result; 8044 } 8045 } else { 8046 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 8047 return NULL; 8048 } 8049 } 8050 8051 static PyMappingMethods unicode_as_mapping = { 8052 (lenfunc)unicode_length, /* mp_length */ 8053 (binaryfunc)unicode_subscript, /* mp_subscript */ 8054 (objobjargproc)0, /* mp_ass_subscript */ 8055 }; 8056 8057 static Py_ssize_t 8058 unicode_buffer_getreadbuf(PyUnicodeObject *self, 8059 Py_ssize_t index, 8060 const void **ptr) 8061 { 8062 if (index != 0) { 8063 PyErr_SetString(PyExc_SystemError, 8064 "accessing non-existent unicode segment"); 8065 return -1; 8066 } 8067 *ptr = (void *) self->str; 8068 return PyUnicode_GET_DATA_SIZE(self); 8069 } 8070 8071 static Py_ssize_t 8072 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index, 8073 const void **ptr) 8074 { 8075 PyErr_SetString(PyExc_TypeError, 8076 "cannot use unicode as modifiable buffer"); 8077 return -1; 8078 } 8079 8080 static int 8081 
unicode_buffer_getsegcount(PyUnicodeObject *self, 8082 Py_ssize_t *lenp) 8083 { 8084 if (lenp) 8085 *lenp = PyUnicode_GET_DATA_SIZE(self); 8086 return 1; 8087 } 8088 8089 static Py_ssize_t 8090 unicode_buffer_getcharbuf(PyUnicodeObject *self, 8091 Py_ssize_t index, 8092 const void **ptr) 8093 { 8094 PyObject *str; 8095 8096 if (index != 0) { 8097 PyErr_SetString(PyExc_SystemError, 8098 "accessing non-existent unicode segment"); 8099 return -1; 8100 } 8101 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL); 8102 if (str == NULL) 8103 return -1; 8104 *ptr = (void *) PyString_AS_STRING(str); 8105 return PyString_GET_SIZE(str); 8106 } 8107 8108 /* Helpers for PyUnicode_Format() */ 8109 8110 static PyObject * 8111 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 8112 { 8113 Py_ssize_t argidx = *p_argidx; 8114 if (argidx < arglen) { 8115 (*p_argidx)++; 8116 if (arglen < 0) 8117 return args; 8118 else 8119 return PyTuple_GetItem(args, argidx); 8120 } 8121 PyErr_SetString(PyExc_TypeError, 8122 "not enough arguments for format string"); 8123 return NULL; 8124 } 8125 8126 #define F_LJUST (1<<0) 8127 #define F_SIGN (1<<1) 8128 #define F_BLANK (1<<2) 8129 #define F_ALT (1<<3) 8130 #define F_ZERO (1<<4) 8131 8132 static Py_ssize_t 8133 strtounicode(Py_UNICODE *buffer, const char *charbuffer) 8134 { 8135 register Py_ssize_t i; 8136 Py_ssize_t len = strlen(charbuffer); 8137 for (i = len - 1; i >= 0; i--) 8138 buffer[i] = (Py_UNICODE) charbuffer[i]; 8139 8140 return len; 8141 } 8142 8143 static int 8144 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x) 8145 { 8146 Py_ssize_t result; 8147 8148 PyOS_snprintf((char *)buffer, len, format, x); 8149 result = strtounicode(buffer, (char *)buffer); 8150 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 8151 } 8152 8153 /* XXX To save some code duplication, formatfloat/long/int could have been 8154 shared with stringobject.c, converting from 8-bit to Unicode after the 8155 formatting 
is done. */ 8156 8157 /* Returns a new reference to a PyUnicode object, or NULL on failure. */ 8158 8159 static PyObject * 8160 formatfloat(PyObject *v, int flags, int prec, int type) 8161 { 8162 char *p; 8163 PyObject *result; 8164 double x; 8165 8166 x = PyFloat_AsDouble(v); 8167 if (x == -1.0 && PyErr_Occurred()) 8168 return NULL; 8169 8170 if (prec < 0) 8171 prec = 6; 8172 8173 p = PyOS_double_to_string(x, type, prec, 8174 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); 8175 if (p == NULL) 8176 return NULL; 8177 result = PyUnicode_FromStringAndSize(p, strlen(p)); 8178 PyMem_Free(p); 8179 return result; 8180 } 8181 8182 static PyObject* 8183 formatlong(PyObject *val, int flags, int prec, int type) 8184 { 8185 char *buf; 8186 int i, len; 8187 PyObject *str; /* temporary string object. */ 8188 PyUnicodeObject *result; 8189 8190 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); 8191 if (!str) 8192 return NULL; 8193 result = _PyUnicode_New(len); 8194 if (!result) { 8195 Py_DECREF(str); 8196 return NULL; 8197 } 8198 for (i = 0; i < len; i++) 8199 result->str[i] = buf[i]; 8200 result->str[len] = 0; 8201 Py_DECREF(str); 8202 return (PyObject*)result; 8203 } 8204 8205 static int 8206 formatint(Py_UNICODE *buf, 8207 size_t buflen, 8208 int flags, 8209 int prec, 8210 int type, 8211 PyObject *v) 8212 { 8213 /* fmt = '%#.' + `prec` + 'l' + `type` 8214 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 8215 * + 1 + 1 8216 * = 24 8217 */ 8218 char fmt[64]; /* plenty big enough! 
*/ 8219 char *sign; 8220 long x; 8221 8222 x = PyInt_AsLong(v); 8223 if (x == -1 && PyErr_Occurred()) 8224 return -1; 8225 if (x < 0 && type == 'u') { 8226 type = 'd'; 8227 } 8228 if (x < 0 && (type == 'x' || type == 'X' || type == 'o')) 8229 sign = "-"; 8230 else 8231 sign = ""; 8232 if (prec < 0) 8233 prec = 1; 8234 8235 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal)) 8236 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11 8237 */ 8238 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) { 8239 PyErr_SetString(PyExc_OverflowError, 8240 "formatted integer is too long (precision too large?)"); 8241 return -1; 8242 } 8243 8244 if ((flags & F_ALT) && 8245 (type == 'x' || type == 'X')) { 8246 /* When converting under %#x or %#X, there are a number 8247 * of issues that cause pain: 8248 * - when 0 is being converted, the C standard leaves off 8249 * the '0x' or '0X', which is inconsistent with other 8250 * %#x/%#X conversions and inconsistent with Python's 8251 * hex() function 8252 * - there are platforms that violate the standard and 8253 * convert 0 with the '0x' or '0X' 8254 * (Metrowerks, Compaq Tru64) 8255 * - there are platforms that give '0x' when converting 8256 * under %#X, but convert 0 in accordance with the 8257 * standard (OS/2 EMX) 8258 * 8259 * We can achieve the desired consistency by inserting our 8260 * own '0x' or '0X' prefix, and substituting %x/%X in place 8261 * of %#x/%#X. 8262 * 8263 * Note that this is the same approach as used in 8264 * formatint() in stringobject.c 8265 */ 8266 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c", 8267 sign, type, prec, type); 8268 } 8269 else { 8270 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c", 8271 sign, (flags&F_ALT) ? 
"#" : "", 8272 prec, type); 8273 } 8274 if (sign[0]) 8275 return longtounicode(buf, buflen, fmt, -x); 8276 else 8277 return longtounicode(buf, buflen, fmt, x); 8278 } 8279 8280 static int 8281 formatchar(Py_UNICODE *buf, 8282 size_t buflen, 8283 PyObject *v) 8284 { 8285 PyObject *unistr; 8286 char *str; 8287 /* presume that the buffer is at least 2 characters long */ 8288 if (PyUnicode_Check(v)) { 8289 if (PyUnicode_GET_SIZE(v) != 1) 8290 goto onError; 8291 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 8292 } 8293 8294 else if (PyString_Check(v)) { 8295 if (PyString_GET_SIZE(v) != 1) 8296 goto onError; 8297 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail 8298 with a UnicodeDecodeError if 'char' is not decodable with the 8299 default encoding (usually ASCII, but it might be something else) */ 8300 str = PyString_AS_STRING(v); 8301 if ((unsigned char)str[0] > 0x7F) { 8302 /* the char is not ASCII; try to decode the string using the 8303 default encoding and return -1 to let the UnicodeDecodeError 8304 be raised if the string can't be decoded */ 8305 unistr = PyUnicode_Decode(str, 1, NULL, "strict"); 8306 if (unistr == NULL) 8307 return -1; 8308 buf[0] = PyUnicode_AS_UNICODE(unistr)[0]; 8309 Py_DECREF(unistr); 8310 } 8311 else 8312 buf[0] = (Py_UNICODE)str[0]; 8313 } 8314 8315 else { 8316 /* Integer input truncated to a character */ 8317 long x; 8318 x = PyInt_AsLong(v); 8319 if (x == -1 && PyErr_Occurred()) 8320 goto onError; 8321 #ifdef Py_UNICODE_WIDE 8322 if (x < 0 || x > 0x10ffff) { 8323 PyErr_SetString(PyExc_OverflowError, 8324 "%c arg not in range(0x110000) " 8325 "(wide Python build)"); 8326 return -1; 8327 } 8328 #else 8329 if (x < 0 || x > 0xffff) { 8330 PyErr_SetString(PyExc_OverflowError, 8331 "%c arg not in range(0x10000) " 8332 "(narrow Python build)"); 8333 return -1; 8334 } 8335 #endif 8336 buf[0] = (Py_UNICODE) x; 8337 } 8338 buf[1] = '\0'; 8339 return 1; 8340 8341 onError: 8342 PyErr_SetString(PyExc_TypeError, 8343 "%c requires int or 
char"); 8344 return -1; 8345 } 8346 8347 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 8348 8349 FORMATBUFLEN is the length of the buffer in which the ints & 8350 chars are formatted. XXX This is a magic number. Each formatting 8351 routine does bounds checking to ensure no overflow, but a better 8352 solution may be to malloc a buffer of appropriate size for each 8353 format. For now, the current solution is sufficient. 8354 */ 8355 #define FORMATBUFLEN (size_t)120 8356 8357 PyObject *PyUnicode_Format(PyObject *format, 8358 PyObject *args) 8359 { 8360 Py_UNICODE *fmt, *res; 8361 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; 8362 int args_owned = 0; 8363 PyUnicodeObject *result = NULL; 8364 PyObject *dict = NULL; 8365 PyObject *uformat; 8366 8367 if (format == NULL || args == NULL) { 8368 PyErr_BadInternalCall(); 8369 return NULL; 8370 } 8371 uformat = PyUnicode_FromObject(format); 8372 if (uformat == NULL) 8373 return NULL; 8374 fmt = PyUnicode_AS_UNICODE(uformat); 8375 fmtcnt = PyUnicode_GET_SIZE(uformat); 8376 8377 reslen = rescnt = fmtcnt + 100; 8378 result = _PyUnicode_New(reslen); 8379 if (result == NULL) 8380 goto onError; 8381 res = PyUnicode_AS_UNICODE(result); 8382 8383 if (PyTuple_Check(args)) { 8384 arglen = PyTuple_Size(args); 8385 argidx = 0; 8386 } 8387 else { 8388 arglen = -1; 8389 argidx = -2; 8390 } 8391 if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript && 8392 !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type)) 8393 dict = args; 8394 8395 while (--fmtcnt >= 0) { 8396 if (*fmt != '%') { 8397 if (--rescnt < 0) { 8398 rescnt = fmtcnt + 100; 8399 reslen += rescnt; 8400 if (_PyUnicode_Resize(&result, reslen) < 0) 8401 goto onError; 8402 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 8403 --rescnt; 8404 } 8405 *res++ = *fmt++; 8406 } 8407 else { 8408 /* Got a format specifier */ 8409 int flags = 0; 8410 Py_ssize_t width = -1; 8411 int prec = -1; 8412 Py_UNICODE c = 
'\0'; 8413 Py_UNICODE fill; 8414 int isnumok; 8415 PyObject *v = NULL; 8416 PyObject *temp = NULL; 8417 Py_UNICODE *pbuf; 8418 Py_UNICODE sign; 8419 Py_ssize_t len; 8420 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */ 8421 8422 fmt++; 8423 if (*fmt == '(') { 8424 Py_UNICODE *keystart; 8425 Py_ssize_t keylen; 8426 PyObject *key; 8427 int pcount = 1; 8428 8429 if (dict == NULL) { 8430 PyErr_SetString(PyExc_TypeError, 8431 "format requires a mapping"); 8432 goto onError; 8433 } 8434 ++fmt; 8435 --fmtcnt; 8436 keystart = fmt; 8437 /* Skip over balanced parentheses */ 8438 while (pcount > 0 && --fmtcnt >= 0) { 8439 if (*fmt == ')') 8440 --pcount; 8441 else if (*fmt == '(') 8442 ++pcount; 8443 fmt++; 8444 } 8445 keylen = fmt - keystart - 1; 8446 if (fmtcnt < 0 || pcount > 0) { 8447 PyErr_SetString(PyExc_ValueError, 8448 "incomplete format key"); 8449 goto onError; 8450 } 8451 #if 0 8452 /* keys are converted to strings using UTF-8 and 8453 then looked up since Python uses strings to hold 8454 variables names etc. in its namespaces and we 8455 wouldn't want to break common idioms. 
*/ 8456 key = PyUnicode_EncodeUTF8(keystart, 8457 keylen, 8458 NULL); 8459 #else 8460 key = PyUnicode_FromUnicode(keystart, keylen); 8461 #endif 8462 if (key == NULL) 8463 goto onError; 8464 if (args_owned) { 8465 Py_DECREF(args); 8466 args_owned = 0; 8467 } 8468 args = PyObject_GetItem(dict, key); 8469 Py_DECREF(key); 8470 if (args == NULL) { 8471 goto onError; 8472 } 8473 args_owned = 1; 8474 arglen = -1; 8475 argidx = -2; 8476 } 8477 while (--fmtcnt >= 0) { 8478 switch (c = *fmt++) { 8479 case '-': flags |= F_LJUST; continue; 8480 case '+': flags |= F_SIGN; continue; 8481 case ' ': flags |= F_BLANK; continue; 8482 case '#': flags |= F_ALT; continue; 8483 case '0': flags |= F_ZERO; continue; 8484 } 8485 break; 8486 } 8487 if (c == '*') { 8488 v = getnextarg(args, arglen, &argidx); 8489 if (v == NULL) 8490 goto onError; 8491 if (!PyInt_Check(v)) { 8492 PyErr_SetString(PyExc_TypeError, 8493 "* wants int"); 8494 goto onError; 8495 } 8496 width = PyInt_AsSsize_t(v); 8497 if (width == -1 && PyErr_Occurred()) 8498 goto onError; 8499 if (width < 0) { 8500 flags |= F_LJUST; 8501 width = -width; 8502 } 8503 if (--fmtcnt >= 0) 8504 c = *fmt++; 8505 } 8506 else if (c >= '0' && c <= '9') { 8507 width = c - '0'; 8508 while (--fmtcnt >= 0) { 8509 c = *fmt++; 8510 if (c < '0' || c > '9') 8511 break; 8512 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) { 8513 PyErr_SetString(PyExc_ValueError, 8514 "width too big"); 8515 goto onError; 8516 } 8517 width = width*10 + (c - '0'); 8518 } 8519 } 8520 if (c == '.') { 8521 prec = 0; 8522 if (--fmtcnt >= 0) 8523 c = *fmt++; 8524 if (c == '*') { 8525 v = getnextarg(args, arglen, &argidx); 8526 if (v == NULL) 8527 goto onError; 8528 if (!PyInt_Check(v)) { 8529 PyErr_SetString(PyExc_TypeError, 8530 "* wants int"); 8531 goto onError; 8532 } 8533 prec = _PyInt_AsInt(v); 8534 if (prec == -1 && PyErr_Occurred()) 8535 goto onError; 8536 if (prec < 0) 8537 prec = 0; 8538 if (--fmtcnt >= 0) 8539 c = *fmt++; 8540 } 8541 else if (c >= '0' && c <= 
'9') { 8542 prec = c - '0'; 8543 while (--fmtcnt >= 0) { 8544 c = *fmt++; 8545 if (c < '0' || c > '9') 8546 break; 8547 if (prec > (INT_MAX - ((int)c - '0')) / 10) { 8548 PyErr_SetString(PyExc_ValueError, 8549 "prec too big"); 8550 goto onError; 8551 } 8552 prec = prec*10 + (c - '0'); 8553 } 8554 } 8555 } /* prec */ 8556 if (fmtcnt >= 0) { 8557 if (c == 'h' || c == 'l' || c == 'L') { 8558 if (--fmtcnt >= 0) 8559 c = *fmt++; 8560 } 8561 } 8562 if (fmtcnt < 0) { 8563 PyErr_SetString(PyExc_ValueError, 8564 "incomplete format"); 8565 goto onError; 8566 } 8567 if (c != '%') { 8568 v = getnextarg(args, arglen, &argidx); 8569 if (v == NULL) 8570 goto onError; 8571 } 8572 sign = 0; 8573 fill = ' '; 8574 switch (c) { 8575 8576 case '%': 8577 pbuf = formatbuf; 8578 /* presume that buffer length is at least 1 */ 8579 pbuf[0] = '%'; 8580 len = 1; 8581 break; 8582 8583 case 's': 8584 case 'r': 8585 if (PyUnicode_CheckExact(v) && c == 's') { 8586 temp = v; 8587 Py_INCREF(temp); 8588 } 8589 else { 8590 PyObject *unicode; 8591 if (c == 's') 8592 temp = PyObject_Unicode(v); 8593 else 8594 temp = PyObject_Repr(v); 8595 if (temp == NULL) 8596 goto onError; 8597 if (PyUnicode_Check(temp)) 8598 /* nothing to do */; 8599 else if (PyString_Check(temp)) { 8600 /* convert to string to Unicode */ 8601 unicode = PyUnicode_Decode(PyString_AS_STRING(temp), 8602 PyString_GET_SIZE(temp), 8603 NULL, 8604 "strict"); 8605 Py_DECREF(temp); 8606 temp = unicode; 8607 if (temp == NULL) 8608 goto onError; 8609 } 8610 else { 8611 Py_DECREF(temp); 8612 PyErr_SetString(PyExc_TypeError, 8613 "%s argument has non-string str()"); 8614 goto onError; 8615 } 8616 } 8617 pbuf = PyUnicode_AS_UNICODE(temp); 8618 len = PyUnicode_GET_SIZE(temp); 8619 if (prec >= 0 && len > prec) 8620 len = prec; 8621 break; 8622 8623 case 'i': 8624 case 'd': 8625 case 'u': 8626 case 'o': 8627 case 'x': 8628 case 'X': 8629 if (c == 'i') 8630 c = 'd'; 8631 isnumok = 0; 8632 if (PyNumber_Check(v)) { 8633 PyObject *iobj=NULL; 8634 8635 
if (PyInt_Check(v) || (PyLong_Check(v))) { 8636 iobj = v; 8637 Py_INCREF(iobj); 8638 } 8639 else { 8640 iobj = PyNumber_Int(v); 8641 if (iobj==NULL) { 8642 PyErr_Clear(); 8643 iobj = PyNumber_Long(v); 8644 } 8645 } 8646 if (iobj!=NULL) { 8647 if (PyInt_Check(iobj)) { 8648 isnumok = 1; 8649 pbuf = formatbuf; 8650 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 8651 flags, prec, c, iobj); 8652 Py_DECREF(iobj); 8653 if (len < 0) 8654 goto onError; 8655 sign = 1; 8656 } 8657 else if (PyLong_Check(iobj)) { 8658 isnumok = 1; 8659 temp = formatlong(iobj, flags, prec, c); 8660 Py_DECREF(iobj); 8661 if (!temp) 8662 goto onError; 8663 pbuf = PyUnicode_AS_UNICODE(temp); 8664 len = PyUnicode_GET_SIZE(temp); 8665 sign = 1; 8666 } 8667 else { 8668 Py_DECREF(iobj); 8669 } 8670 } 8671 } 8672 if (!isnumok) { 8673 PyErr_Format(PyExc_TypeError, 8674 "%%%c format: a number is required, " 8675 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 8676 goto onError; 8677 } 8678 if (flags & F_ZERO) 8679 fill = '0'; 8680 break; 8681 8682 case 'e': 8683 case 'E': 8684 case 'f': 8685 case 'F': 8686 case 'g': 8687 case 'G': 8688 temp = formatfloat(v, flags, prec, c); 8689 if (temp == NULL) 8690 goto onError; 8691 pbuf = PyUnicode_AS_UNICODE(temp); 8692 len = PyUnicode_GET_SIZE(temp); 8693 sign = 1; 8694 if (flags & F_ZERO) 8695 fill = '0'; 8696 break; 8697 8698 case 'c': 8699 pbuf = formatbuf; 8700 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 8701 if (len < 0) 8702 goto onError; 8703 break; 8704 8705 default: 8706 PyErr_Format(PyExc_ValueError, 8707 "unsupported format character '%c' (0x%x) " 8708 "at index %zd", 8709 (31<=c && c<=126) ? 
(char)c : '?', 8710 (int)c, 8711 (Py_ssize_t)(fmt - 1 - 8712 PyUnicode_AS_UNICODE(uformat))); 8713 goto onError; 8714 } 8715 if (sign) { 8716 if (*pbuf == '-' || *pbuf == '+') { 8717 sign = *pbuf++; 8718 len--; 8719 } 8720 else if (flags & F_SIGN) 8721 sign = '+'; 8722 else if (flags & F_BLANK) 8723 sign = ' '; 8724 else 8725 sign = 0; 8726 } 8727 if (width < len) 8728 width = len; 8729 if (rescnt - (sign != 0) < width) { 8730 reslen -= rescnt; 8731 rescnt = width + fmtcnt + 100; 8732 reslen += rescnt; 8733 if (reslen < 0) { 8734 Py_XDECREF(temp); 8735 PyErr_NoMemory(); 8736 goto onError; 8737 } 8738 if (_PyUnicode_Resize(&result, reslen) < 0) { 8739 Py_XDECREF(temp); 8740 goto onError; 8741 } 8742 res = PyUnicode_AS_UNICODE(result) 8743 + reslen - rescnt; 8744 } 8745 if (sign) { 8746 if (fill != ' ') 8747 *res++ = sign; 8748 rescnt--; 8749 if (width > len) 8750 width--; 8751 } 8752 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 8753 assert(pbuf[0] == '0'); 8754 assert(pbuf[1] == c); 8755 if (fill != ' ') { 8756 *res++ = *pbuf++; 8757 *res++ = *pbuf++; 8758 } 8759 rescnt -= 2; 8760 width -= 2; 8761 if (width < 0) 8762 width = 0; 8763 len -= 2; 8764 } 8765 if (width > len && !(flags & F_LJUST)) { 8766 do { 8767 --rescnt; 8768 *res++ = fill; 8769 } while (--width > len); 8770 } 8771 if (fill == ' ') { 8772 if (sign) 8773 *res++ = sign; 8774 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 8775 assert(pbuf[0] == '0'); 8776 assert(pbuf[1] == c); 8777 *res++ = *pbuf++; 8778 *res++ = *pbuf++; 8779 } 8780 } 8781 Py_UNICODE_COPY(res, pbuf, len); 8782 res += len; 8783 rescnt -= len; 8784 while (--width >= len) { 8785 --rescnt; 8786 *res++ = ' '; 8787 } 8788 if (dict && (argidx < arglen) && c != '%') { 8789 PyErr_SetString(PyExc_TypeError, 8790 "not all arguments converted during string formatting"); 8791 Py_XDECREF(temp); 8792 goto onError; 8793 } 8794 Py_XDECREF(temp); 8795 } /* '%' */ 8796 } /* until end */ 8797 if (argidx < arglen && !dict) { 8798 
PyErr_SetString(PyExc_TypeError, 8799 "not all arguments converted during string formatting"); 8800 goto onError; 8801 } 8802 8803 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) 8804 goto onError; 8805 if (args_owned) { 8806 Py_DECREF(args); 8807 } 8808 Py_DECREF(uformat); 8809 return (PyObject *)result; 8810 8811 onError: 8812 Py_XDECREF(result); 8813 Py_DECREF(uformat); 8814 if (args_owned) { 8815 Py_DECREF(args); 8816 } 8817 return NULL; 8818 } 8819 8820 static PyBufferProcs unicode_as_buffer = { 8821 (readbufferproc) unicode_buffer_getreadbuf, 8822 (writebufferproc) unicode_buffer_getwritebuf, 8823 (segcountproc) unicode_buffer_getsegcount, 8824 (charbufferproc) unicode_buffer_getcharbuf, 8825 }; 8826 8827 static PyObject * 8828 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 8829 8830 static PyObject * 8831 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 8832 { 8833 PyObject *x = NULL; 8834 static char *kwlist[] = {"string", "encoding", "errors", 0}; 8835 char *encoding = NULL; 8836 char *errors = NULL; 8837 8838 if (type != &PyUnicode_Type) 8839 return unicode_subtype_new(type, args, kwds); 8840 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode", 8841 kwlist, &x, &encoding, &errors)) 8842 return NULL; 8843 if (x == NULL) 8844 return (PyObject *)_PyUnicode_New(0); 8845 if (encoding == NULL && errors == NULL) 8846 return PyObject_Unicode(x); 8847 else 8848 return PyUnicode_FromEncodedObject(x, encoding, errors); 8849 } 8850 8851 static PyObject * 8852 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 8853 { 8854 PyUnicodeObject *tmp, *pnew; 8855 Py_ssize_t n; 8856 8857 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 8858 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 8859 if (tmp == NULL) 8860 return NULL; 8861 assert(PyUnicode_Check(tmp)); 8862 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 8863 if (pnew == NULL) { 8864 Py_DECREF(tmp); 8865 
return NULL; 8866 } 8867 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1)); 8868 if (pnew->str == NULL) { 8869 _Py_ForgetReference((PyObject *)pnew); 8870 PyObject_Del(pnew); 8871 Py_DECREF(tmp); 8872 return PyErr_NoMemory(); 8873 } 8874 Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 8875 pnew->length = n; 8876 pnew->hash = tmp->hash; 8877 Py_DECREF(tmp); 8878 return (PyObject *)pnew; 8879 } 8880 8881 PyDoc_STRVAR(unicode_doc, 8882 "unicode(object='') -> unicode object\n\ 8883 unicode(string[, encoding[, errors]]) -> unicode object\n\ 8884 \n\ 8885 Create a new Unicode object from the given encoded string.\n\ 8886 encoding defaults to the current default string encoding.\n\ 8887 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 8888 8889 PyTypeObject PyUnicode_Type = { 8890 PyVarObject_HEAD_INIT(&PyType_Type, 0) 8891 "unicode", /* tp_name */ 8892 sizeof(PyUnicodeObject), /* tp_size */ 8893 0, /* tp_itemsize */ 8894 /* Slots */ 8895 (destructor)unicode_dealloc, /* tp_dealloc */ 8896 0, /* tp_print */ 8897 0, /* tp_getattr */ 8898 0, /* tp_setattr */ 8899 0, /* tp_compare */ 8900 unicode_repr, /* tp_repr */ 8901 &unicode_as_number, /* tp_as_number */ 8902 &unicode_as_sequence, /* tp_as_sequence */ 8903 &unicode_as_mapping, /* tp_as_mapping */ 8904 (hashfunc) unicode_hash, /* tp_hash*/ 8905 0, /* tp_call*/ 8906 (reprfunc) unicode_str, /* tp_str */ 8907 PyObject_GenericGetAttr, /* tp_getattro */ 8908 0, /* tp_setattro */ 8909 &unicode_as_buffer, /* tp_as_buffer */ 8910 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES | 8911 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 8912 unicode_doc, /* tp_doc */ 8913 0, /* tp_traverse */ 8914 0, /* tp_clear */ 8915 PyUnicode_RichCompare, /* tp_richcompare */ 8916 0, /* tp_weaklistoffset */ 8917 0, /* tp_iter */ 8918 0, /* tp_iternext */ 8919 unicode_methods, /* tp_methods */ 8920 0, /* tp_members */ 8921 0, /* tp_getset */ 8922 &PyBaseString_Type, /* tp_base */ 8923 0, /* 
tp_dict */ 8924 0, /* tp_descr_get */ 8925 0, /* tp_descr_set */ 8926 0, /* tp_dictoffset */ 8927 0, /* tp_init */ 8928 0, /* tp_alloc */ 8929 unicode_new, /* tp_new */ 8930 PyObject_Del, /* tp_free */ 8931 }; 8932 8933 /* Initialize the Unicode implementation */ 8934 8935 void _PyUnicode_Init(void) 8936 { 8937 /* XXX - move this array to unicodectype.c ? */ 8938 Py_UNICODE linebreak[] = { 8939 0x000A, /* LINE FEED */ 8940 0x000D, /* CARRIAGE RETURN */ 8941 0x001C, /* FILE SEPARATOR */ 8942 0x001D, /* GROUP SEPARATOR */ 8943 0x001E, /* RECORD SEPARATOR */ 8944 0x0085, /* NEXT LINE */ 8945 0x2028, /* LINE SEPARATOR */ 8946 0x2029, /* PARAGRAPH SEPARATOR */ 8947 }; 8948 8949 /* Init the implementation */ 8950 if (!unicode_empty) { 8951 unicode_empty = _PyUnicode_New(0); 8952 if (!unicode_empty) 8953 return; 8954 } 8955 8956 if (PyType_Ready(&PyUnicode_Type) < 0) 8957 Py_FatalError("Can't initialize 'unicode'"); 8958 8959 /* initialize the linebreak bloom filter */ 8960 bloom_linebreak = make_bloom_mask( 8961 linebreak, sizeof(linebreak) / sizeof(linebreak[0]) 8962 ); 8963 8964 PyType_Ready(&EncodingMapType); 8965 8966 if (PyType_Ready(&PyFieldNameIter_Type) < 0) 8967 Py_FatalError("Can't initialize field name iterator type"); 8968 8969 if (PyType_Ready(&PyFormatterIter_Type) < 0) 8970 Py_FatalError("Can't initialize formatter iter type"); 8971 } 8972 8973 /* Finalize the Unicode implementation */ 8974 8975 int 8976 PyUnicode_ClearFreeList(void) 8977 { 8978 int freelist_size = numfree; 8979 PyUnicodeObject *u; 8980 8981 for (u = free_list; u != NULL;) { 8982 PyUnicodeObject *v = u; 8983 u = *(PyUnicodeObject **)u; 8984 if (v->str) 8985 PyObject_DEL(v->str); 8986 Py_XDECREF(v->defenc); 8987 PyObject_Del(v); 8988 numfree--; 8989 } 8990 free_list = NULL; 8991 assert(numfree == 0); 8992 return freelist_size; 8993 } 8994 8995 void 8996 _PyUnicode_Fini(void) 8997 { 8998 int i; 8999 9000 Py_CLEAR(unicode_empty); 9001 9002 for (i = 0; i < 256; i++) 9003 
Py_CLEAR(unicode_latin1[i]); 9004 9005 (void)PyUnicode_ClearFreeList(); 9006 } 9007 9008 #ifdef __cplusplus 9009 } 9010 #endif 9011