1 /* 2 3 Unicode implementation based on original code by Fredrik Lundh, 4 modified by Marc-Andre Lemburg <mal (at) lemburg.com> according to the 5 Unicode Integration Proposal (see file Misc/unicode.txt). 6 7 Major speed upgrades to the method implementations at the Reykjavik 8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 9 10 Copyright (c) Corporation for National Research Initiatives. 11 12 -------------------------------------------------------------------- 13 The original string type implementation is: 14 15 Copyright (c) 1999 by Secret Labs AB 16 Copyright (c) 1999 by Fredrik Lundh 17 18 By obtaining, using, and/or copying this software and/or its 19 associated documentation, you agree that you have read, understood, 20 and will comply with the following terms and conditions: 21 22 Permission to use, copy, modify, and distribute this software and its 23 associated documentation for any purpose and without fee is hereby 24 granted, provided that the above copyright notice appears in all 25 copies, and that both that copyright notice and this permission notice 26 appear in supporting documentation, and that the name of Secret Labs 27 AB or the author not be used in advertising or publicity pertaining to 28 distribution of the software without specific, written prior 29 permission. 30 31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
38 -------------------------------------------------------------------- 39 40 */ 41 42 #define PY_SSIZE_T_CLEAN 43 #include "Python.h" 44 45 #include "unicodeobject.h" 46 #include "ucnhash.h" 47 48 #ifdef MS_WINDOWS 49 #include <windows.h> 50 #endif 51 52 /* Limit for the Unicode object free list */ 53 54 #define PyUnicode_MAXFREELIST 1024 55 56 /* Limit for the Unicode object free list stay alive optimization. 57 58 The implementation will keep allocated Unicode memory intact for 59 all objects on the free list having a size less than this 60 limit. This reduces malloc() overhead for small Unicode objects. 61 62 At worst this will result in PyUnicode_MAXFREELIST * 63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 64 malloc()-overhead) bytes of unused garbage. 65 66 Setting the limit to 0 effectively turns the feature off. 67 68 Note: This is an experimental feature ! If you get core dumps when 69 using Unicode objects, turn this feature off. 70 71 */ 72 73 #define KEEPALIVE_SIZE_LIMIT 9 74 75 /* Endianness switches; defaults to little endian */ 76 77 #ifdef WORDS_BIGENDIAN 78 # define BYTEORDER_IS_BIG_ENDIAN 79 #else 80 # define BYTEORDER_IS_LITTLE_ENDIAN 81 #endif 82 83 /* --- Globals ------------------------------------------------------------ 84 85 NOTE: In the interpreter's initialization phase, some globals are currently 86 initialized dynamically as needed. In the process Unicode objects may 87 be created before the Unicode type is ready. 88 89 */ 90 91 92 #ifdef __cplusplus 93 extern "C" { 94 #endif 95 96 /* Free list for Unicode objects */ 97 static PyUnicodeObject *free_list = NULL; 98 static int numfree = 0; 99 100 /* The empty Unicode object is shared to improve performance. 
*/ 101 static PyUnicodeObject *unicode_empty = NULL; 102 103 #define _Py_RETURN_UNICODE_EMPTY() \ 104 do { \ 105 if (unicode_empty != NULL) \ 106 Py_INCREF(unicode_empty); \ 107 else { \ 108 unicode_empty = _PyUnicode_New(0); \ 109 if (unicode_empty != NULL) \ 110 Py_INCREF(unicode_empty); \ 111 } \ 112 return (PyObject *)unicode_empty; \ 113 } while (0) 114 115 /* Single character Unicode strings in the Latin-1 range are being 116 shared as well. */ 117 static PyUnicodeObject *unicode_latin1[256] = {NULL}; 118 119 /* Default encoding to use and assume when NULL is passed as encoding 120 parameter; it is initialized by _PyUnicode_Init(). 121 122 Always use the PyUnicode_SetDefaultEncoding() and 123 PyUnicode_GetDefaultEncoding() APIs to access this global. 124 125 */ 126 static char unicode_default_encoding[100 + 1] = "ascii"; 127 128 /* Fast detection of the most frequent whitespace characters */ 129 const unsigned char _Py_ascii_whitespace[] = { 130 0, 0, 0, 0, 0, 0, 0, 0, 131 /* case 0x0009: * CHARACTER TABULATION */ 132 /* case 0x000A: * LINE FEED */ 133 /* case 0x000B: * LINE TABULATION */ 134 /* case 0x000C: * FORM FEED */ 135 /* case 0x000D: * CARRIAGE RETURN */ 136 0, 1, 1, 1, 1, 1, 0, 0, 137 0, 0, 0, 0, 0, 0, 0, 0, 138 /* case 0x001C: * FILE SEPARATOR */ 139 /* case 0x001D: * GROUP SEPARATOR */ 140 /* case 0x001E: * RECORD SEPARATOR */ 141 /* case 0x001F: * UNIT SEPARATOR */ 142 0, 0, 0, 0, 1, 1, 1, 1, 143 /* case 0x0020: * SPACE */ 144 1, 0, 0, 0, 0, 0, 0, 0, 145 0, 0, 0, 0, 0, 0, 0, 0, 146 0, 0, 0, 0, 0, 0, 0, 0, 147 0, 0, 0, 0, 0, 0, 0, 0, 148 149 0, 0, 0, 0, 0, 0, 0, 0, 150 0, 0, 0, 0, 0, 0, 0, 0, 151 0, 0, 0, 0, 0, 0, 0, 0, 152 0, 0, 0, 0, 0, 0, 0, 0, 153 0, 0, 0, 0, 0, 0, 0, 0, 154 0, 0, 0, 0, 0, 0, 0, 0, 155 0, 0, 0, 0, 0, 0, 0, 0, 156 0, 0, 0, 0, 0, 0, 0, 0 157 }; 158 159 /* Same for linebreaks */ 160 static unsigned char ascii_linebreak[] = { 161 0, 0, 0, 0, 0, 0, 0, 0, 162 /* 0x000A, * LINE FEED */ 163 /* 0x000B, * LINE TABULATION */ 164 /* 
0x000C, * FORM FEED */ 165 /* 0x000D, * CARRIAGE RETURN */ 166 0, 0, 1, 1, 1, 1, 0, 0, 167 0, 0, 0, 0, 0, 0, 0, 0, 168 /* 0x001C, * FILE SEPARATOR */ 169 /* 0x001D, * GROUP SEPARATOR */ 170 /* 0x001E, * RECORD SEPARATOR */ 171 0, 0, 0, 0, 1, 1, 1, 0, 172 0, 0, 0, 0, 0, 0, 0, 0, 173 0, 0, 0, 0, 0, 0, 0, 0, 174 0, 0, 0, 0, 0, 0, 0, 0, 175 0, 0, 0, 0, 0, 0, 0, 0, 176 177 0, 0, 0, 0, 0, 0, 0, 0, 178 0, 0, 0, 0, 0, 0, 0, 0, 179 0, 0, 0, 0, 0, 0, 0, 0, 180 0, 0, 0, 0, 0, 0, 0, 0, 181 0, 0, 0, 0, 0, 0, 0, 0, 182 0, 0, 0, 0, 0, 0, 0, 0, 183 0, 0, 0, 0, 0, 0, 0, 0, 184 0, 0, 0, 0, 0, 0, 0, 0 185 }; 186 187 188 Py_UNICODE 189 PyUnicode_GetMax(void) 190 { 191 #ifdef Py_UNICODE_WIDE 192 return 0x10FFFF; 193 #else 194 /* This is actually an illegal character, so it should 195 not be passed to unichr. */ 196 return 0xFFFF; 197 #endif 198 } 199 200 /* --- Bloom Filters ----------------------------------------------------- */ 201 202 /* stuff to implement simple "bloom filters" for Unicode characters. 203 to keep things simple, we use a single bitmask, using the least 5 204 bits from each unicode characters as the bit index. */ 205 206 /* the linebreak mask is set up by Unicode_Init below */ 207 208 #if LONG_BIT >= 128 209 #define BLOOM_WIDTH 128 210 #elif LONG_BIT >= 64 211 #define BLOOM_WIDTH 64 212 #elif LONG_BIT >= 32 213 #define BLOOM_WIDTH 32 214 #else 215 #error "LONG_BIT is smaller than 32" 216 #endif 217 218 #define BLOOM_MASK unsigned long 219 220 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; 221 222 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 223 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 224 225 #define BLOOM_LINEBREAK(ch) \ 226 ((ch) < 128U ? 
ascii_linebreak[(ch)] : \ 227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 228 229 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) 230 { 231 /* calculate simple bloom-style bitmask for a given unicode string */ 232 233 BLOOM_MASK mask; 234 Py_ssize_t i; 235 236 mask = 0; 237 for (i = 0; i < len; i++) 238 BLOOM_ADD(mask, ptr[i]); 239 240 return mask; 241 } 242 243 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) 244 { 245 Py_ssize_t i; 246 247 for (i = 0; i < setlen; i++) 248 if (set[i] == chr) 249 return 1; 250 251 return 0; 252 } 253 254 #define BLOOM_MEMBER(mask, chr, set, setlen) \ 255 BLOOM(mask, chr) && unicode_member(chr, set, setlen) 256 257 /* --- Unicode Object ----------------------------------------------------- */ 258 259 static 260 int unicode_resize(register PyUnicodeObject *unicode, 261 Py_ssize_t length) 262 { 263 void *oldstr; 264 265 /* Shortcut if there's nothing much to do. */ 266 if (unicode->length == length) 267 goto reset; 268 269 /* Resizing shared object (unicode_empty or single character 270 objects) in-place is not allowed. Use PyUnicode_Resize() 271 instead ! */ 272 273 if (unicode == unicode_empty || 274 (unicode->length == 1 && 275 unicode->str[0] < 256U && 276 unicode_latin1[unicode->str[0]] == unicode)) { 277 PyErr_SetString(PyExc_SystemError, 278 "can't resize shared unicode objects"); 279 return -1; 280 } 281 282 /* We allocate one more byte to make sure the string is Ux0000 terminated. 283 The overallocation is also used by fastsearch, which assumes that it's 284 safe to look at str[length] (without making any assumptions about what 285 it contains). 
*/ 286 287 oldstr = unicode->str; 288 unicode->str = PyObject_REALLOC(unicode->str, 289 sizeof(Py_UNICODE) * (length + 1)); 290 if (!unicode->str) { 291 unicode->str = (Py_UNICODE *)oldstr; 292 PyErr_NoMemory(); 293 return -1; 294 } 295 unicode->str[length] = 0; 296 unicode->length = length; 297 298 reset: 299 /* Reset the object caches */ 300 if (unicode->defenc) { 301 Py_CLEAR(unicode->defenc); 302 } 303 unicode->hash = -1; 304 305 return 0; 306 } 307 308 /* We allocate one more byte to make sure the string is 309 Ux0000 terminated; some code relies on that. 310 311 XXX This allocator could further be enhanced by assuring that the 312 free list never reduces its size below 1. 313 314 */ 315 316 static 317 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) 318 { 319 register PyUnicodeObject *unicode; 320 321 /* Optimization for empty strings */ 322 if (length == 0 && unicode_empty != NULL) { 323 Py_INCREF(unicode_empty); 324 return unicode_empty; 325 } 326 327 /* Ensure we won't overflow the size. */ 328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 329 return (PyUnicodeObject *)PyErr_NoMemory(); 330 } 331 332 /* Unicode freelist & memory allocation */ 333 if (free_list) { 334 unicode = free_list; 335 free_list = *(PyUnicodeObject **)unicode; 336 numfree--; 337 if (unicode->str) { 338 /* Keep-Alive optimization: we only upsize the buffer, 339 never downsize it. 
*/ 340 if ((unicode->length < length) && 341 unicode_resize(unicode, length) < 0) { 342 PyObject_DEL(unicode->str); 343 unicode->str = NULL; 344 } 345 } 346 else { 347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 349 } 350 PyObject_INIT(unicode, &PyUnicode_Type); 351 } 352 else { 353 size_t new_size; 354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 355 if (unicode == NULL) 356 return NULL; 357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 359 } 360 361 if (!unicode->str) { 362 PyErr_NoMemory(); 363 goto onError; 364 } 365 /* Initialize the first element to guard against cases where 366 * the caller fails before initializing str -- unicode_resize() 367 * reads str[0], and the Keep-Alive optimization can keep memory 368 * allocated for str alive across a call to unicode_dealloc(unicode). 369 * We don't want unicode_resize to read uninitialized memory in 370 * that case. 
371 */ 372 unicode->str[0] = 0; 373 unicode->str[length] = 0; 374 unicode->length = length; 375 unicode->hash = -1; 376 unicode->defenc = NULL; 377 return unicode; 378 379 onError: 380 /* XXX UNREF/NEWREF interface should be more symmetrical */ 381 _Py_DEC_REFTOTAL; 382 _Py_ForgetReference((PyObject *)unicode); 383 PyObject_Del(unicode); 384 return NULL; 385 } 386 387 static 388 void unicode_dealloc(register PyUnicodeObject *unicode) 389 { 390 if (PyUnicode_CheckExact(unicode) && 391 numfree < PyUnicode_MAXFREELIST) { 392 /* Keep-Alive optimization */ 393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 394 PyObject_DEL(unicode->str); 395 unicode->str = NULL; 396 unicode->length = 0; 397 } 398 if (unicode->defenc) { 399 Py_CLEAR(unicode->defenc); 400 } 401 /* Add to free list */ 402 *(PyUnicodeObject **)unicode = free_list; 403 free_list = unicode; 404 numfree++; 405 } 406 else { 407 PyObject_DEL(unicode->str); 408 Py_XDECREF(unicode->defenc); 409 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 410 } 411 } 412 413 static 414 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length) 415 { 416 register PyUnicodeObject *v; 417 418 /* Argument checks */ 419 if (unicode == NULL) { 420 PyErr_BadInternalCall(); 421 return -1; 422 } 423 v = *unicode; 424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) { 425 PyErr_BadInternalCall(); 426 return -1; 427 } 428 429 /* Resizing unicode_empty and single character objects is not 430 possible since these are being shared. We simply return a fresh 431 copy with the same Unicode content. */ 432 if (v->length != length && 433 (v == unicode_empty || v->length == 1)) { 434 PyUnicodeObject *w = _PyUnicode_New(length); 435 if (w == NULL) 436 return -1; 437 Py_UNICODE_COPY(w->str, v->str, 438 length < v->length ? 
length : v->length); 439 Py_DECREF(*unicode); 440 *unicode = w; 441 return 0; 442 } 443 444 /* Note that we don't have to modify *unicode for unshared Unicode 445 objects, since we can modify them in-place. */ 446 return unicode_resize(v, length); 447 } 448 449 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) 450 { 451 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length); 452 } 453 454 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 455 Py_ssize_t size) 456 { 457 PyUnicodeObject *unicode; 458 459 /* If the Unicode data is known at construction time, we can apply 460 some optimizations which share commonly used objects. */ 461 if (u != NULL) { 462 463 /* Optimization for empty strings */ 464 if (size == 0) 465 _Py_RETURN_UNICODE_EMPTY(); 466 467 /* Single character Unicode objects in the Latin-1 range are 468 shared when using this constructor */ 469 if (size == 1 && *u < 256) { 470 unicode = unicode_latin1[*u]; 471 if (!unicode) { 472 unicode = _PyUnicode_New(1); 473 if (!unicode) 474 return NULL; 475 unicode->str[0] = *u; 476 unicode_latin1[*u] = unicode; 477 } 478 Py_INCREF(unicode); 479 return (PyObject *)unicode; 480 } 481 } 482 483 unicode = _PyUnicode_New(size); 484 if (!unicode) 485 return NULL; 486 487 /* Copy the Unicode data into the new object */ 488 if (u != NULL) 489 Py_UNICODE_COPY(unicode->str, u, size); 490 491 return (PyObject *)unicode; 492 } 493 494 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 495 { 496 PyUnicodeObject *unicode; 497 498 if (size < 0) { 499 PyErr_SetString(PyExc_SystemError, 500 "Negative size passed to PyUnicode_FromStringAndSize"); 501 return NULL; 502 } 503 504 /* If the Unicode data is known at construction time, we can apply 505 some optimizations which share commonly used objects. 506 Also, this means the input must be UTF-8, so fall back to the 507 UTF-8 decoder at the end. 
*/ 508 if (u != NULL) { 509 510 /* Optimization for empty strings */ 511 if (size == 0) 512 _Py_RETURN_UNICODE_EMPTY(); 513 514 /* Single characters are shared when using this constructor. 515 Restrict to ASCII, since the input must be UTF-8. */ 516 if (size == 1 && Py_CHARMASK(*u) < 128) { 517 unicode = unicode_latin1[Py_CHARMASK(*u)]; 518 if (!unicode) { 519 unicode = _PyUnicode_New(1); 520 if (!unicode) 521 return NULL; 522 unicode->str[0] = Py_CHARMASK(*u); 523 unicode_latin1[Py_CHARMASK(*u)] = unicode; 524 } 525 Py_INCREF(unicode); 526 return (PyObject *)unicode; 527 } 528 529 return PyUnicode_DecodeUTF8(u, size, NULL); 530 } 531 532 unicode = _PyUnicode_New(size); 533 if (!unicode) 534 return NULL; 535 536 return (PyObject *)unicode; 537 } 538 539 PyObject *PyUnicode_FromString(const char *u) 540 { 541 size_t size = strlen(u); 542 if (size > PY_SSIZE_T_MAX) { 543 PyErr_SetString(PyExc_OverflowError, "input too long"); 544 return NULL; 545 } 546 547 return PyUnicode_FromStringAndSize(u, size); 548 } 549 550 /* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed 551 * by 'ptr', possibly combining surrogate pairs on narrow builds. 552 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character 553 * that should be returned and 'end' pointing to the end of the buffer. 554 * ('end' is used on narrow builds to detect a lone surrogate at the 555 * end of the buffer that should be returned unchanged.) 556 * The ptr and end arguments should be side-effect free and ptr must an lvalue. 557 * The type of the returned char is always Py_UCS4. 558 * 559 * Note: the macro advances ptr to next char, so it might have side-effects 560 * (especially if used with other macros). 
561 */ 562 563 /* helper macros used by _Py_UNICODE_NEXT */ 564 #define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF) 565 #define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF) 566 /* Join two surrogate characters and return a single Py_UCS4 value. */ 567 #define _Py_UNICODE_JOIN_SURROGATES(high, low) \ 568 (((((Py_UCS4)(high) & 0x03FF) << 10) | \ 569 ((Py_UCS4)(low) & 0x03FF)) + 0x10000) 570 571 #ifdef Py_UNICODE_WIDE 572 #define _Py_UNICODE_NEXT(ptr, end) *(ptr)++ 573 #else 574 #define _Py_UNICODE_NEXT(ptr, end) \ 575 (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) && \ 576 _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ? \ 577 ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \ 578 (Py_UCS4)*(ptr)++) 579 #endif 580 581 #ifdef HAVE_WCHAR_H 582 583 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4) 584 # define CONVERT_WCHAR_TO_SURROGATES 585 #endif 586 587 #ifdef CONVERT_WCHAR_TO_SURROGATES 588 589 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need 590 to convert from UTF32 to UTF16. 
*/ 591 592 PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 593 Py_ssize_t size) 594 { 595 PyUnicodeObject *unicode; 596 register Py_ssize_t i; 597 Py_ssize_t alloc; 598 const wchar_t *orig_w; 599 600 if (w == NULL) { 601 PyErr_BadInternalCall(); 602 return NULL; 603 } 604 605 alloc = size; 606 orig_w = w; 607 for (i = size; i > 0; i--) { 608 if (*w > 0xFFFF) 609 alloc++; 610 w++; 611 } 612 w = orig_w; 613 unicode = _PyUnicode_New(alloc); 614 if (!unicode) 615 return NULL; 616 617 /* Copy the wchar_t data into the new object */ 618 { 619 register Py_UNICODE *u; 620 u = PyUnicode_AS_UNICODE(unicode); 621 for (i = size; i > 0; i--) { 622 if (*w > 0xFFFF) { 623 wchar_t ordinal = *w++; 624 ordinal -= 0x10000; 625 *u++ = 0xD800 | (ordinal >> 10); 626 *u++ = 0xDC00 | (ordinal & 0x3FF); 627 } 628 else 629 *u++ = *w++; 630 } 631 } 632 return (PyObject *)unicode; 633 } 634 635 #else 636 637 PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 638 Py_ssize_t size) 639 { 640 PyUnicodeObject *unicode; 641 642 if (w == NULL) { 643 PyErr_BadInternalCall(); 644 return NULL; 645 } 646 647 unicode = _PyUnicode_New(size); 648 if (!unicode) 649 return NULL; 650 651 /* Copy the wchar_t data into the new object */ 652 #ifdef HAVE_USABLE_WCHAR_T 653 memcpy(unicode->str, w, size * sizeof(wchar_t)); 654 #else 655 { 656 register Py_UNICODE *u; 657 register Py_ssize_t i; 658 u = PyUnicode_AS_UNICODE(unicode); 659 for (i = size; i > 0; i--) 660 *u++ = *w++; 661 } 662 #endif 663 664 return (PyObject *)unicode; 665 } 666 667 #endif /* CONVERT_WCHAR_TO_SURROGATES */ 668 669 #undef CONVERT_WCHAR_TO_SURROGATES 670 671 static void 672 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c) 673 { 674 *fmt++ = '%'; 675 if (width) { 676 if (zeropad) 677 *fmt++ = '0'; 678 fmt += sprintf(fmt, "%d", width); 679 } 680 if (precision) 681 fmt += sprintf(fmt, ".%d", precision); 682 if (longflag) 683 *fmt++ = 'l'; 684 else if (size_tflag) { 685 char 
*f = PY_FORMAT_SIZE_T; 686 while (*f) 687 *fmt++ = *f++; 688 } 689 *fmt++ = c; 690 *fmt = '\0'; 691 } 692 693 #define appendstring(string) \ 694 do { \ 695 for (copy = string;*copy; copy++) { \ 696 *s++ = (unsigned char)*copy; \ 697 } \ 698 } while (0) 699 700 PyObject * 701 PyUnicode_FromFormatV(const char *format, va_list vargs) 702 { 703 va_list count; 704 Py_ssize_t callcount = 0; 705 PyObject **callresults = NULL; 706 PyObject **callresult = NULL; 707 Py_ssize_t n = 0; 708 int width = 0; 709 int precision = 0; 710 int zeropad; 711 const char* f; 712 Py_UNICODE *s; 713 PyObject *string; 714 /* used by sprintf */ 715 char buffer[21]; 716 /* use abuffer instead of buffer, if we need more space 717 * (which can happen if there's a format specifier with width). */ 718 char *abuffer = NULL; 719 char *realbuffer; 720 Py_ssize_t abuffersize = 0; 721 char fmt[60]; /* should be enough for %0width.precisionld */ 722 const char *copy; 723 724 #ifdef VA_LIST_IS_ARRAY 725 Py_MEMCPY(count, vargs, sizeof(va_list)); 726 #else 727 #ifdef __va_copy 728 __va_copy(count, vargs); 729 #else 730 count = vargs; 731 #endif 732 #endif 733 /* step 1: count the number of %S/%R/%s format specifications 734 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these 735 * objects once during step 3 and put the result in an array) */ 736 for (f = format; *f; f++) { 737 if (*f == '%') { 738 f++; 739 while (*f && *f != '%' && !isalpha((unsigned)*f)) 740 f++; 741 if (!*f) 742 break; 743 if (*f == 's' || *f=='S' || *f=='R') 744 ++callcount; 745 } 746 } 747 /* step 2: allocate memory for the results of 748 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 749 if (callcount) { 750 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); 751 if (!callresults) { 752 PyErr_NoMemory(); 753 return NULL; 754 } 755 callresult = callresults; 756 } 757 /* step 3: figure out how large a buffer we need */ 758 for (f = format; *f; f++) { 759 if (*f == '%') { 760 const char* p 
= f++; 761 width = 0; 762 while (isdigit((unsigned)*f)) 763 width = (width*10) + *f++ - '0'; 764 precision = 0; 765 if (*f == '.') { 766 f++; 767 while (isdigit((unsigned)*f)) 768 precision = (precision*10) + *f++ - '0'; 769 } 770 771 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since 772 * they don't affect the amount of space we reserve. 773 */ 774 if ((*f == 'l' || *f == 'z') && 775 (f[1] == 'd' || f[1] == 'u')) 776 ++f; 777 778 switch (*f) { 779 case 'c': 780 { 781 int ordinal = va_arg(count, int); 782 #ifdef Py_UNICODE_WIDE 783 if (ordinal < 0 || ordinal > 0x10ffff) { 784 PyErr_SetString(PyExc_OverflowError, 785 "%c arg not in range(0x110000) " 786 "(wide Python build)"); 787 goto fail; 788 } 789 #else 790 if (ordinal < 0 || ordinal > 0xffff) { 791 PyErr_SetString(PyExc_OverflowError, 792 "%c arg not in range(0x10000) " 793 "(narrow Python build)"); 794 goto fail; 795 } 796 #endif 797 /* fall through... */ 798 } 799 case '%': 800 n++; 801 break; 802 case 'd': case 'u': case 'i': case 'x': 803 (void) va_arg(count, int); 804 if (width < precision) 805 width = precision; 806 /* 20 bytes is enough to hold a 64-bit 807 integer. Decimal takes the most space. 808 This isn't enough for octal. 809 If a width is specified we need more 810 (which we allocate later). 
*/ 811 if (width < 20) 812 width = 20; 813 n += width; 814 if (abuffersize < width) 815 abuffersize = width; 816 break; 817 case 's': 818 { 819 /* UTF-8 */ 820 const char *s = va_arg(count, const char*); 821 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); 822 if (!str) 823 goto fail; 824 n += PyUnicode_GET_SIZE(str); 825 /* Remember the str and switch to the next slot */ 826 *callresult++ = str; 827 break; 828 } 829 case 'U': 830 { 831 PyObject *obj = va_arg(count, PyObject *); 832 assert(obj && PyUnicode_Check(obj)); 833 n += PyUnicode_GET_SIZE(obj); 834 break; 835 } 836 case 'V': 837 { 838 PyObject *obj = va_arg(count, PyObject *); 839 const char *str = va_arg(count, const char *); 840 assert(obj || str); 841 assert(!obj || PyUnicode_Check(obj)); 842 if (obj) 843 n += PyUnicode_GET_SIZE(obj); 844 else 845 n += strlen(str); 846 break; 847 } 848 case 'S': 849 { 850 PyObject *obj = va_arg(count, PyObject *); 851 PyObject *str; 852 assert(obj); 853 str = PyObject_Str(obj); 854 if (!str) 855 goto fail; 856 n += PyString_GET_SIZE(str); 857 /* Remember the str and switch to the next slot */ 858 *callresult++ = str; 859 break; 860 } 861 case 'R': 862 { 863 PyObject *obj = va_arg(count, PyObject *); 864 PyObject *repr; 865 assert(obj); 866 repr = PyObject_Repr(obj); 867 if (!repr) 868 goto fail; 869 n += PyUnicode_GET_SIZE(repr); 870 /* Remember the repr and switch to the next slot */ 871 *callresult++ = repr; 872 break; 873 } 874 case 'p': 875 (void) va_arg(count, int); 876 /* maximum 64-bit pointer representation: 877 * 0xffffffffffffffff 878 * so 19 characters is enough. 879 * XXX I count 18 -- what's the extra for? 880 */ 881 n += 19; 882 break; 883 default: 884 /* if we stumble upon an unknown 885 formatting code, copy the rest of 886 the format string to the output 887 string. 
(we cannot just skip the 888 code, since there's no way to know 889 what's in the argument list) */ 890 n += strlen(p); 891 goto expand; 892 } 893 } else 894 n++; 895 } 896 expand: 897 if (abuffersize > 20) { 898 /* add 1 for sprintf's trailing null byte */ 899 abuffer = PyObject_Malloc(abuffersize + 1); 900 if (!abuffer) { 901 PyErr_NoMemory(); 902 goto fail; 903 } 904 realbuffer = abuffer; 905 } 906 else 907 realbuffer = buffer; 908 /* step 4: fill the buffer */ 909 /* Since we've analyzed how much space we need for the worst case, 910 we don't have to resize the string. 911 There can be no errors beyond this point. */ 912 string = PyUnicode_FromUnicode(NULL, n); 913 if (!string) 914 goto fail; 915 916 s = PyUnicode_AS_UNICODE(string); 917 callresult = callresults; 918 919 for (f = format; *f; f++) { 920 if (*f == '%') { 921 const char* p = f++; 922 int longflag = 0; 923 int size_tflag = 0; 924 zeropad = (*f == '0'); 925 /* parse the width.precision part */ 926 width = 0; 927 while (isdigit((unsigned)*f)) 928 width = (width*10) + *f++ - '0'; 929 precision = 0; 930 if (*f == '.') { 931 f++; 932 while (isdigit((unsigned)*f)) 933 precision = (precision*10) + *f++ - '0'; 934 } 935 /* handle the long flag, but only for %ld and %lu. 936 others can be added when necessary. */ 937 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { 938 longflag = 1; 939 ++f; 940 } 941 /* handle the size_t flag. 
*/ 942 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { 943 size_tflag = 1; 944 ++f; 945 } 946 947 switch (*f) { 948 case 'c': 949 *s++ = va_arg(vargs, int); 950 break; 951 case 'd': 952 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd'); 953 if (longflag) 954 sprintf(realbuffer, fmt, va_arg(vargs, long)); 955 else if (size_tflag) 956 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t)); 957 else 958 sprintf(realbuffer, fmt, va_arg(vargs, int)); 959 appendstring(realbuffer); 960 break; 961 case 'u': 962 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u'); 963 if (longflag) 964 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long)); 965 else if (size_tflag) 966 sprintf(realbuffer, fmt, va_arg(vargs, size_t)); 967 else 968 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int)); 969 appendstring(realbuffer); 970 break; 971 case 'i': 972 makefmt(fmt, 0, 0, zeropad, width, precision, 'i'); 973 sprintf(realbuffer, fmt, va_arg(vargs, int)); 974 appendstring(realbuffer); 975 break; 976 case 'x': 977 makefmt(fmt, 0, 0, zeropad, width, precision, 'x'); 978 sprintf(realbuffer, fmt, va_arg(vargs, int)); 979 appendstring(realbuffer); 980 break; 981 case 's': 982 { 983 /* unused, since we already have the result */ 984 (void) va_arg(vargs, char *); 985 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult), 986 PyUnicode_GET_SIZE(*callresult)); 987 s += PyUnicode_GET_SIZE(*callresult); 988 /* We're done with the unicode()/repr() => forget it */ 989 Py_DECREF(*callresult); 990 /* switch to next unicode()/repr() result */ 991 ++callresult; 992 break; 993 } 994 case 'U': 995 { 996 PyObject *obj = va_arg(vargs, PyObject *); 997 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 998 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 999 s += size; 1000 break; 1001 } 1002 case 'V': 1003 { 1004 PyObject *obj = va_arg(vargs, PyObject *); 1005 const char *str = va_arg(vargs, const char *); 1006 if (obj) { 1007 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 1008 
Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 1009 s += size; 1010 } else { 1011 appendstring(str); 1012 } 1013 break; 1014 } 1015 case 'S': 1016 case 'R': 1017 { 1018 const char *str = PyString_AS_STRING(*callresult); 1019 /* unused, since we already have the result */ 1020 (void) va_arg(vargs, PyObject *); 1021 appendstring(str); 1022 /* We're done with the unicode()/repr() => forget it */ 1023 Py_DECREF(*callresult); 1024 /* switch to next unicode()/repr() result */ 1025 ++callresult; 1026 break; 1027 } 1028 case 'p': 1029 sprintf(buffer, "%p", va_arg(vargs, void*)); 1030 /* %p is ill-defined: ensure leading 0x. */ 1031 if (buffer[1] == 'X') 1032 buffer[1] = 'x'; 1033 else if (buffer[1] != 'x') { 1034 memmove(buffer+2, buffer, strlen(buffer)+1); 1035 buffer[0] = '0'; 1036 buffer[1] = 'x'; 1037 } 1038 appendstring(buffer); 1039 break; 1040 case '%': 1041 *s++ = '%'; 1042 break; 1043 default: 1044 appendstring(p); 1045 goto end; 1046 } 1047 } else 1048 *s++ = *f; 1049 } 1050 1051 end: 1052 if (callresults) 1053 PyObject_Free(callresults); 1054 if (abuffer) 1055 PyObject_Free(abuffer); 1056 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); 1057 return string; 1058 fail: 1059 if (callresults) { 1060 PyObject **callresult2 = callresults; 1061 while (callresult2 < callresult) { 1062 Py_DECREF(*callresult2); 1063 ++callresult2; 1064 } 1065 PyObject_Free(callresults); 1066 } 1067 if (abuffer) 1068 PyObject_Free(abuffer); 1069 return NULL; 1070 } 1071 1072 #undef appendstring 1073 1074 PyObject * 1075 PyUnicode_FromFormat(const char *format, ...) 
1076 { 1077 PyObject* ret; 1078 va_list vargs; 1079 1080 #ifdef HAVE_STDARG_PROTOTYPES 1081 va_start(vargs, format); 1082 #else 1083 va_start(vargs); 1084 #endif 1085 ret = PyUnicode_FromFormatV(format, vargs); 1086 va_end(vargs); 1087 return ret; 1088 } 1089 1090 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode, 1091 wchar_t *w, 1092 Py_ssize_t size) 1093 { 1094 if (unicode == NULL) { 1095 PyErr_BadInternalCall(); 1096 return -1; 1097 } 1098 1099 /* If possible, try to copy the 0-termination as well */ 1100 if (size > PyUnicode_GET_SIZE(unicode)) 1101 size = PyUnicode_GET_SIZE(unicode) + 1; 1102 1103 #ifdef HAVE_USABLE_WCHAR_T 1104 memcpy(w, unicode->str, size * sizeof(wchar_t)); 1105 #else 1106 { 1107 register Py_UNICODE *u; 1108 register Py_ssize_t i; 1109 u = PyUnicode_AS_UNICODE(unicode); 1110 for (i = size; i > 0; i--) 1111 *w++ = *u++; 1112 } 1113 #endif 1114 1115 if (size > PyUnicode_GET_SIZE(unicode)) 1116 return PyUnicode_GET_SIZE(unicode); 1117 else 1118 return size; 1119 } 1120 1121 #endif 1122 1123 PyObject *PyUnicode_FromOrdinal(int ordinal) 1124 { 1125 Py_UNICODE s[1]; 1126 1127 #ifdef Py_UNICODE_WIDE 1128 if (ordinal < 0 || ordinal > 0x10ffff) { 1129 PyErr_SetString(PyExc_ValueError, 1130 "unichr() arg not in range(0x110000) " 1131 "(wide Python build)"); 1132 return NULL; 1133 } 1134 #else 1135 if (ordinal < 0 || ordinal > 0xffff) { 1136 PyErr_SetString(PyExc_ValueError, 1137 "unichr() arg not in range(0x10000) " 1138 "(narrow Python build)"); 1139 return NULL; 1140 } 1141 #endif 1142 1143 s[0] = (Py_UNICODE)ordinal; 1144 return PyUnicode_FromUnicode(s, 1); 1145 } 1146 1147 PyObject *PyUnicode_FromObject(register PyObject *obj) 1148 { 1149 /* XXX Perhaps we should make this API an alias of 1150 PyObject_Unicode() instead ?! 
*/ 1151 if (PyUnicode_CheckExact(obj)) { 1152 Py_INCREF(obj); 1153 return obj; 1154 } 1155 if (PyUnicode_Check(obj)) { 1156 /* For a Unicode subtype that's not a Unicode object, 1157 return a true Unicode object with the same data. */ 1158 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 1159 PyUnicode_GET_SIZE(obj)); 1160 } 1161 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 1162 } 1163 1164 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 1165 const char *encoding, 1166 const char *errors) 1167 { 1168 const char *s = NULL; 1169 Py_ssize_t len; 1170 PyObject *v; 1171 1172 if (obj == NULL) { 1173 PyErr_BadInternalCall(); 1174 return NULL; 1175 } 1176 1177 #if 0 1178 /* For b/w compatibility we also accept Unicode objects provided 1179 that no encodings is given and then redirect to 1180 PyObject_Unicode() which then applies the additional logic for 1181 Unicode subclasses. 1182 1183 NOTE: This API should really only be used for object which 1184 represent *encoded* Unicode ! 1185 1186 */ 1187 if (PyUnicode_Check(obj)) { 1188 if (encoding) { 1189 PyErr_SetString(PyExc_TypeError, 1190 "decoding Unicode is not supported"); 1191 return NULL; 1192 } 1193 return PyObject_Unicode(obj); 1194 } 1195 #else 1196 if (PyUnicode_Check(obj)) { 1197 PyErr_SetString(PyExc_TypeError, 1198 "decoding Unicode is not supported"); 1199 return NULL; 1200 } 1201 #endif 1202 1203 /* Coerce object */ 1204 if (PyString_Check(obj)) { 1205 s = PyString_AS_STRING(obj); 1206 len = PyString_GET_SIZE(obj); 1207 } 1208 else if (PyByteArray_Check(obj)) { 1209 /* Python 2.x specific */ 1210 PyErr_Format(PyExc_TypeError, 1211 "decoding bytearray is not supported"); 1212 return NULL; 1213 } 1214 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 1215 /* Overwrite the error message with something more useful in 1216 case of a TypeError. 
*/ 1217 if (PyErr_ExceptionMatches(PyExc_TypeError)) 1218 PyErr_Format(PyExc_TypeError, 1219 "coercing to Unicode: need string or buffer, " 1220 "%.80s found", 1221 Py_TYPE(obj)->tp_name); 1222 goto onError; 1223 } 1224 1225 /* Convert to Unicode */ 1226 if (len == 0) 1227 _Py_RETURN_UNICODE_EMPTY(); 1228 1229 v = PyUnicode_Decode(s, len, encoding, errors); 1230 return v; 1231 1232 onError: 1233 return NULL; 1234 } 1235 1236 PyObject *PyUnicode_Decode(const char *s, 1237 Py_ssize_t size, 1238 const char *encoding, 1239 const char *errors) 1240 { 1241 PyObject *buffer = NULL, *unicode; 1242 1243 if (encoding == NULL) 1244 encoding = PyUnicode_GetDefaultEncoding(); 1245 1246 /* Shortcuts for common default encodings */ 1247 if (strcmp(encoding, "utf-8") == 0) 1248 return PyUnicode_DecodeUTF8(s, size, errors); 1249 else if (strcmp(encoding, "latin-1") == 0) 1250 return PyUnicode_DecodeLatin1(s, size, errors); 1251 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1252 else if (strcmp(encoding, "mbcs") == 0) 1253 return PyUnicode_DecodeMBCS(s, size, errors); 1254 #endif 1255 else if (strcmp(encoding, "ascii") == 0) 1256 return PyUnicode_DecodeASCII(s, size, errors); 1257 1258 /* Decode via the codec registry */ 1259 buffer = PyBuffer_FromMemory((void *)s, size); 1260 if (buffer == NULL) 1261 goto onError; 1262 unicode = PyCodec_Decode(buffer, encoding, errors); 1263 if (unicode == NULL) 1264 goto onError; 1265 if (!PyUnicode_Check(unicode)) { 1266 PyErr_Format(PyExc_TypeError, 1267 "decoder did not return an unicode object (type=%.400s)", 1268 Py_TYPE(unicode)->tp_name); 1269 Py_DECREF(unicode); 1270 goto onError; 1271 } 1272 Py_DECREF(buffer); 1273 return unicode; 1274 1275 onError: 1276 Py_XDECREF(buffer); 1277 return NULL; 1278 } 1279 1280 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, 1281 const char *encoding, 1282 const char *errors) 1283 { 1284 PyObject *v; 1285 1286 if (!PyUnicode_Check(unicode)) { 1287 PyErr_BadArgument(); 1288 goto onError; 1289 
} 1290 1291 if (encoding == NULL) 1292 encoding = PyUnicode_GetDefaultEncoding(); 1293 1294 /* Decode via the codec registry */ 1295 v = PyCodec_Decode(unicode, encoding, errors); 1296 if (v == NULL) 1297 goto onError; 1298 return v; 1299 1300 onError: 1301 return NULL; 1302 } 1303 1304 PyObject *PyUnicode_Encode(const Py_UNICODE *s, 1305 Py_ssize_t size, 1306 const char *encoding, 1307 const char *errors) 1308 { 1309 PyObject *v, *unicode; 1310 1311 unicode = PyUnicode_FromUnicode(s, size); 1312 if (unicode == NULL) 1313 return NULL; 1314 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 1315 Py_DECREF(unicode); 1316 return v; 1317 } 1318 1319 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, 1320 const char *encoding, 1321 const char *errors) 1322 { 1323 PyObject *v; 1324 1325 if (!PyUnicode_Check(unicode)) { 1326 PyErr_BadArgument(); 1327 goto onError; 1328 } 1329 1330 if (encoding == NULL) 1331 encoding = PyUnicode_GetDefaultEncoding(); 1332 1333 /* Encode via the codec registry */ 1334 v = PyCodec_Encode(unicode, encoding, errors); 1335 if (v == NULL) 1336 goto onError; 1337 return v; 1338 1339 onError: 1340 return NULL; 1341 } 1342 1343 PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 1344 const char *encoding, 1345 const char *errors) 1346 { 1347 PyObject *v; 1348 1349 if (!PyUnicode_Check(unicode)) { 1350 PyErr_BadArgument(); 1351 goto onError; 1352 } 1353 1354 if (encoding == NULL) 1355 encoding = PyUnicode_GetDefaultEncoding(); 1356 1357 /* Shortcuts for common default encodings */ 1358 if (errors == NULL) { 1359 if (strcmp(encoding, "utf-8") == 0) 1360 return PyUnicode_AsUTF8String(unicode); 1361 else if (strcmp(encoding, "latin-1") == 0) 1362 return PyUnicode_AsLatin1String(unicode); 1363 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1364 else if (strcmp(encoding, "mbcs") == 0) 1365 return PyUnicode_AsMBCSString(unicode); 1366 #endif 1367 else if (strcmp(encoding, "ascii") == 0) 1368 return PyUnicode_AsASCIIString(unicode); 
1369 } 1370 1371 /* Encode via the codec registry */ 1372 v = PyCodec_Encode(unicode, encoding, errors); 1373 if (v == NULL) 1374 goto onError; 1375 if (!PyString_Check(v)) { 1376 PyErr_Format(PyExc_TypeError, 1377 "encoder did not return a string object (type=%.400s)", 1378 Py_TYPE(v)->tp_name); 1379 Py_DECREF(v); 1380 goto onError; 1381 } 1382 return v; 1383 1384 onError: 1385 return NULL; 1386 } 1387 1388 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 1389 const char *errors) 1390 { 1391 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 1392 1393 if (v) 1394 return v; 1395 v = PyUnicode_AsEncodedString(unicode, NULL, errors); 1396 if (v && errors == NULL) 1397 ((PyUnicodeObject *)unicode)->defenc = v; 1398 return v; 1399 } 1400 1401 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 1402 { 1403 if (!PyUnicode_Check(unicode)) { 1404 PyErr_BadArgument(); 1405 goto onError; 1406 } 1407 return PyUnicode_AS_UNICODE(unicode); 1408 1409 onError: 1410 return NULL; 1411 } 1412 1413 Py_ssize_t PyUnicode_GetSize(PyObject *unicode) 1414 { 1415 if (!PyUnicode_Check(unicode)) { 1416 PyErr_BadArgument(); 1417 goto onError; 1418 } 1419 return PyUnicode_GET_SIZE(unicode); 1420 1421 onError: 1422 return -1; 1423 } 1424 1425 const char *PyUnicode_GetDefaultEncoding(void) 1426 { 1427 return unicode_default_encoding; 1428 } 1429 1430 int PyUnicode_SetDefaultEncoding(const char *encoding) 1431 { 1432 PyObject *v; 1433 1434 /* Make sure the encoding is valid. As side effect, this also 1435 loads the encoding into the codec registry cache. 
*/ 1436 v = _PyCodec_Lookup(encoding); 1437 if (v == NULL) 1438 goto onError; 1439 Py_DECREF(v); 1440 strncpy(unicode_default_encoding, 1441 encoding, 1442 sizeof(unicode_default_encoding) - 1); 1443 return 0; 1444 1445 onError: 1446 return -1; 1447 } 1448 1449 /* error handling callback helper: 1450 build arguments, call the callback and check the arguments, 1451 if no exception occurred, copy the replacement to the output 1452 and adjust various state variables. 1453 return 0 on success, -1 on error 1454 */ 1455 1456 static 1457 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 1458 const char *encoding, const char *reason, 1459 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, 1460 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 1461 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 1462 { 1463 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple"; 1464 1465 PyObject *restuple = NULL; 1466 PyObject *repunicode = NULL; 1467 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 1468 Py_ssize_t requiredsize; 1469 Py_ssize_t newpos; 1470 Py_UNICODE *repptr; 1471 Py_ssize_t repsize; 1472 int res = -1; 1473 1474 if (*errorHandler == NULL) { 1475 *errorHandler = PyCodec_LookupError(errors); 1476 if (*errorHandler == NULL) 1477 goto onError; 1478 } 1479 1480 if (*exceptionObject == NULL) { 1481 *exceptionObject = PyUnicodeDecodeError_Create( 1482 encoding, input, insize, *startinpos, *endinpos, reason); 1483 if (*exceptionObject == NULL) 1484 goto onError; 1485 } 1486 else { 1487 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) 1488 goto onError; 1489 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) 1490 goto onError; 1491 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 1492 goto onError; 1493 } 1494 1495 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 1496 if (restuple == NULL) 1497 
goto onError; 1498 if (!PyTuple_Check(restuple)) { 1499 PyErr_SetString(PyExc_TypeError, &argparse[4]); 1500 goto onError; 1501 } 1502 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 1503 goto onError; 1504 if (newpos<0) 1505 newpos = insize+newpos; 1506 if (newpos<0 || newpos>insize) { 1507 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 1508 goto onError; 1509 } 1510 1511 /* need more space? (at least enough for what we 1512 have+the replacement+the rest of the string (starting 1513 at the new input position), so we won't have to check space 1514 when there are no errors in the rest of the string) */ 1515 repptr = PyUnicode_AS_UNICODE(repunicode); 1516 repsize = PyUnicode_GET_SIZE(repunicode); 1517 requiredsize = *outpos; 1518 if (requiredsize > PY_SSIZE_T_MAX - repsize) 1519 goto overflow; 1520 requiredsize += repsize; 1521 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos)) 1522 goto overflow; 1523 requiredsize += insize - newpos; 1524 if (requiredsize > outsize) { 1525 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize) 1526 requiredsize = 2*outsize; 1527 if (_PyUnicode_Resize(output, requiredsize) < 0) 1528 goto onError; 1529 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 1530 } 1531 *endinpos = newpos; 1532 *inptr = input + newpos; 1533 Py_UNICODE_COPY(*outptr, repptr, repsize); 1534 *outptr += repsize; 1535 *outpos += repsize; 1536 /* we made it! */ 1537 res = 0; 1538 1539 onError: 1540 Py_XDECREF(restuple); 1541 return res; 1542 1543 overflow: 1544 PyErr_SetString(PyExc_OverflowError, 1545 "decoded result is too long for a Python string"); 1546 goto onError; 1547 } 1548 1549 /* --- UTF-7 Codec -------------------------------------------------------- */ 1550 1551 /* See RFC2152 for details. We encode conservatively and decode liberally. */ 1552 1553 /* Three simple macros defining base-64. */ 1554 1555 /* Is c a base-64 character? 
*/ 1556 1557 #define IS_BASE64(c) \ 1558 (isalnum(c) || (c) == '+' || (c) == '/') 1559 1560 /* given that c is a base-64 character, what is its base-64 value? */ 1561 1562 #define FROM_BASE64(c) \ 1563 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 1564 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 1565 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 1566 (c) == '+' ? 62 : 63) 1567 1568 /* What is the base-64 character of the bottom 6 bits of n? */ 1569 1570 #define TO_BASE64(n) \ 1571 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 1572 1573 /* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 1574 * decoded as itself. We are permissive on decoding; the only ASCII 1575 * byte not decoding to itself is the + which begins a base64 1576 * string. */ 1577 1578 #define DECODE_DIRECT(c) \ 1579 ((c) <= 127 && (c) != '+') 1580 1581 /* The UTF-7 encoder treats ASCII characters differently according to 1582 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 1583 * the above). See RFC2152. This array identifies these different 1584 * sets: 1585 * 0 : "Set D" 1586 * alphanumeric and '(),-./:? 1587 * 1 : "Set O" 1588 * !"#$%&*;<=>@[]^_`{|} 1589 * 2 : "whitespace" 1590 * ht nl cr sp 1591 * 3 : special (must be base64 encoded) 1592 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 1593 */ 1594 1595 static 1596 char utf7_category[128] = { 1597 /* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 1598 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 1599 /* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 1600 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1601 /* sp ! " # $ % & ' ( ) * + , - . / */ 1602 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 1603 /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 
*/ 1604 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1605 /* @ A B C D E F G H I J K L M N O */ 1606 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1607 /* P Q R S T U V W X Y Z [ \ ] ^ _ */ 1608 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 1609 /* ` a b c d e f g h i j k l m n o */ 1610 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1611 /* p q r s t u v w x y z { | } ~ del */ 1612 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 1613 }; 1614 1615 /* ENCODE_DIRECT: this character should be encoded as itself. The 1616 * answer depends on whether we are encoding set O as itself, and also 1617 * on whether we are encoding whitespace as itself. RFC2152 makes it 1618 * clear that the answers to these questions vary between 1619 * applications, so this code needs to be flexible. */ 1620 1621 #define ENCODE_DIRECT(c, directO, directWS) \ 1622 ((c) < 128 && (c) > 0 && \ 1623 ((utf7_category[(c)] == 0) || \ 1624 (directWS && (utf7_category[(c)] == 2)) || \ 1625 (directO && (utf7_category[(c)] == 1)))) 1626 1627 PyObject *PyUnicode_DecodeUTF7(const char *s, 1628 Py_ssize_t size, 1629 const char *errors) 1630 { 1631 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 1632 } 1633 1634 /* The decoder. The only state we preserve is our read position, 1635 * i.e. how many characters we have consumed. So if we end in the 1636 * middle of a shift sequence we have to back off the read position 1637 * and the output to the beginning of the sequence, otherwise we lose 1638 * all the shift state (seen bits, number of bits seen, high 1639 * surrogate). 
*/ 1640 1641 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, 1642 Py_ssize_t size, 1643 const char *errors, 1644 Py_ssize_t *consumed) 1645 { 1646 const char *starts = s; 1647 Py_ssize_t startinpos; 1648 Py_ssize_t endinpos; 1649 Py_ssize_t outpos; 1650 const char *e; 1651 PyUnicodeObject *unicode; 1652 Py_UNICODE *p; 1653 const char *errmsg = ""; 1654 int inShift = 0; 1655 Py_UNICODE *shiftOutStart; 1656 unsigned int base64bits = 0; 1657 unsigned long base64buffer = 0; 1658 Py_UNICODE surrogate = 0; 1659 PyObject *errorHandler = NULL; 1660 PyObject *exc = NULL; 1661 1662 unicode = _PyUnicode_New(size); 1663 if (!unicode) 1664 return NULL; 1665 if (size == 0) { 1666 if (consumed) 1667 *consumed = 0; 1668 return (PyObject *)unicode; 1669 } 1670 1671 p = unicode->str; 1672 shiftOutStart = p; 1673 e = s + size; 1674 1675 while (s < e) { 1676 Py_UNICODE ch = (unsigned char) *s; 1677 1678 if (inShift) { /* in a base-64 section */ 1679 if (IS_BASE64(ch)) { /* consume a base-64 character */ 1680 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 1681 base64bits += 6; 1682 s++; 1683 if (base64bits >= 16) { 1684 /* we have enough bits for a UTF-16 value */ 1685 Py_UNICODE outCh = (Py_UNICODE) 1686 (base64buffer >> (base64bits-16)); 1687 base64bits -= 16; 1688 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 1689 assert(outCh <= 0xffff); 1690 if (surrogate) { 1691 /* expecting a second surrogate */ 1692 if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 1693 #ifdef Py_UNICODE_WIDE 1694 *p++ = (((surrogate & 0x3FF)<<10) 1695 | (outCh & 0x3FF)) + 0x10000; 1696 #else 1697 *p++ = surrogate; 1698 *p++ = outCh; 1699 #endif 1700 surrogate = 0; 1701 continue; 1702 } 1703 else { 1704 *p++ = surrogate; 1705 surrogate = 0; 1706 } 1707 } 1708 if (outCh >= 0xD800 && outCh <= 0xDBFF) { 1709 /* first surrogate */ 1710 surrogate = outCh; 1711 } 1712 else { 1713 *p++ = outCh; 1714 } 1715 } 1716 } 1717 else { /* now leaving a base-64 section */ 1718 inShift = 0; 1719 s++; 1720 if 
(surrogate) { 1721 *p++ = surrogate; 1722 surrogate = 0; 1723 } 1724 if (base64bits > 0) { /* left-over bits */ 1725 if (base64bits >= 6) { 1726 /* We've seen at least one base-64 character */ 1727 errmsg = "partial character in shift sequence"; 1728 goto utf7Error; 1729 } 1730 else { 1731 /* Some bits remain; they should be zero */ 1732 if (base64buffer != 0) { 1733 errmsg = "non-zero padding bits in shift sequence"; 1734 goto utf7Error; 1735 } 1736 } 1737 } 1738 if (ch != '-') { 1739 /* '-' is absorbed; other terminating 1740 characters are preserved */ 1741 *p++ = ch; 1742 } 1743 } 1744 } 1745 else if ( ch == '+' ) { 1746 startinpos = s-starts; 1747 s++; /* consume '+' */ 1748 if (s < e && *s == '-') { /* '+-' encodes '+' */ 1749 s++; 1750 *p++ = '+'; 1751 } 1752 else { /* begin base64-encoded section */ 1753 inShift = 1; 1754 shiftOutStart = p; 1755 base64bits = 0; 1756 base64buffer = 0; 1757 } 1758 } 1759 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 1760 *p++ = ch; 1761 s++; 1762 } 1763 else { 1764 startinpos = s-starts; 1765 s++; 1766 errmsg = "unexpected special character"; 1767 goto utf7Error; 1768 } 1769 continue; 1770 utf7Error: 1771 outpos = p-PyUnicode_AS_UNICODE(unicode); 1772 endinpos = s-starts; 1773 if (unicode_decode_call_errorhandler( 1774 errors, &errorHandler, 1775 "utf7", errmsg, 1776 starts, size, &startinpos, &endinpos, &exc, &s, 1777 &unicode, &outpos, &p)) 1778 goto onError; 1779 } 1780 1781 /* end of string */ 1782 1783 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 1784 /* if we're in an inconsistent state, that's an error */ 1785 if (surrogate || 1786 (base64bits >= 6) || 1787 (base64bits > 0 && base64buffer != 0)) { 1788 outpos = p-PyUnicode_AS_UNICODE(unicode); 1789 endinpos = size; 1790 if (unicode_decode_call_errorhandler( 1791 errors, &errorHandler, 1792 "utf7", "unterminated shift sequence", 1793 starts, size, &startinpos, &endinpos, &exc, &s, 1794 &unicode, &outpos, &p)) 1795 goto 
onError; 1796 } 1797 } 1798 1799 /* return state */ 1800 if (consumed) { 1801 if (inShift) { 1802 p = shiftOutStart; /* back off output */ 1803 *consumed = startinpos; 1804 } 1805 else { 1806 *consumed = s-starts; 1807 } 1808 } 1809 1810 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 1811 goto onError; 1812 1813 Py_XDECREF(errorHandler); 1814 Py_XDECREF(exc); 1815 return (PyObject *)unicode; 1816 1817 onError: 1818 Py_XDECREF(errorHandler); 1819 Py_XDECREF(exc); 1820 Py_DECREF(unicode); 1821 return NULL; 1822 } 1823 1824 1825 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 1826 Py_ssize_t size, 1827 int base64SetO, 1828 int base64WhiteSpace, 1829 const char *errors) 1830 { 1831 PyObject *v; 1832 /* It might be possible to tighten this worst case */ 1833 Py_ssize_t allocated = 8 * size; 1834 int inShift = 0; 1835 Py_ssize_t i = 0; 1836 unsigned int base64bits = 0; 1837 unsigned long base64buffer = 0; 1838 char * out; 1839 char * start; 1840 1841 if (allocated / 8 != size) 1842 return PyErr_NoMemory(); 1843 1844 if (size == 0) 1845 return PyString_FromStringAndSize(NULL, 0); 1846 1847 v = PyString_FromStringAndSize(NULL, allocated); 1848 if (v == NULL) 1849 return NULL; 1850 1851 start = out = PyString_AS_STRING(v); 1852 for (;i < size; ++i) { 1853 Py_UNICODE ch = s[i]; 1854 1855 if (inShift) { 1856 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 1857 /* shifting out */ 1858 if (base64bits) { /* output remaining bits */ 1859 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 1860 base64buffer = 0; 1861 base64bits = 0; 1862 } 1863 inShift = 0; 1864 /* Characters not in the BASE64 set implicitly unshift the sequence 1865 so no '-' is required, except if the character is itself a '-' */ 1866 if (IS_BASE64(ch) || ch == '-') { 1867 *out++ = '-'; 1868 } 1869 *out++ = (char) ch; 1870 } 1871 else { 1872 goto encode_char; 1873 } 1874 } 1875 else { /* not in a shift sequence */ 1876 if (ch == '+') { 1877 *out++ = '+'; 1878 *out++ = '-'; 
1879 } 1880 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 1881 *out++ = (char) ch; 1882 } 1883 else { 1884 *out++ = '+'; 1885 inShift = 1; 1886 goto encode_char; 1887 } 1888 } 1889 continue; 1890 encode_char: 1891 #ifdef Py_UNICODE_WIDE 1892 if (ch >= 0x10000) { 1893 /* code first surrogate */ 1894 base64bits += 16; 1895 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 1896 while (base64bits >= 6) { 1897 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 1898 base64bits -= 6; 1899 } 1900 /* prepare second surrogate */ 1901 ch = 0xDC00 | ((ch-0x10000) & 0x3FF); 1902 } 1903 #endif 1904 base64bits += 16; 1905 base64buffer = (base64buffer << 16) | ch; 1906 while (base64bits >= 6) { 1907 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 1908 base64bits -= 6; 1909 } 1910 } 1911 if (base64bits) 1912 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 1913 if (inShift) 1914 *out++ = '-'; 1915 1916 if (_PyString_Resize(&v, out - start)) 1917 return NULL; 1918 return v; 1919 } 1920 1921 #undef IS_BASE64 1922 #undef FROM_BASE64 1923 #undef TO_BASE64 1924 #undef DECODE_DIRECT 1925 #undef ENCODE_DIRECT 1926 1927 /* --- UTF-8 Codec -------------------------------------------------------- */ 1928 1929 static 1930 char utf8_code_length[256] = { 1931 /* Map UTF-8 encoded prefix byte to sequence length. Zero means 1932 illegal prefix. 
See RFC 3629 for details */ 1933 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 1934 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1935 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1936 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1937 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1938 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1939 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1940 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ 1941 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ 1942 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1943 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1944 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ 1945 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ 1946 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ 1947 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ 1948 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 1949 }; 1950 1951 PyObject *PyUnicode_DecodeUTF8(const char *s, 1952 Py_ssize_t size, 1953 const char *errors) 1954 { 1955 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 1956 } 1957 1958 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, 1959 Py_ssize_t size, 1960 const char *errors, 1961 Py_ssize_t *consumed) 1962 { 1963 const char *starts = s; 1964 int n; 1965 int k; 1966 Py_ssize_t startinpos; 1967 Py_ssize_t endinpos; 1968 Py_ssize_t outpos; 1969 const char *e; 1970 PyUnicodeObject *unicode; 1971 Py_UNICODE *p; 1972 const char *errmsg = ""; 1973 PyObject *errorHandler = NULL; 1974 PyObject *exc = NULL; 1975 1976 /* Note: size will always be longer than the resulting Unicode 1977 character count */ 1978 unicode = _PyUnicode_New(size); 1979 if (!unicode) 1980 return NULL; 1981 if (size == 0) { 1982 if (consumed) 1983 *consumed = 0; 1984 return (PyObject *)unicode; 1985 } 1986 1987 /* Unpack UTF-8 encoded data */ 1988 p = unicode->str; 1989 e = s + size; 1990 1991 while (s < e) { 
1992 Py_UCS4 ch = (unsigned char)*s; 1993 1994 if (ch < 0x80) { 1995 *p++ = (Py_UNICODE)ch; 1996 s++; 1997 continue; 1998 } 1999 2000 n = utf8_code_length[ch]; 2001 2002 if (s + n > e) { 2003 if (consumed) 2004 break; 2005 else { 2006 errmsg = "unexpected end of data"; 2007 startinpos = s-starts; 2008 endinpos = startinpos+1; 2009 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) 2010 endinpos++; 2011 goto utf8Error; 2012 } 2013 } 2014 2015 switch (n) { 2016 2017 case 0: 2018 errmsg = "invalid start byte"; 2019 startinpos = s-starts; 2020 endinpos = startinpos+1; 2021 goto utf8Error; 2022 2023 case 1: 2024 errmsg = "internal error"; 2025 startinpos = s-starts; 2026 endinpos = startinpos+1; 2027 goto utf8Error; 2028 2029 case 2: 2030 if ((s[1] & 0xc0) != 0x80) { 2031 errmsg = "invalid continuation byte"; 2032 startinpos = s-starts; 2033 endinpos = startinpos + 1; 2034 goto utf8Error; 2035 } 2036 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 2037 assert ((ch > 0x007F) && (ch <= 0x07FF)); 2038 *p++ = (Py_UNICODE)ch; 2039 break; 2040 2041 case 3: 2042 /* XXX: surrogates shouldn't be valid UTF-8! 2043 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 2044 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt 2045 Uncomment the 2 lines below to make them invalid, 2046 code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */ 2047 if ((s[1] & 0xc0) != 0x80 || 2048 (s[2] & 0xc0) != 0x80 || 2049 ((unsigned char)s[0] == 0xE0 && 2050 (unsigned char)s[1] < 0xA0)/* || 2051 ((unsigned char)s[0] == 0xED && 2052 (unsigned char)s[1] > 0x9F)*/) { 2053 errmsg = "invalid continuation byte"; 2054 startinpos = s-starts; 2055 endinpos = startinpos + 1; 2056 2057 /* if s[1] first two bits are 1 and 0, then the invalid 2058 continuation byte is s[2], so increment endinpos by 1, 2059 if not, s[1] is invalid and endinpos doesn't need to 2060 be incremented. 
*/ 2061 if ((s[1] & 0xC0) == 0x80) 2062 endinpos++; 2063 goto utf8Error; 2064 } 2065 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 2066 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 2067 *p++ = (Py_UNICODE)ch; 2068 break; 2069 2070 case 4: 2071 if ((s[1] & 0xc0) != 0x80 || 2072 (s[2] & 0xc0) != 0x80 || 2073 (s[3] & 0xc0) != 0x80 || 2074 ((unsigned char)s[0] == 0xF0 && 2075 (unsigned char)s[1] < 0x90) || 2076 ((unsigned char)s[0] == 0xF4 && 2077 (unsigned char)s[1] > 0x8F)) { 2078 errmsg = "invalid continuation byte"; 2079 startinpos = s-starts; 2080 endinpos = startinpos + 1; 2081 if ((s[1] & 0xC0) == 0x80) { 2082 endinpos++; 2083 if ((s[2] & 0xC0) == 0x80) 2084 endinpos++; 2085 } 2086 goto utf8Error; 2087 } 2088 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 2089 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 2090 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 2091 2092 #ifdef Py_UNICODE_WIDE 2093 *p++ = (Py_UNICODE)ch; 2094 #else 2095 /* compute and append the two surrogates: */ 2096 2097 /* translate from 10000..10FFFF to 0..FFFF */ 2098 ch -= 0x10000; 2099 2100 /* high surrogate = top 10 bits added to D800 */ 2101 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 2102 2103 /* low surrogate = bottom 10 bits added to DC00 */ 2104 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 2105 #endif 2106 break; 2107 } 2108 s += n; 2109 continue; 2110 2111 utf8Error: 2112 outpos = p-PyUnicode_AS_UNICODE(unicode); 2113 if (unicode_decode_call_errorhandler( 2114 errors, &errorHandler, 2115 "utf8", errmsg, 2116 starts, size, &startinpos, &endinpos, &exc, &s, 2117 &unicode, &outpos, &p)) 2118 goto onError; 2119 } 2120 if (consumed) 2121 *consumed = s-starts; 2122 2123 /* Adjust length */ 2124 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2125 goto onError; 2126 2127 Py_XDECREF(errorHandler); 2128 Py_XDECREF(exc); 2129 return (PyObject *)unicode; 2130 2131 onError: 2132 Py_XDECREF(errorHandler); 2133 Py_XDECREF(exc); 2134 Py_DECREF(unicode); 2135 return NULL; 2136 } 2137 
2138 /* Allocation strategy: if the string is short, convert into a stack buffer 2139 and allocate exactly as much space needed at the end. Else allocate the 2140 maximum possible needed (4 result bytes per Unicode character), and return 2141 the excess memory at the end. 2142 */ 2143 PyObject * 2144 PyUnicode_EncodeUTF8(const Py_UNICODE *s, 2145 Py_ssize_t size, 2146 const char *errors) 2147 { 2148 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 2149 2150 Py_ssize_t i; /* index into s of next input byte */ 2151 PyObject *v; /* result string object */ 2152 char *p; /* next free byte in output buffer */ 2153 Py_ssize_t nallocated; /* number of result bytes allocated */ 2154 Py_ssize_t nneeded; /* number of result bytes needed */ 2155 char stackbuf[MAX_SHORT_UNICHARS * 4]; 2156 2157 assert(s != NULL); 2158 assert(size >= 0); 2159 2160 if (size <= MAX_SHORT_UNICHARS) { 2161 /* Write into the stack buffer; nallocated can't overflow. 2162 * At the end, we'll allocate exactly as much heap space as it 2163 * turns out we need. 2164 */ 2165 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 2166 v = NULL; /* will allocate after we're done */ 2167 p = stackbuf; 2168 } 2169 else { 2170 /* Overallocate on the heap, and give the excess back at the end. */ 2171 nallocated = size * 4; 2172 if (nallocated / 4 != size) /* overflow! 
*/ 2173 return PyErr_NoMemory(); 2174 v = PyString_FromStringAndSize(NULL, nallocated); 2175 if (v == NULL) 2176 return NULL; 2177 p = PyString_AS_STRING(v); 2178 } 2179 2180 for (i = 0; i < size;) { 2181 Py_UCS4 ch = s[i++]; 2182 2183 if (ch < 0x80) 2184 /* Encode ASCII */ 2185 *p++ = (char) ch; 2186 2187 else if (ch < 0x0800) { 2188 /* Encode Latin-1 */ 2189 *p++ = (char)(0xc0 | (ch >> 6)); 2190 *p++ = (char)(0x80 | (ch & 0x3f)); 2191 } 2192 else { 2193 /* Encode UCS2 Unicode ordinals */ 2194 if (ch < 0x10000) { 2195 /* Special case: check for high surrogate */ 2196 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { 2197 Py_UCS4 ch2 = s[i]; 2198 /* Check for low surrogate and combine the two to 2199 form a UCS4 value */ 2200 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2201 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 2202 i++; 2203 goto encodeUCS4; 2204 } 2205 /* Fall through: handles isolated high surrogates */ 2206 } 2207 *p++ = (char)(0xe0 | (ch >> 12)); 2208 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2209 *p++ = (char)(0x80 | (ch & 0x3f)); 2210 continue; 2211 } 2212 encodeUCS4: 2213 /* Encode UCS4 Unicode ordinals */ 2214 *p++ = (char)(0xf0 | (ch >> 18)); 2215 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2216 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2217 *p++ = (char)(0x80 | (ch & 0x3f)); 2218 } 2219 } 2220 2221 if (v == NULL) { 2222 /* This was stack allocated. */ 2223 nneeded = p - stackbuf; 2224 assert(nneeded <= nallocated); 2225 v = PyString_FromStringAndSize(stackbuf, nneeded); 2226 } 2227 else { 2228 /* Cut back to size actually needed. 
*/ 2229 nneeded = p - PyString_AS_STRING(v); 2230 assert(nneeded <= nallocated); 2231 if (_PyString_Resize(&v, nneeded)) 2232 return NULL; 2233 } 2234 return v; 2235 2236 #undef MAX_SHORT_UNICHARS 2237 } 2238 2239 PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 2240 { 2241 if (!PyUnicode_Check(unicode)) { 2242 PyErr_BadArgument(); 2243 return NULL; 2244 } 2245 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 2246 PyUnicode_GET_SIZE(unicode), 2247 NULL); 2248 } 2249 2250 /* --- UTF-32 Codec ------------------------------------------------------- */ 2251 2252 PyObject * 2253 PyUnicode_DecodeUTF32(const char *s, 2254 Py_ssize_t size, 2255 const char *errors, 2256 int *byteorder) 2257 { 2258 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 2259 } 2260 2261 PyObject * 2262 PyUnicode_DecodeUTF32Stateful(const char *s, 2263 Py_ssize_t size, 2264 const char *errors, 2265 int *byteorder, 2266 Py_ssize_t *consumed) 2267 { 2268 const char *starts = s; 2269 Py_ssize_t startinpos; 2270 Py_ssize_t endinpos; 2271 Py_ssize_t outpos; 2272 PyUnicodeObject *unicode; 2273 Py_UNICODE *p; 2274 #ifndef Py_UNICODE_WIDE 2275 int pairs = 0; 2276 const unsigned char *qq; 2277 #else 2278 const int pairs = 0; 2279 #endif 2280 const unsigned char *q, *e; 2281 int bo = 0; /* assume native ordering by default */ 2282 const char *errmsg = ""; 2283 /* Offsets from q for retrieving bytes in the right order. */ 2284 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 2285 int iorder[] = {0, 1, 2, 3}; 2286 #else 2287 int iorder[] = {3, 2, 1, 0}; 2288 #endif 2289 PyObject *errorHandler = NULL; 2290 PyObject *exc = NULL; 2291 2292 q = (unsigned char *)s; 2293 e = q + size; 2294 2295 if (byteorder) 2296 bo = *byteorder; 2297 2298 /* Check for BOM marks (U+FEFF) in the input and adjust current 2299 byte order setting accordingly. In native mode, the leading BOM 2300 mark is skipped, in all other modes, it is copied to the output 2301 stream as-is (giving a ZWNBSP character). 
*/ 2302 if (bo == 0) { 2303 if (size >= 4) { 2304 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2305 (q[iorder[1]] << 8) | q[iorder[0]]; 2306 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 2307 if (bom == 0x0000FEFF) { 2308 q += 4; 2309 bo = -1; 2310 } 2311 else if (bom == 0xFFFE0000) { 2312 q += 4; 2313 bo = 1; 2314 } 2315 #else 2316 if (bom == 0x0000FEFF) { 2317 q += 4; 2318 bo = 1; 2319 } 2320 else if (bom == 0xFFFE0000) { 2321 q += 4; 2322 bo = -1; 2323 } 2324 #endif 2325 } 2326 } 2327 2328 if (bo == -1) { 2329 /* force LE */ 2330 iorder[0] = 0; 2331 iorder[1] = 1; 2332 iorder[2] = 2; 2333 iorder[3] = 3; 2334 } 2335 else if (bo == 1) { 2336 /* force BE */ 2337 iorder[0] = 3; 2338 iorder[1] = 2; 2339 iorder[2] = 1; 2340 iorder[3] = 0; 2341 } 2342 2343 /* On narrow builds we split characters outside the BMP into two 2344 code points => count how much extra space we need. */ 2345 #ifndef Py_UNICODE_WIDE 2346 for (qq = q; e - qq >= 4; qq += 4) 2347 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0) 2348 pairs++; 2349 #endif 2350 2351 /* This might be one to much, because of a BOM */ 2352 unicode = _PyUnicode_New((size+3)/4+pairs); 2353 if (!unicode) 2354 return NULL; 2355 if (size == 0) 2356 return (PyObject *)unicode; 2357 2358 /* Unpack UTF-32 encoded data */ 2359 p = unicode->str; 2360 2361 while (q < e) { 2362 Py_UCS4 ch; 2363 /* remaining bytes at the end? 
(size should be divisible by 4) */ 2364 if (e-q<4) { 2365 if (consumed) 2366 break; 2367 errmsg = "truncated data"; 2368 startinpos = ((const char *)q)-starts; 2369 endinpos = ((const char *)e)-starts; 2370 goto utf32Error; 2371 /* The remaining input chars are ignored if the callback 2372 chooses to skip the input */ 2373 } 2374 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2375 (q[iorder[1]] << 8) | q[iorder[0]]; 2376 2377 if (ch >= 0x110000) 2378 { 2379 errmsg = "code point not in range(0x110000)"; 2380 startinpos = ((const char *)q)-starts; 2381 endinpos = startinpos+4; 2382 goto utf32Error; 2383 } 2384 #ifndef Py_UNICODE_WIDE 2385 if (ch >= 0x10000) 2386 { 2387 *p++ = 0xD800 | ((ch-0x10000) >> 10); 2388 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 2389 } 2390 else 2391 #endif 2392 *p++ = ch; 2393 q += 4; 2394 continue; 2395 utf32Error: 2396 outpos = p-PyUnicode_AS_UNICODE(unicode); 2397 if (unicode_decode_call_errorhandler( 2398 errors, &errorHandler, 2399 "utf32", errmsg, 2400 starts, size, &startinpos, &endinpos, &exc, (const char **)&q, 2401 &unicode, &outpos, &p)) 2402 goto onError; 2403 } 2404 2405 if (byteorder) 2406 *byteorder = bo; 2407 2408 if (consumed) 2409 *consumed = (const char *)q-starts; 2410 2411 /* Adjust length */ 2412 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2413 goto onError; 2414 2415 Py_XDECREF(errorHandler); 2416 Py_XDECREF(exc); 2417 return (PyObject *)unicode; 2418 2419 onError: 2420 Py_DECREF(unicode); 2421 Py_XDECREF(errorHandler); 2422 Py_XDECREF(exc); 2423 return NULL; 2424 } 2425 2426 PyObject * 2427 PyUnicode_EncodeUTF32(const Py_UNICODE *s, 2428 Py_ssize_t size, 2429 const char *errors, 2430 int byteorder) 2431 { 2432 PyObject *v; 2433 unsigned char *p; 2434 Py_ssize_t nsize, bytesize; 2435 #ifndef Py_UNICODE_WIDE 2436 Py_ssize_t i, pairs; 2437 #else 2438 const int pairs = 0; 2439 #endif 2440 /* Offsets from p for storing byte pairs in the right order. 
*/ 2441 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 2442 int iorder[] = {0, 1, 2, 3}; 2443 #else 2444 int iorder[] = {3, 2, 1, 0}; 2445 #endif 2446 2447 #define STORECHAR(CH) \ 2448 do { \ 2449 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 2450 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 2451 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 2452 p[iorder[0]] = (CH) & 0xff; \ 2453 p += 4; \ 2454 } while(0) 2455 2456 /* In narrow builds we can output surrogate pairs as one code point, 2457 so we need less space. */ 2458 #ifndef Py_UNICODE_WIDE 2459 for (i = pairs = 0; i < size-1; i++) 2460 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 2461 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 2462 pairs++; 2463 #endif 2464 nsize = (size - pairs + (byteorder == 0)); 2465 bytesize = nsize * 4; 2466 if (bytesize / 4 != nsize) 2467 return PyErr_NoMemory(); 2468 v = PyString_FromStringAndSize(NULL, bytesize); 2469 if (v == NULL) 2470 return NULL; 2471 2472 p = (unsigned char *)PyString_AS_STRING(v); 2473 if (byteorder == 0) 2474 STORECHAR(0xFEFF); 2475 if (size == 0) 2476 return v; 2477 2478 if (byteorder == -1) { 2479 /* force LE */ 2480 iorder[0] = 0; 2481 iorder[1] = 1; 2482 iorder[2] = 2; 2483 iorder[3] = 3; 2484 } 2485 else if (byteorder == 1) { 2486 /* force BE */ 2487 iorder[0] = 3; 2488 iorder[1] = 2; 2489 iorder[2] = 1; 2490 iorder[3] = 0; 2491 } 2492 2493 while (size-- > 0) { 2494 Py_UCS4 ch = *s++; 2495 #ifndef Py_UNICODE_WIDE 2496 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 2497 Py_UCS4 ch2 = *s; 2498 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2499 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2500 s++; 2501 size--; 2502 } 2503 } 2504 #endif 2505 STORECHAR(ch); 2506 } 2507 return v; 2508 #undef STORECHAR 2509 } 2510 2511 PyObject *PyUnicode_AsUTF32String(PyObject *unicode) 2512 { 2513 if (!PyUnicode_Check(unicode)) { 2514 PyErr_BadArgument(); 2515 return NULL; 2516 } 2517 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 2518 PyUnicode_GET_SIZE(unicode), 2519 NULL, 2520 0); 2521 } 2522 2523 /* --- 
UTF-16 Codec ------------------------------------------------------- */ 2524 2525 PyObject * 2526 PyUnicode_DecodeUTF16(const char *s, 2527 Py_ssize_t size, 2528 const char *errors, 2529 int *byteorder) 2530 { 2531 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 2532 } 2533 2534 PyObject * 2535 PyUnicode_DecodeUTF16Stateful(const char *s, 2536 Py_ssize_t size, 2537 const char *errors, 2538 int *byteorder, 2539 Py_ssize_t *consumed) 2540 { 2541 const char *starts = s; 2542 Py_ssize_t startinpos; 2543 Py_ssize_t endinpos; 2544 Py_ssize_t outpos; 2545 PyUnicodeObject *unicode; 2546 Py_UNICODE *p; 2547 const unsigned char *q, *e; 2548 int bo = 0; /* assume native ordering by default */ 2549 const char *errmsg = ""; 2550 /* Offsets from q for retrieving byte pairs in the right order. */ 2551 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 2552 int ihi = 1, ilo = 0; 2553 #else 2554 int ihi = 0, ilo = 1; 2555 #endif 2556 PyObject *errorHandler = NULL; 2557 PyObject *exc = NULL; 2558 2559 /* Note: size will always be longer than the resulting Unicode 2560 character count */ 2561 unicode = _PyUnicode_New(size); 2562 if (!unicode) 2563 return NULL; 2564 if (size == 0) 2565 return (PyObject *)unicode; 2566 2567 /* Unpack UTF-16 encoded data */ 2568 p = unicode->str; 2569 q = (unsigned char *)s; 2570 e = q + size; 2571 2572 if (byteorder) 2573 bo = *byteorder; 2574 2575 /* Check for BOM marks (U+FEFF) in the input and adjust current 2576 byte order setting accordingly. In native mode, the leading BOM 2577 mark is skipped, in all other modes, it is copied to the output 2578 stream as-is (giving a ZWNBSP character). 
*/ 2579 if (bo == 0) { 2580 if (size >= 2) { 2581 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 2582 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 2583 if (bom == 0xFEFF) { 2584 q += 2; 2585 bo = -1; 2586 } 2587 else if (bom == 0xFFFE) { 2588 q += 2; 2589 bo = 1; 2590 } 2591 #else 2592 if (bom == 0xFEFF) { 2593 q += 2; 2594 bo = 1; 2595 } 2596 else if (bom == 0xFFFE) { 2597 q += 2; 2598 bo = -1; 2599 } 2600 #endif 2601 } 2602 } 2603 2604 if (bo == -1) { 2605 /* force LE */ 2606 ihi = 1; 2607 ilo = 0; 2608 } 2609 else if (bo == 1) { 2610 /* force BE */ 2611 ihi = 0; 2612 ilo = 1; 2613 } 2614 2615 while (q < e) { 2616 Py_UNICODE ch; 2617 /* remaining bytes at the end? (size should be even) */ 2618 if (e-q<2) { 2619 if (consumed) 2620 break; 2621 errmsg = "truncated data"; 2622 startinpos = ((const char *)q)-starts; 2623 endinpos = ((const char *)e)-starts; 2624 goto utf16Error; 2625 /* The remaining input chars are ignored if the callback 2626 chooses to skip the input */ 2627 } 2628 ch = (q[ihi] << 8) | q[ilo]; 2629 2630 q += 2; 2631 2632 if (ch < 0xD800 || ch > 0xDFFF) { 2633 *p++ = ch; 2634 continue; 2635 } 2636 2637 /* UTF-16 code pair: */ 2638 if (e - q < 2) { 2639 q -= 2; 2640 if (consumed) 2641 break; 2642 errmsg = "unexpected end of data"; 2643 startinpos = ((const char *)q)-starts; 2644 endinpos = ((const char *)e)-starts; 2645 goto utf16Error; 2646 } 2647 if (0xD800 <= ch && ch <= 0xDBFF) { 2648 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 2649 q += 2; 2650 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2651 #ifndef Py_UNICODE_WIDE 2652 *p++ = ch; 2653 *p++ = ch2; 2654 #else 2655 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2656 #endif 2657 continue; 2658 } 2659 else { 2660 errmsg = "illegal UTF-16 surrogate"; 2661 startinpos = (((const char *)q)-4)-starts; 2662 endinpos = startinpos+2; 2663 goto utf16Error; 2664 } 2665 2666 } 2667 errmsg = "illegal encoding"; 2668 startinpos = (((const char *)q)-2)-starts; 2669 endinpos = startinpos+2; 2670 /* Fall through to report the 
error */ 2671 2672 utf16Error: 2673 outpos = p-PyUnicode_AS_UNICODE(unicode); 2674 if (unicode_decode_call_errorhandler( 2675 errors, &errorHandler, 2676 "utf16", errmsg, 2677 starts, size, &startinpos, &endinpos, &exc, (const char **)&q, 2678 &unicode, &outpos, &p)) 2679 goto onError; 2680 } 2681 2682 if (byteorder) 2683 *byteorder = bo; 2684 2685 if (consumed) 2686 *consumed = (const char *)q-starts; 2687 2688 /* Adjust length */ 2689 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2690 goto onError; 2691 2692 Py_XDECREF(errorHandler); 2693 Py_XDECREF(exc); 2694 return (PyObject *)unicode; 2695 2696 onError: 2697 Py_DECREF(unicode); 2698 Py_XDECREF(errorHandler); 2699 Py_XDECREF(exc); 2700 return NULL; 2701 } 2702 2703 PyObject * 2704 PyUnicode_EncodeUTF16(const Py_UNICODE *s, 2705 Py_ssize_t size, 2706 const char *errors, 2707 int byteorder) 2708 { 2709 PyObject *v; 2710 unsigned char *p; 2711 Py_ssize_t nsize, bytesize; 2712 #ifdef Py_UNICODE_WIDE 2713 Py_ssize_t i, pairs; 2714 #else 2715 const int pairs = 0; 2716 #endif 2717 /* Offsets from p for storing byte pairs in the right order. 
*/ 2718 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 2719 int ihi = 1, ilo = 0; 2720 #else 2721 int ihi = 0, ilo = 1; 2722 #endif 2723 2724 #define STORECHAR(CH) \ 2725 do { \ 2726 p[ihi] = ((CH) >> 8) & 0xff; \ 2727 p[ilo] = (CH) & 0xff; \ 2728 p += 2; \ 2729 } while(0) 2730 2731 #ifdef Py_UNICODE_WIDE 2732 for (i = pairs = 0; i < size; i++) 2733 if (s[i] >= 0x10000) 2734 pairs++; 2735 #endif 2736 /* 2 * (size + pairs + (byteorder == 0)) */ 2737 if (size > PY_SSIZE_T_MAX || 2738 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 2739 return PyErr_NoMemory(); 2740 nsize = size + pairs + (byteorder == 0); 2741 bytesize = nsize * 2; 2742 if (bytesize / 2 != nsize) 2743 return PyErr_NoMemory(); 2744 v = PyString_FromStringAndSize(NULL, bytesize); 2745 if (v == NULL) 2746 return NULL; 2747 2748 p = (unsigned char *)PyString_AS_STRING(v); 2749 if (byteorder == 0) 2750 STORECHAR(0xFEFF); 2751 if (size == 0) 2752 return v; 2753 2754 if (byteorder == -1) { 2755 /* force LE */ 2756 ihi = 1; 2757 ilo = 0; 2758 } 2759 else if (byteorder == 1) { 2760 /* force BE */ 2761 ihi = 0; 2762 ilo = 1; 2763 } 2764 2765 while (size-- > 0) { 2766 Py_UNICODE ch = *s++; 2767 Py_UNICODE ch2 = 0; 2768 #ifdef Py_UNICODE_WIDE 2769 if (ch >= 0x10000) { 2770 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 2771 ch = 0xD800 | ((ch-0x10000) >> 10); 2772 } 2773 #endif 2774 STORECHAR(ch); 2775 if (ch2) 2776 STORECHAR(ch2); 2777 } 2778 return v; 2779 #undef STORECHAR 2780 } 2781 2782 PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 2783 { 2784 if (!PyUnicode_Check(unicode)) { 2785 PyErr_BadArgument(); 2786 return NULL; 2787 } 2788 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 2789 PyUnicode_GET_SIZE(unicode), 2790 NULL, 2791 0); 2792 } 2793 2794 /* --- Unicode Escape Codec ----------------------------------------------- */ 2795 2796 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 2797 2798 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 2799 Py_ssize_t size, 2800 const char *errors) 2801 { 2802 
const char *starts = s; 2803 Py_ssize_t startinpos; 2804 Py_ssize_t endinpos; 2805 Py_ssize_t outpos; 2806 PyUnicodeObject *v; 2807 Py_UNICODE *p; 2808 const char *end; 2809 char* message; 2810 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 2811 PyObject *errorHandler = NULL; 2812 PyObject *exc = NULL; 2813 2814 /* Escaped strings will always be longer than the resulting 2815 Unicode string, so we start with size here and then reduce the 2816 length after conversion to the true value. 2817 (but if the error callback returns a long replacement string 2818 we'll have to allocate more space) */ 2819 v = _PyUnicode_New(size); 2820 if (v == NULL) 2821 goto onError; 2822 if (size == 0) 2823 return (PyObject *)v; 2824 2825 p = PyUnicode_AS_UNICODE(v); 2826 end = s + size; 2827 2828 while (s < end) { 2829 unsigned char c; 2830 Py_UNICODE x; 2831 int digits; 2832 2833 /* Non-escape characters are interpreted as Unicode ordinals */ 2834 if (*s != '\\') { 2835 *p++ = (unsigned char) *s++; 2836 continue; 2837 } 2838 2839 startinpos = s-starts; 2840 /* \ - Escapes */ 2841 s++; 2842 c = *s++; 2843 if (s > end) 2844 c = '\0'; /* Invalid after \ */ 2845 switch (c) { 2846 2847 /* \x escapes */ 2848 case '\n': break; 2849 case '\\': *p++ = '\\'; break; 2850 case '\'': *p++ = '\''; break; 2851 case '\"': *p++ = '\"'; break; 2852 case 'b': *p++ = '\b'; break; 2853 case 'f': *p++ = '\014'; break; /* FF */ 2854 case 't': *p++ = '\t'; break; 2855 case 'n': *p++ = '\n'; break; 2856 case 'r': *p++ = '\r'; break; 2857 case 'v': *p++ = '\013'; break; /* VT */ 2858 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 2859 2860 /* \OOO (octal) escapes */ 2861 case '0': case '1': case '2': case '3': 2862 case '4': case '5': case '6': case '7': 2863 x = s[-1] - '0'; 2864 if (s < end && '0' <= *s && *s <= '7') { 2865 x = (x<<3) + *s++ - '0'; 2866 if (s < end && '0' <= *s && *s <= '7') 2867 x = (x<<3) + *s++ - '0'; 2868 } 2869 *p++ = x; 2870 break; 2871 2872 /* hex escapes */ 2873 
/* \xXX */ 2874 case 'x': 2875 digits = 2; 2876 message = "truncated \\xXX escape"; 2877 goto hexescape; 2878 2879 /* \uXXXX */ 2880 case 'u': 2881 digits = 4; 2882 message = "truncated \\uXXXX escape"; 2883 goto hexescape; 2884 2885 /* \UXXXXXXXX */ 2886 case 'U': 2887 digits = 8; 2888 message = "truncated \\UXXXXXXXX escape"; 2889 hexescape: 2890 chr = 0; 2891 if (end - s < digits) { 2892 /* count only hex digits */ 2893 for (; s < end; ++s) { 2894 c = (unsigned char)*s; 2895 if (!Py_ISXDIGIT(c)) 2896 goto error; 2897 } 2898 goto error; 2899 } 2900 for (; digits--; ++s) { 2901 c = (unsigned char)*s; 2902 if (!Py_ISXDIGIT(c)) 2903 goto error; 2904 chr = (chr<<4) & ~0xF; 2905 if (c >= '0' && c <= '9') 2906 chr += c - '0'; 2907 else if (c >= 'a' && c <= 'f') 2908 chr += 10 + c - 'a'; 2909 else 2910 chr += 10 + c - 'A'; 2911 } 2912 if (chr == 0xffffffff && PyErr_Occurred()) 2913 /* _decoding_error will have already written into the 2914 target buffer. */ 2915 break; 2916 store: 2917 /* when we get here, chr is a 32-bit unicode character */ 2918 if (chr <= 0xffff) 2919 /* UCS-2 character */ 2920 *p++ = (Py_UNICODE) chr; 2921 else if (chr <= 0x10ffff) { 2922 /* UCS-4 character. Either store directly, or as 2923 surrogate pair. 
*/ 2924 #ifdef Py_UNICODE_WIDE 2925 *p++ = chr; 2926 #else 2927 chr -= 0x10000L; 2928 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 2929 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 2930 #endif 2931 } else { 2932 message = "illegal Unicode character"; 2933 goto error; 2934 } 2935 break; 2936 2937 /* \N{name} */ 2938 case 'N': 2939 message = "malformed \\N character escape"; 2940 if (ucnhash_CAPI == NULL) { 2941 /* load the unicode data module */ 2942 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1); 2943 if (ucnhash_CAPI == NULL) 2944 goto ucnhashError; 2945 } 2946 if (*s == '{') { 2947 const char *start = s+1; 2948 /* look for the closing brace */ 2949 while (*s != '}' && s < end) 2950 s++; 2951 if (s > start && s < end && *s == '}') { 2952 /* found a name. look it up in the unicode database */ 2953 message = "unknown Unicode character name"; 2954 s++; 2955 if (s - start - 1 <= INT_MAX && 2956 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 2957 goto store; 2958 } 2959 } 2960 goto error; 2961 2962 default: 2963 if (s > end) { 2964 message = "\\ at end of string"; 2965 s--; 2966 goto error; 2967 } 2968 else { 2969 *p++ = '\\'; 2970 *p++ = (unsigned char)s[-1]; 2971 } 2972 break; 2973 } 2974 continue; 2975 2976 error: 2977 endinpos = s-starts; 2978 outpos = p-PyUnicode_AS_UNICODE(v); 2979 if (unicode_decode_call_errorhandler( 2980 errors, &errorHandler, 2981 "unicodeescape", message, 2982 starts, size, &startinpos, &endinpos, &exc, &s, 2983 &v, &outpos, &p)) 2984 goto onError; 2985 continue; 2986 } 2987 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 2988 goto onError; 2989 Py_XDECREF(errorHandler); 2990 Py_XDECREF(exc); 2991 return (PyObject *)v; 2992 2993 ucnhashError: 2994 PyErr_SetString( 2995 PyExc_UnicodeError, 2996 "\\N escapes not supported (can't load unicodedata module)" 2997 ); 2998 Py_XDECREF(v); 2999 Py_XDECREF(errorHandler); 3000 Py_XDECREF(exc); 3001 return NULL; 3002 3003 onError: 3004 
Py_XDECREF(v); 3005 Py_XDECREF(errorHandler); 3006 Py_XDECREF(exc); 3007 return NULL; 3008 } 3009 3010 /* Return a Unicode-Escape string version of the Unicode object. 3011 3012 If quotes is true, the string is enclosed in u"" or u'' quotes as 3013 appropriate. 3014 3015 */ 3016 3017 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, 3018 Py_ssize_t size, 3019 Py_UNICODE ch) 3020 { 3021 /* like wcschr, but doesn't stop at NULL characters */ 3022 3023 while (size-- > 0) { 3024 if (*s == ch) 3025 return s; 3026 s++; 3027 } 3028 3029 return NULL; 3030 } 3031 3032 static 3033 PyObject *unicodeescape_string(const Py_UNICODE *s, 3034 Py_ssize_t size, 3035 int quotes) 3036 { 3037 PyObject *repr; 3038 char *p; 3039 3040 static const char *hexdigit = "0123456789abcdef"; 3041 #ifdef Py_UNICODE_WIDE 3042 const Py_ssize_t expandsize = 10; 3043 #else 3044 const Py_ssize_t expandsize = 6; 3045 #endif 3046 3047 /* XXX(nnorwitz): rather than over-allocating, it would be 3048 better to choose a different scheme. Perhaps scan the 3049 first N-chars of the string and allocate based on that size. 3050 */ 3051 /* Initial allocation is based on the longest-possible unichr 3052 escape. 3053 3054 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 3055 unichr, so in this case it's the longest unichr escape. In 3056 narrow (UTF-16) builds this is five chars per source unichr 3057 since there are two unichrs in the surrogate pair, so in narrow 3058 (UTF-16) builds it's not the longest unichr escape. 3059 3060 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 3061 so in the narrow (UTF-16) build case it's the longest unichr 3062 escape. 
3063 */ 3064 3065 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 3066 return PyErr_NoMemory(); 3067 3068 repr = PyString_FromStringAndSize(NULL, 3069 2 3070 + expandsize*size 3071 + 1); 3072 if (repr == NULL) 3073 return NULL; 3074 3075 p = PyString_AS_STRING(repr); 3076 3077 if (quotes) { 3078 *p++ = 'u'; 3079 *p++ = (findchar(s, size, '\'') && 3080 !findchar(s, size, '"')) ? '"' : '\''; 3081 } 3082 while (size-- > 0) { 3083 Py_UNICODE ch = *s++; 3084 3085 /* Escape quotes and backslashes */ 3086 if ((quotes && 3087 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') { 3088 *p++ = '\\'; 3089 *p++ = (char) ch; 3090 continue; 3091 } 3092 3093 #ifdef Py_UNICODE_WIDE 3094 /* Map 21-bit characters to '\U00xxxxxx' */ 3095 else if (ch >= 0x10000) { 3096 *p++ = '\\'; 3097 *p++ = 'U'; 3098 *p++ = hexdigit[(ch >> 28) & 0x0000000F]; 3099 *p++ = hexdigit[(ch >> 24) & 0x0000000F]; 3100 *p++ = hexdigit[(ch >> 20) & 0x0000000F]; 3101 *p++ = hexdigit[(ch >> 16) & 0x0000000F]; 3102 *p++ = hexdigit[(ch >> 12) & 0x0000000F]; 3103 *p++ = hexdigit[(ch >> 8) & 0x0000000F]; 3104 *p++ = hexdigit[(ch >> 4) & 0x0000000F]; 3105 *p++ = hexdigit[ch & 0x0000000F]; 3106 continue; 3107 } 3108 #else 3109 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 3110 else if (ch >= 0xD800 && ch < 0xDC00) { 3111 Py_UNICODE ch2; 3112 Py_UCS4 ucs; 3113 3114 ch2 = *s++; 3115 size--; 3116 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 3117 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3118 *p++ = '\\'; 3119 *p++ = 'U'; 3120 *p++ = hexdigit[(ucs >> 28) & 0x0000000F]; 3121 *p++ = hexdigit[(ucs >> 24) & 0x0000000F]; 3122 *p++ = hexdigit[(ucs >> 20) & 0x0000000F]; 3123 *p++ = hexdigit[(ucs >> 16) & 0x0000000F]; 3124 *p++ = hexdigit[(ucs >> 12) & 0x0000000F]; 3125 *p++ = hexdigit[(ucs >> 8) & 0x0000000F]; 3126 *p++ = hexdigit[(ucs >> 4) & 0x0000000F]; 3127 *p++ = hexdigit[ucs & 0x0000000F]; 3128 continue; 3129 } 3130 /* Fall through: isolated surrogates are copied as-is */ 3131 s--; 3132 size++; 
3133 } 3134 #endif 3135 3136 /* Map 16-bit characters to '\uxxxx' */ 3137 if (ch >= 256) { 3138 *p++ = '\\'; 3139 *p++ = 'u'; 3140 *p++ = hexdigit[(ch >> 12) & 0x000F]; 3141 *p++ = hexdigit[(ch >> 8) & 0x000F]; 3142 *p++ = hexdigit[(ch >> 4) & 0x000F]; 3143 *p++ = hexdigit[ch & 0x000F]; 3144 } 3145 3146 /* Map special whitespace to '\t', \n', '\r' */ 3147 else if (ch == '\t') { 3148 *p++ = '\\'; 3149 *p++ = 't'; 3150 } 3151 else if (ch == '\n') { 3152 *p++ = '\\'; 3153 *p++ = 'n'; 3154 } 3155 else if (ch == '\r') { 3156 *p++ = '\\'; 3157 *p++ = 'r'; 3158 } 3159 3160 /* Map non-printable US ASCII to '\xhh' */ 3161 else if (ch < ' ' || ch >= 0x7F) { 3162 *p++ = '\\'; 3163 *p++ = 'x'; 3164 *p++ = hexdigit[(ch >> 4) & 0x000F]; 3165 *p++ = hexdigit[ch & 0x000F]; 3166 } 3167 3168 /* Copy everything else as-is */ 3169 else 3170 *p++ = (char) ch; 3171 } 3172 if (quotes) 3173 *p++ = PyString_AS_STRING(repr)[1]; 3174 3175 *p = '\0'; 3176 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr))) 3177 return NULL; 3178 return repr; 3179 } 3180 3181 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 3182 Py_ssize_t size) 3183 { 3184 return unicodeescape_string(s, size, 0); 3185 } 3186 3187 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 3188 { 3189 if (!PyUnicode_Check(unicode)) { 3190 PyErr_BadArgument(); 3191 return NULL; 3192 } 3193 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3194 PyUnicode_GET_SIZE(unicode)); 3195 } 3196 3197 /* --- Raw Unicode Escape Codec ------------------------------------------- */ 3198 3199 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 3200 Py_ssize_t size, 3201 const char *errors) 3202 { 3203 const char *starts = s; 3204 Py_ssize_t startinpos; 3205 Py_ssize_t endinpos; 3206 Py_ssize_t outpos; 3207 PyUnicodeObject *v; 3208 Py_UNICODE *p; 3209 const char *end; 3210 const char *bs; 3211 PyObject *errorHandler = NULL; 3212 PyObject *exc = NULL; 3213 3214 /* Escaped strings will always be longer 
than the resulting 3215 Unicode string, so we start with size here and then reduce the 3216 length after conversion to the true value. (But decoding error 3217 handler might have to resize the string) */ 3218 v = _PyUnicode_New(size); 3219 if (v == NULL) 3220 goto onError; 3221 if (size == 0) 3222 return (PyObject *)v; 3223 p = PyUnicode_AS_UNICODE(v); 3224 end = s + size; 3225 while (s < end) { 3226 unsigned char c; 3227 Py_UCS4 x; 3228 int i; 3229 int count; 3230 3231 /* Non-escape characters are interpreted as Unicode ordinals */ 3232 if (*s != '\\') { 3233 *p++ = (unsigned char)*s++; 3234 continue; 3235 } 3236 startinpos = s-starts; 3237 3238 /* \u-escapes are only interpreted iff the number of leading 3239 backslashes if odd */ 3240 bs = s; 3241 for (;s < end;) { 3242 if (*s != '\\') 3243 break; 3244 *p++ = (unsigned char)*s++; 3245 } 3246 if (((s - bs) & 1) == 0 || 3247 s >= end || 3248 (*s != 'u' && *s != 'U')) { 3249 continue; 3250 } 3251 p--; 3252 count = *s=='u' ? 4 : 8; 3253 s++; 3254 3255 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 3256 outpos = p-PyUnicode_AS_UNICODE(v); 3257 for (x = 0, i = 0; i < count; ++i, ++s) { 3258 c = (unsigned char)*s; 3259 if (!isxdigit(c)) { 3260 endinpos = s-starts; 3261 if (unicode_decode_call_errorhandler( 3262 errors, &errorHandler, 3263 "rawunicodeescape", "truncated \\uXXXX", 3264 starts, size, &startinpos, &endinpos, &exc, &s, 3265 &v, &outpos, &p)) 3266 goto onError; 3267 goto nextByte; 3268 } 3269 x = (x<<4) & ~0xF; 3270 if (c >= '0' && c <= '9') 3271 x += c - '0'; 3272 else if (c >= 'a' && c <= 'f') 3273 x += 10 + c - 'a'; 3274 else 3275 x += 10 + c - 'A'; 3276 } 3277 if (x <= 0xffff) 3278 /* UCS-2 character */ 3279 *p++ = (Py_UNICODE) x; 3280 else if (x <= 0x10ffff) { 3281 /* UCS-4 character. Either store directly, or as 3282 surrogate pair. 
*/ 3283 #ifdef Py_UNICODE_WIDE 3284 *p++ = (Py_UNICODE) x; 3285 #else 3286 x -= 0x10000L; 3287 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 3288 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 3289 #endif 3290 } else { 3291 endinpos = s-starts; 3292 outpos = p-PyUnicode_AS_UNICODE(v); 3293 if (unicode_decode_call_errorhandler( 3294 errors, &errorHandler, 3295 "rawunicodeescape", "\\Uxxxxxxxx out of range", 3296 starts, size, &startinpos, &endinpos, &exc, &s, 3297 &v, &outpos, &p)) 3298 goto onError; 3299 } 3300 nextByte: 3301 ; 3302 } 3303 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3304 goto onError; 3305 Py_XDECREF(errorHandler); 3306 Py_XDECREF(exc); 3307 return (PyObject *)v; 3308 3309 onError: 3310 Py_XDECREF(v); 3311 Py_XDECREF(errorHandler); 3312 Py_XDECREF(exc); 3313 return NULL; 3314 } 3315 3316 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 3317 Py_ssize_t size) 3318 { 3319 PyObject *repr; 3320 char *p; 3321 char *q; 3322 3323 static const char *hexdigit = "0123456789abcdef"; 3324 #ifdef Py_UNICODE_WIDE 3325 const Py_ssize_t expandsize = 10; 3326 #else 3327 const Py_ssize_t expandsize = 6; 3328 #endif 3329 3330 if (size > PY_SSIZE_T_MAX / expandsize) 3331 return PyErr_NoMemory(); 3332 3333 repr = PyString_FromStringAndSize(NULL, expandsize * size); 3334 if (repr == NULL) 3335 return NULL; 3336 if (size == 0) 3337 return repr; 3338 3339 p = q = PyString_AS_STRING(repr); 3340 while (size-- > 0) { 3341 Py_UNICODE ch = *s++; 3342 #ifdef Py_UNICODE_WIDE 3343 /* Map 32-bit characters to '\Uxxxxxxxx' */ 3344 if (ch >= 0x10000) { 3345 *p++ = '\\'; 3346 *p++ = 'U'; 3347 *p++ = hexdigit[(ch >> 28) & 0xf]; 3348 *p++ = hexdigit[(ch >> 24) & 0xf]; 3349 *p++ = hexdigit[(ch >> 20) & 0xf]; 3350 *p++ = hexdigit[(ch >> 16) & 0xf]; 3351 *p++ = hexdigit[(ch >> 12) & 0xf]; 3352 *p++ = hexdigit[(ch >> 8) & 0xf]; 3353 *p++ = hexdigit[(ch >> 4) & 0xf]; 3354 *p++ = hexdigit[ch & 15]; 3355 } 3356 else 3357 #else 3358 /* Map UTF-16 surrogate pairs to 
'\U00xxxxxx' */ 3359 if (ch >= 0xD800 && ch < 0xDC00) { 3360 Py_UNICODE ch2; 3361 Py_UCS4 ucs; 3362 3363 ch2 = *s++; 3364 size--; 3365 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 3366 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3367 *p++ = '\\'; 3368 *p++ = 'U'; 3369 *p++ = hexdigit[(ucs >> 28) & 0xf]; 3370 *p++ = hexdigit[(ucs >> 24) & 0xf]; 3371 *p++ = hexdigit[(ucs >> 20) & 0xf]; 3372 *p++ = hexdigit[(ucs >> 16) & 0xf]; 3373 *p++ = hexdigit[(ucs >> 12) & 0xf]; 3374 *p++ = hexdigit[(ucs >> 8) & 0xf]; 3375 *p++ = hexdigit[(ucs >> 4) & 0xf]; 3376 *p++ = hexdigit[ucs & 0xf]; 3377 continue; 3378 } 3379 /* Fall through: isolated surrogates are copied as-is */ 3380 s--; 3381 size++; 3382 } 3383 #endif 3384 /* Map 16-bit characters to '\uxxxx' */ 3385 if (ch >= 256) { 3386 *p++ = '\\'; 3387 *p++ = 'u'; 3388 *p++ = hexdigit[(ch >> 12) & 0xf]; 3389 *p++ = hexdigit[(ch >> 8) & 0xf]; 3390 *p++ = hexdigit[(ch >> 4) & 0xf]; 3391 *p++ = hexdigit[ch & 15]; 3392 } 3393 /* Copy everything else as-is */ 3394 else 3395 *p++ = (char) ch; 3396 } 3397 *p = '\0'; 3398 if (_PyString_Resize(&repr, p - q)) 3399 return NULL; 3400 return repr; 3401 } 3402 3403 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 3404 { 3405 if (!PyUnicode_Check(unicode)) { 3406 PyErr_BadArgument(); 3407 return NULL; 3408 } 3409 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3410 PyUnicode_GET_SIZE(unicode)); 3411 } 3412 3413 /* --- Unicode Internal Codec ------------------------------------------- */ 3414 3415 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, 3416 Py_ssize_t size, 3417 const char *errors) 3418 { 3419 const char *starts = s; 3420 Py_ssize_t startinpos; 3421 Py_ssize_t endinpos; 3422 Py_ssize_t outpos; 3423 PyUnicodeObject *v; 3424 Py_UNICODE *p; 3425 const char *end; 3426 const char *reason; 3427 PyObject *errorHandler = NULL; 3428 PyObject *exc = NULL; 3429 3430 #ifdef Py_UNICODE_WIDE 3431 Py_UNICODE unimax = PyUnicode_GetMax(); 3432 
#endif 3433 3434 /* XXX overflow detection missing */ 3435 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 3436 if (v == NULL) 3437 goto onError; 3438 if (PyUnicode_GetSize((PyObject *)v) == 0) 3439 return (PyObject *)v; 3440 p = PyUnicode_AS_UNICODE(v); 3441 end = s + size; 3442 3443 while (s < end) { 3444 if (end-s < Py_UNICODE_SIZE) { 3445 endinpos = end-starts; 3446 reason = "truncated input"; 3447 goto error; 3448 } 3449 memcpy(p, s, sizeof(Py_UNICODE)); 3450 #ifdef Py_UNICODE_WIDE 3451 /* We have to sanity check the raw data, otherwise doom looms for 3452 some malformed UCS-4 data. */ 3453 if (*p > unimax || *p < 0) { 3454 endinpos = s - starts + Py_UNICODE_SIZE; 3455 reason = "illegal code point (> 0x10FFFF)"; 3456 goto error; 3457 } 3458 #endif 3459 p++; 3460 s += Py_UNICODE_SIZE; 3461 continue; 3462 3463 error: 3464 startinpos = s - starts; 3465 outpos = p - PyUnicode_AS_UNICODE(v); 3466 if (unicode_decode_call_errorhandler( 3467 errors, &errorHandler, 3468 "unicode_internal", reason, 3469 starts, size, &startinpos, &endinpos, &exc, &s, 3470 &v, &outpos, &p)) { 3471 goto onError; 3472 } 3473 } 3474 3475 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3476 goto onError; 3477 Py_XDECREF(errorHandler); 3478 Py_XDECREF(exc); 3479 return (PyObject *)v; 3480 3481 onError: 3482 Py_XDECREF(v); 3483 Py_XDECREF(errorHandler); 3484 Py_XDECREF(exc); 3485 return NULL; 3486 } 3487 3488 /* --- Latin-1 Codec ------------------------------------------------------ */ 3489 3490 PyObject *PyUnicode_DecodeLatin1(const char *s, 3491 Py_ssize_t size, 3492 const char *errors) 3493 { 3494 PyUnicodeObject *v; 3495 Py_UNICODE *p; 3496 3497 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 3498 if (size == 1) { 3499 Py_UNICODE r = *(unsigned char*)s; 3500 return PyUnicode_FromUnicode(&r, 1); 3501 } 3502 3503 v = _PyUnicode_New(size); 3504 if (v == NULL) 3505 goto onError; 3506 if (size == 0) 3507 return (PyObject *)v; 3508 p = PyUnicode_AS_UNICODE(v); 3509 while (size-- > 0) 3510 *p++ = (unsigned char)*s++; 3511 return (PyObject *)v; 3512 3513 onError: 3514 Py_XDECREF(v); 3515 return NULL; 3516 } 3517 3518 /* create or adjust a UnicodeEncodeError */ 3519 static void make_encode_exception(PyObject **exceptionObject, 3520 const char *encoding, 3521 const Py_UNICODE *unicode, Py_ssize_t size, 3522 Py_ssize_t startpos, Py_ssize_t endpos, 3523 const char *reason) 3524 { 3525 if (*exceptionObject == NULL) { 3526 *exceptionObject = PyUnicodeEncodeError_Create( 3527 encoding, unicode, size, startpos, endpos, reason); 3528 } 3529 else { 3530 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 3531 goto onError; 3532 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 3533 goto onError; 3534 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 3535 goto onError; 3536 return; 3537 onError: 3538 Py_CLEAR(*exceptionObject); 3539 } 3540 } 3541 3542 /* raises a UnicodeEncodeError */ 3543 static void raise_encode_exception(PyObject **exceptionObject, 3544 const char *encoding, 3545 const Py_UNICODE *unicode, Py_ssize_t size, 3546 Py_ssize_t startpos, Py_ssize_t endpos, 3547 const char *reason) 3548 { 3549 make_encode_exception(exceptionObject, 3550 encoding, unicode, size, startpos, endpos, reason); 3551 if (*exceptionObject != NULL) 3552 PyCodec_StrictErrors(*exceptionObject); 3553 } 3554 3555 /* error handling callback helper: 3556 build arguments, call the callback and check the arguments, 3557 put the result into newpos and return the replacement string, which 3558 has to be freed by the caller */ 3559 static PyObject *unicode_encode_call_errorhandler(const char *errors, 3560 PyObject **errorHandler, 3561 const char *encoding, const 
char *reason, 3562 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 3563 Py_ssize_t startpos, Py_ssize_t endpos, 3564 Py_ssize_t *newpos) 3565 { 3566 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple"; 3567 3568 PyObject *restuple; 3569 PyObject *resunicode; 3570 3571 if (*errorHandler == NULL) { 3572 *errorHandler = PyCodec_LookupError(errors); 3573 if (*errorHandler == NULL) 3574 return NULL; 3575 } 3576 3577 make_encode_exception(exceptionObject, 3578 encoding, unicode, size, startpos, endpos, reason); 3579 if (*exceptionObject == NULL) 3580 return NULL; 3581 3582 restuple = PyObject_CallFunctionObjArgs( 3583 *errorHandler, *exceptionObject, NULL); 3584 if (restuple == NULL) 3585 return NULL; 3586 if (!PyTuple_Check(restuple)) { 3587 PyErr_SetString(PyExc_TypeError, &argparse[4]); 3588 Py_DECREF(restuple); 3589 return NULL; 3590 } 3591 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 3592 &resunicode, newpos)) { 3593 Py_DECREF(restuple); 3594 return NULL; 3595 } 3596 if (*newpos<0) 3597 *newpos = size+*newpos; 3598 if (*newpos<0 || *newpos>size) { 3599 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 3600 Py_DECREF(restuple); 3601 return NULL; 3602 } 3603 Py_INCREF(resunicode); 3604 Py_DECREF(restuple); 3605 return resunicode; 3606 } 3607 3608 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, 3609 Py_ssize_t size, 3610 const char *errors, 3611 int limit) 3612 { 3613 /* output object */ 3614 PyObject *res; 3615 /* pointers to the beginning and end+1 of input */ 3616 const Py_UNICODE *startp = p; 3617 const Py_UNICODE *endp = p + size; 3618 /* pointer to the beginning of the unencodable characters */ 3619 /* const Py_UNICODE *badp = NULL; */ 3620 /* pointer into the output */ 3621 char *str; 3622 /* current output position */ 3623 Py_ssize_t respos = 0; 3624 Py_ssize_t ressize; 3625 const char *encoding = (limit == 256) ? 
"latin-1" : "ascii"; 3626 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 3627 PyObject *errorHandler = NULL; 3628 PyObject *exc = NULL; 3629 /* the following variable is used for caching string comparisons 3630 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 3631 int known_errorHandler = -1; 3632 3633 /* allocate enough for a simple encoding without 3634 replacements, if we need more, we'll resize */ 3635 res = PyString_FromStringAndSize(NULL, size); 3636 if (res == NULL) 3637 goto onError; 3638 if (size == 0) 3639 return res; 3640 str = PyString_AS_STRING(res); 3641 ressize = size; 3642 3643 while (p<endp) { 3644 Py_UNICODE c = *p; 3645 3646 /* can we encode this? */ 3647 if (c<limit) { 3648 /* no overflow check, because we know that the space is enough */ 3649 *str++ = (char)c; 3650 ++p; 3651 } 3652 else { 3653 Py_ssize_t unicodepos = p-startp; 3654 Py_ssize_t requiredsize; 3655 PyObject *repunicode; 3656 Py_ssize_t repsize; 3657 Py_ssize_t newpos; 3658 Py_ssize_t respos; 3659 Py_UNICODE *uni2; 3660 /* startpos for collecting unencodable chars */ 3661 const Py_UNICODE *collstart = p; 3662 const Py_UNICODE *collend = p; 3663 /* find all unecodable characters */ 3664 while ((collend < endp) && ((*collend) >= limit)) 3665 ++collend; 3666 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 3667 if (known_errorHandler==-1) { 3668 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3669 known_errorHandler = 1; 3670 else if (!strcmp(errors, "replace")) 3671 known_errorHandler = 2; 3672 else if (!strcmp(errors, "ignore")) 3673 known_errorHandler = 3; 3674 else if (!strcmp(errors, "xmlcharrefreplace")) 3675 known_errorHandler = 4; 3676 else 3677 known_errorHandler = 0; 3678 } 3679 switch (known_errorHandler) { 3680 case 1: /* strict */ 3681 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 3682 goto onError; 3683 case 2: /* replace */ 3684 while (collstart++ < collend) 3685 *str++ = '?'; /* fall through */ 3686 case 3: /* ignore */ 3687 p = collend; 3688 break; 3689 case 4: /* xmlcharrefreplace */ 3690 respos = str - PyString_AS_STRING(res); 3691 /* determine replacement size (temporarily (mis)uses p) */ 3692 requiredsize = respos; 3693 for (p = collstart; p < collend;) { 3694 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend); 3695 Py_ssize_t incr; 3696 if (ch < 10) 3697 incr = 2+1+1; 3698 else if (ch < 100) 3699 incr = 2+2+1; 3700 else if (ch < 1000) 3701 incr = 2+3+1; 3702 else if (ch < 10000) 3703 incr = 2+4+1; 3704 else if (ch < 100000) 3705 incr = 2+5+1; 3706 else if (ch < 1000000) 3707 incr = 2+6+1; 3708 else 3709 incr = 2+7+1; 3710 if (requiredsize > PY_SSIZE_T_MAX - incr) 3711 goto overflow; 3712 requiredsize += incr; 3713 } 3714 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend)) 3715 goto overflow; 3716 requiredsize += endp - collend; 3717 if (requiredsize > ressize) { 3718 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) 3719 requiredsize = 2*ressize; 3720 if (_PyString_Resize(&res, requiredsize)) 3721 goto onError; 3722 str = PyString_AS_STRING(res) + respos; 3723 ressize = requiredsize; 3724 } 3725 /* generate replacement (temporarily (mis)uses p) */ 3726 for (p = collstart; p < collend;) { 3727 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend); 3728 str += sprintf(str, 
"&#%d;", (int)ch); 3729 } 3730 p = collend; 3731 break; 3732 default: 3733 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 3734 encoding, reason, startp, size, &exc, 3735 collstart-startp, collend-startp, &newpos); 3736 if (repunicode == NULL) 3737 goto onError; 3738 /* need more space? (at least enough for what we have+the 3739 replacement+the rest of the string, so we won't have to 3740 check space for encodable characters) */ 3741 respos = str - PyString_AS_STRING(res); 3742 repsize = PyUnicode_GET_SIZE(repunicode); 3743 if (respos > PY_SSIZE_T_MAX - repsize) 3744 goto overflow; 3745 requiredsize = respos + repsize; 3746 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend)) 3747 goto overflow; 3748 requiredsize += endp - collend; 3749 if (requiredsize > ressize) { 3750 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) 3751 requiredsize = 2*ressize; 3752 if (_PyString_Resize(&res, requiredsize)) { 3753 Py_DECREF(repunicode); 3754 goto onError; 3755 } 3756 str = PyString_AS_STRING(res) + respos; 3757 ressize = requiredsize; 3758 } 3759 /* check if there is anything unencodable in the replacement 3760 and copy it to the output */ 3761 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) { 3762 c = *uni2; 3763 if (c >= limit) { 3764 raise_encode_exception(&exc, encoding, startp, size, 3765 unicodepos, unicodepos+1, reason); 3766 Py_DECREF(repunicode); 3767 goto onError; 3768 } 3769 *str = (char)c; 3770 } 3771 p = startp + newpos; 3772 Py_DECREF(repunicode); 3773 } 3774 } 3775 } 3776 /* Resize if we allocated to much */ 3777 respos = str - PyString_AS_STRING(res); 3778 if (respos < ressize) 3779 /* If this falls res will be NULL */ 3780 _PyString_Resize(&res, respos); 3781 Py_XDECREF(errorHandler); 3782 Py_XDECREF(exc); 3783 return res; 3784 3785 overflow: 3786 PyErr_SetString(PyExc_OverflowError, 3787 "encoded result is too long for a Python string"); 3788 3789 onError: 3790 Py_XDECREF(res); 3791 
Py_XDECREF(errorHandler); 3792 Py_XDECREF(exc); 3793 return NULL; 3794 } 3795 3796 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 3797 Py_ssize_t size, 3798 const char *errors) 3799 { 3800 return unicode_encode_ucs1(p, size, errors, 256); 3801 } 3802 3803 PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 3804 { 3805 if (!PyUnicode_Check(unicode)) { 3806 PyErr_BadArgument(); 3807 return NULL; 3808 } 3809 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 3810 PyUnicode_GET_SIZE(unicode), 3811 NULL); 3812 } 3813 3814 /* --- 7-bit ASCII Codec -------------------------------------------------- */ 3815 3816 PyObject *PyUnicode_DecodeASCII(const char *s, 3817 Py_ssize_t size, 3818 const char *errors) 3819 { 3820 const char *starts = s; 3821 PyUnicodeObject *v; 3822 Py_UNICODE *p; 3823 Py_ssize_t startinpos; 3824 Py_ssize_t endinpos; 3825 Py_ssize_t outpos; 3826 const char *e; 3827 PyObject *errorHandler = NULL; 3828 PyObject *exc = NULL; 3829 3830 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 3831 if (size == 1 && *(unsigned char*)s < 128) { 3832 Py_UNICODE r = *(unsigned char*)s; 3833 return PyUnicode_FromUnicode(&r, 1); 3834 } 3835 3836 v = _PyUnicode_New(size); 3837 if (v == NULL) 3838 goto onError; 3839 if (size == 0) 3840 return (PyObject *)v; 3841 p = PyUnicode_AS_UNICODE(v); 3842 e = s + size; 3843 while (s < e) { 3844 register unsigned char c = (unsigned char)*s; 3845 if (c < 128) { 3846 *p++ = c; 3847 ++s; 3848 } 3849 else { 3850 startinpos = s-starts; 3851 endinpos = startinpos + 1; 3852 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 3853 if (unicode_decode_call_errorhandler( 3854 errors, &errorHandler, 3855 "ascii", "ordinal not in range(128)", 3856 starts, size, &startinpos, &endinpos, &exc, &s, 3857 &v, &outpos, &p)) 3858 goto onError; 3859 } 3860 } 3861 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) 3862 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3863 goto onError; 3864 Py_XDECREF(errorHandler); 3865 Py_XDECREF(exc); 3866 return (PyObject *)v; 3867 3868 onError: 3869 Py_XDECREF(v); 3870 Py_XDECREF(errorHandler); 3871 Py_XDECREF(exc); 3872 return NULL; 3873 } 3874 3875 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 3876 Py_ssize_t size, 3877 const char *errors) 3878 { 3879 return unicode_encode_ucs1(p, size, errors, 128); 3880 } 3881 3882 PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 3883 { 3884 if (!PyUnicode_Check(unicode)) { 3885 PyErr_BadArgument(); 3886 return NULL; 3887 } 3888 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 3889 PyUnicode_GET_SIZE(unicode), 3890 NULL); 3891 } 3892 3893 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 3894 3895 /* --- MBCS codecs for Windows -------------------------------------------- */ 3896 3897 #if SIZEOF_INT < SIZEOF_SIZE_T 3898 #define NEED_RETRY 3899 #endif 3900 3901 /* XXX This code is limited to "true" double-byte encodings, as 3902 a) it assumes an incomplete character consists of a single byte, and 3903 b) IsDBCSLeadByte 
(probably) does not work for non-DBCS multi-byte 3904 encodings, see IsDBCSLeadByteEx documentation. */ 3905 3906 static int is_dbcs_lead_byte(const char *s, int offset) 3907 { 3908 const char *curr = s + offset; 3909 3910 if (IsDBCSLeadByte(*curr)) { 3911 const char *prev = CharPrev(s, curr); 3912 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 3913 } 3914 return 0; 3915 } 3916 3917 /* 3918 * Decode MBCS string into unicode object. If 'final' is set, converts 3919 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 3920 */ 3921 static int decode_mbcs(PyUnicodeObject **v, 3922 const char *s, /* MBCS string */ 3923 int size, /* sizeof MBCS string */ 3924 int final) 3925 { 3926 Py_UNICODE *p; 3927 Py_ssize_t n = 0; 3928 int usize = 0; 3929 3930 assert(size >= 0); 3931 3932 /* Skip trailing lead-byte unless 'final' is set */ 3933 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 3934 --size; 3935 3936 /* First get the size of the result */ 3937 if (size > 0) { 3938 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 3939 if (usize == 0) { 3940 PyErr_SetFromWindowsErrWithFilename(0, NULL); 3941 return -1; 3942 } 3943 } 3944 3945 if (*v == NULL) { 3946 /* Create unicode object */ 3947 *v = _PyUnicode_New(usize); 3948 if (*v == NULL) 3949 return -1; 3950 } 3951 else { 3952 /* Extend unicode object */ 3953 n = PyUnicode_GET_SIZE(*v); 3954 if (_PyUnicode_Resize(v, n + usize) < 0) 3955 return -1; 3956 } 3957 3958 /* Do the conversion */ 3959 if (size > 0) { 3960 p = PyUnicode_AS_UNICODE(*v) + n; 3961 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 3962 PyErr_SetFromWindowsErrWithFilename(0, NULL); 3963 return -1; 3964 } 3965 } 3966 3967 return size; 3968 } 3969 3970 PyObject *PyUnicode_DecodeMBCSStateful(const char *s, 3971 Py_ssize_t size, 3972 const char *errors, 3973 Py_ssize_t *consumed) 3974 { 3975 PyUnicodeObject *v = NULL; 3976 int done; 3977 3978 if (consumed) 3979 *consumed = 0; 3980 3981 
#ifdef NEED_RETRY 3982 retry: 3983 if (size > INT_MAX) 3984 done = decode_mbcs(&v, s, INT_MAX, 0); 3985 else 3986 #endif 3987 done = decode_mbcs(&v, s, (int)size, !consumed); 3988 3989 if (done < 0) { 3990 Py_XDECREF(v); 3991 return NULL; 3992 } 3993 3994 if (consumed) 3995 *consumed += done; 3996 3997 #ifdef NEED_RETRY 3998 if (size > INT_MAX) { 3999 s += done; 4000 size -= done; 4001 goto retry; 4002 } 4003 #endif 4004 4005 return (PyObject *)v; 4006 } 4007 4008 PyObject *PyUnicode_DecodeMBCS(const char *s, 4009 Py_ssize_t size, 4010 const char *errors) 4011 { 4012 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 4013 } 4014 4015 /* 4016 * Convert unicode into string object (MBCS). 4017 * Returns 0 if succeed, -1 otherwise. 4018 */ 4019 static int encode_mbcs(PyObject **repr, 4020 const Py_UNICODE *p, /* unicode */ 4021 int size) /* size of unicode */ 4022 { 4023 int mbcssize = 0; 4024 Py_ssize_t n = 0; 4025 4026 assert(size >= 0); 4027 4028 /* First get the size of the result */ 4029 if (size > 0) { 4030 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 4031 if (mbcssize == 0) { 4032 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4033 return -1; 4034 } 4035 } 4036 4037 if (*repr == NULL) { 4038 /* Create string object */ 4039 *repr = PyString_FromStringAndSize(NULL, mbcssize); 4040 if (*repr == NULL) 4041 return -1; 4042 } 4043 else { 4044 /* Extend string object */ 4045 n = PyString_Size(*repr); 4046 if (_PyString_Resize(repr, n + mbcssize) < 0) 4047 return -1; 4048 } 4049 4050 /* Do the conversion */ 4051 if (size > 0) { 4052 char *s = PyString_AS_STRING(*repr) + n; 4053 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 4054 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4055 return -1; 4056 } 4057 } 4058 4059 return 0; 4060 } 4061 4062 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 4063 Py_ssize_t size, 4064 const char *errors) 4065 { 4066 PyObject *repr = NULL; 4067 int ret; 4068 4069 #ifdef 
NEED_RETRY 4070 retry: 4071 if (size > INT_MAX) 4072 ret = encode_mbcs(&repr, p, INT_MAX); 4073 else 4074 #endif 4075 ret = encode_mbcs(&repr, p, (int)size); 4076 4077 if (ret < 0) { 4078 Py_XDECREF(repr); 4079 return NULL; 4080 } 4081 4082 #ifdef NEED_RETRY 4083 if (size > INT_MAX) { 4084 p += INT_MAX; 4085 size -= INT_MAX; 4086 goto retry; 4087 } 4088 #endif 4089 4090 return repr; 4091 } 4092 4093 PyObject *PyUnicode_AsMBCSString(PyObject *unicode) 4094 { 4095 if (!PyUnicode_Check(unicode)) { 4096 PyErr_BadArgument(); 4097 return NULL; 4098 } 4099 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 4100 PyUnicode_GET_SIZE(unicode), 4101 NULL); 4102 } 4103 4104 #undef NEED_RETRY 4105 4106 #endif /* MS_WINDOWS */ 4107 4108 /* --- Character Mapping Codec -------------------------------------------- */ 4109 4110 PyObject *PyUnicode_DecodeCharmap(const char *s, 4111 Py_ssize_t size, 4112 PyObject *mapping, 4113 const char *errors) 4114 { 4115 const char *starts = s; 4116 Py_ssize_t startinpos; 4117 Py_ssize_t endinpos; 4118 Py_ssize_t outpos; 4119 const char *e; 4120 PyUnicodeObject *v; 4121 Py_UNICODE *p; 4122 Py_ssize_t extrachars = 0; 4123 PyObject *errorHandler = NULL; 4124 PyObject *exc = NULL; 4125 Py_UNICODE *mapstring = NULL; 4126 Py_ssize_t maplen = 0; 4127 4128 /* Default to Latin-1 */ 4129 if (mapping == NULL) 4130 return PyUnicode_DecodeLatin1(s, size, errors); 4131 4132 v = _PyUnicode_New(size); 4133 if (v == NULL) 4134 goto onError; 4135 if (size == 0) 4136 return (PyObject *)v; 4137 p = PyUnicode_AS_UNICODE(v); 4138 e = s + size; 4139 if (PyUnicode_CheckExact(mapping)) { 4140 mapstring = PyUnicode_AS_UNICODE(mapping); 4141 maplen = PyUnicode_GET_SIZE(mapping); 4142 while (s < e) { 4143 unsigned char ch = *s; 4144 Py_UNICODE x = 0xfffe; /* illegal value */ 4145 4146 if (ch < maplen) 4147 x = mapstring[ch]; 4148 4149 if (x == 0xfffe) { 4150 /* undefined mapping */ 4151 outpos = p-PyUnicode_AS_UNICODE(v); 4152 startinpos = s-starts; 4153 endinpos = 
startinpos+1; 4154 if (unicode_decode_call_errorhandler( 4155 errors, &errorHandler, 4156 "charmap", "character maps to <undefined>", 4157 starts, size, &startinpos, &endinpos, &exc, &s, 4158 &v, &outpos, &p)) { 4159 goto onError; 4160 } 4161 continue; 4162 } 4163 *p++ = x; 4164 ++s; 4165 } 4166 } 4167 else { 4168 while (s < e) { 4169 unsigned char ch = *s; 4170 PyObject *w, *x; 4171 4172 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 4173 w = PyInt_FromLong((long)ch); 4174 if (w == NULL) 4175 goto onError; 4176 x = PyObject_GetItem(mapping, w); 4177 Py_DECREF(w); 4178 if (x == NULL) { 4179 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4180 /* No mapping found means: mapping is undefined. */ 4181 PyErr_Clear(); 4182 goto Undefined; 4183 } else 4184 goto onError; 4185 } 4186 4187 /* Apply mapping */ 4188 if (x == Py_None) 4189 goto Undefined; 4190 if (PyInt_Check(x)) { 4191 long value = PyInt_AS_LONG(x); 4192 if (value == 0xFFFE) 4193 goto Undefined; 4194 if (value < 0 || value > 0x10FFFF) { 4195 PyErr_SetString(PyExc_TypeError, 4196 "character mapping must be in range(0x110000)"); 4197 Py_DECREF(x); 4198 goto onError; 4199 } 4200 4201 #ifndef Py_UNICODE_WIDE 4202 if (value > 0xFFFF) { 4203 /* see the code for 1-n mapping below */ 4204 if (extrachars < 2) { 4205 /* resize first */ 4206 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 4207 Py_ssize_t needed = 10 - extrachars; 4208 extrachars += needed; 4209 /* XXX overflow detection missing */ 4210 if (_PyUnicode_Resize(&v, 4211 PyUnicode_GET_SIZE(v) + needed) < 0) { 4212 Py_DECREF(x); 4213 goto onError; 4214 } 4215 p = PyUnicode_AS_UNICODE(v) + oldpos; 4216 } 4217 value -= 0x10000; 4218 *p++ = 0xD800 | (value >> 10); 4219 *p++ = 0xDC00 | (value & 0x3FF); 4220 extrachars -= 2; 4221 } 4222 else 4223 #endif 4224 *p++ = (Py_UNICODE)value; 4225 } 4226 else if (PyUnicode_Check(x)) { 4227 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 4228 4229 if (targetsize == 1) { 4230 /* 1-1 mapping */ 4231 
Py_UNICODE value = *PyUnicode_AS_UNICODE(x); 4232 if (value == 0xFFFE) 4233 goto Undefined; 4234 *p++ = value; 4235 } 4236 else if (targetsize > 1) { 4237 /* 1-n mapping */ 4238 if (targetsize > extrachars) { 4239 /* resize first */ 4240 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 4241 Py_ssize_t needed = (targetsize - extrachars) + \ 4242 (targetsize << 2); 4243 extrachars += needed; 4244 /* XXX overflow detection missing */ 4245 if (_PyUnicode_Resize(&v, 4246 PyUnicode_GET_SIZE(v) + needed) < 0) { 4247 Py_DECREF(x); 4248 goto onError; 4249 } 4250 p = PyUnicode_AS_UNICODE(v) + oldpos; 4251 } 4252 Py_UNICODE_COPY(p, 4253 PyUnicode_AS_UNICODE(x), 4254 targetsize); 4255 p += targetsize; 4256 extrachars -= targetsize; 4257 } 4258 /* 1-0 mapping: skip the character */ 4259 } 4260 else { 4261 /* wrong return value */ 4262 PyErr_SetString(PyExc_TypeError, 4263 "character mapping must return integer, None or unicode"); 4264 Py_DECREF(x); 4265 goto onError; 4266 } 4267 Py_DECREF(x); 4268 ++s; 4269 continue; 4270 Undefined: 4271 /* undefined mapping */ 4272 Py_XDECREF(x); 4273 outpos = p-PyUnicode_AS_UNICODE(v); 4274 startinpos = s-starts; 4275 endinpos = startinpos+1; 4276 if (unicode_decode_call_errorhandler( 4277 errors, &errorHandler, 4278 "charmap", "character maps to <undefined>", 4279 starts, size, &startinpos, &endinpos, &exc, &s, 4280 &v, &outpos, &p)) { 4281 goto onError; 4282 } 4283 } 4284 } 4285 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 4286 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4287 goto onError; 4288 Py_XDECREF(errorHandler); 4289 Py_XDECREF(exc); 4290 return (PyObject *)v; 4291 4292 onError: 4293 Py_XDECREF(errorHandler); 4294 Py_XDECREF(exc); 4295 Py_XDECREF(v); 4296 return NULL; 4297 } 4298 4299 /* Charmap encoding: the lookup table */ 4300 4301 struct encoding_map{ 4302 PyObject_HEAD 4303 unsigned char level1[32]; 4304 int count2, count3; 4305 unsigned char level23[1]; 4306 }; 4307 4308 static PyObject* 4309 
encoding_map_size(PyObject *obj, PyObject* args) 4310 { 4311 struct encoding_map *map = (struct encoding_map*)obj; 4312 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 + 4313 128*map->count3); 4314 } 4315 4316 static PyMethodDef encoding_map_methods[] = { 4317 {"size", encoding_map_size, METH_NOARGS, 4318 PyDoc_STR("Return the size (in bytes) of this object") }, 4319 { 0 } 4320 }; 4321 4322 static void 4323 encoding_map_dealloc(PyObject* o) 4324 { 4325 PyObject_FREE(o); 4326 } 4327 4328 static PyTypeObject EncodingMapType = { 4329 PyVarObject_HEAD_INIT(NULL, 0) 4330 "EncodingMap", /*tp_name*/ 4331 sizeof(struct encoding_map), /*tp_basicsize*/ 4332 0, /*tp_itemsize*/ 4333 /* methods */ 4334 encoding_map_dealloc, /*tp_dealloc*/ 4335 0, /*tp_print*/ 4336 0, /*tp_getattr*/ 4337 0, /*tp_setattr*/ 4338 0, /*tp_compare*/ 4339 0, /*tp_repr*/ 4340 0, /*tp_as_number*/ 4341 0, /*tp_as_sequence*/ 4342 0, /*tp_as_mapping*/ 4343 0, /*tp_hash*/ 4344 0, /*tp_call*/ 4345 0, /*tp_str*/ 4346 0, /*tp_getattro*/ 4347 0, /*tp_setattro*/ 4348 0, /*tp_as_buffer*/ 4349 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 4350 0, /*tp_doc*/ 4351 0, /*tp_traverse*/ 4352 0, /*tp_clear*/ 4353 0, /*tp_richcompare*/ 4354 0, /*tp_weaklistoffset*/ 4355 0, /*tp_iter*/ 4356 0, /*tp_iternext*/ 4357 encoding_map_methods, /*tp_methods*/ 4358 0, /*tp_members*/ 4359 0, /*tp_getset*/ 4360 0, /*tp_base*/ 4361 0, /*tp_dict*/ 4362 0, /*tp_descr_get*/ 4363 0, /*tp_descr_set*/ 4364 0, /*tp_dictoffset*/ 4365 0, /*tp_init*/ 4366 0, /*tp_alloc*/ 4367 0, /*tp_new*/ 4368 0, /*tp_free*/ 4369 0, /*tp_is_gc*/ 4370 }; 4371 4372 PyObject* 4373 PyUnicode_BuildEncodingMap(PyObject* string) 4374 { 4375 Py_UNICODE *decode; 4376 PyObject *result; 4377 struct encoding_map *mresult; 4378 int i; 4379 int need_dict = 0; 4380 unsigned char level1[32]; 4381 unsigned char level2[512]; 4382 unsigned char *mlevel1, *mlevel2, *mlevel3; 4383 int count2 = 0, count3 = 0; 4384 4385 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) { 
4386 PyErr_BadArgument(); 4387 return NULL; 4388 } 4389 decode = PyUnicode_AS_UNICODE(string); 4390 memset(level1, 0xFF, sizeof level1); 4391 memset(level2, 0xFF, sizeof level2); 4392 4393 /* If there isn't a one-to-one mapping of NULL to \0, 4394 or if there are non-BMP characters, we need to use 4395 a mapping dictionary. */ 4396 if (decode[0] != 0) 4397 need_dict = 1; 4398 for (i = 1; i < 256; i++) { 4399 int l1, l2; 4400 if (decode[i] == 0 4401 #ifdef Py_UNICODE_WIDE 4402 || decode[i] > 0xFFFF 4403 #endif 4404 ) { 4405 need_dict = 1; 4406 break; 4407 } 4408 if (decode[i] == 0xFFFE) 4409 /* unmapped character */ 4410 continue; 4411 l1 = decode[i] >> 11; 4412 l2 = decode[i] >> 7; 4413 if (level1[l1] == 0xFF) 4414 level1[l1] = count2++; 4415 if (level2[l2] == 0xFF) 4416 level2[l2] = count3++; 4417 } 4418 4419 if (count2 >= 0xFF || count3 >= 0xFF) 4420 need_dict = 1; 4421 4422 if (need_dict) { 4423 PyObject *result = PyDict_New(); 4424 PyObject *key, *value; 4425 if (!result) 4426 return NULL; 4427 for (i = 0; i < 256; i++) { 4428 value = NULL; 4429 key = PyInt_FromLong(decode[i]); 4430 value = PyInt_FromLong(i); 4431 if (!key || !value) 4432 goto failed1; 4433 if (PyDict_SetItem(result, key, value) == -1) 4434 goto failed1; 4435 Py_DECREF(key); 4436 Py_DECREF(value); 4437 } 4438 return result; 4439 failed1: 4440 Py_XDECREF(key); 4441 Py_XDECREF(value); 4442 Py_DECREF(result); 4443 return NULL; 4444 } 4445 4446 /* Create a three-level trie */ 4447 result = PyObject_MALLOC(sizeof(struct encoding_map) + 4448 16*count2 + 128*count3 - 1); 4449 if (!result) 4450 return PyErr_NoMemory(); 4451 PyObject_Init(result, &EncodingMapType); 4452 mresult = (struct encoding_map*)result; 4453 mresult->count2 = count2; 4454 mresult->count3 = count3; 4455 mlevel1 = mresult->level1; 4456 mlevel2 = mresult->level23; 4457 mlevel3 = mresult->level23 + 16*count2; 4458 memcpy(mlevel1, level1, 32); 4459 memset(mlevel2, 0xFF, 16*count2); 4460 memset(mlevel3, 0, 128*count3); 4461 count3 = 0; 
4462 for (i = 1; i < 256; i++) { 4463 int o1, o2, o3, i2, i3; 4464 if (decode[i] == 0xFFFE) 4465 /* unmapped character */ 4466 continue; 4467 o1 = decode[i]>>11; 4468 o2 = (decode[i]>>7) & 0xF; 4469 i2 = 16*mlevel1[o1] + o2; 4470 if (mlevel2[i2] == 0xFF) 4471 mlevel2[i2] = count3++; 4472 o3 = decode[i] & 0x7F; 4473 i3 = 128*mlevel2[i2] + o3; 4474 mlevel3[i3] = i; 4475 } 4476 return result; 4477 } 4478 4479 static int 4480 encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 4481 { 4482 struct encoding_map *map = (struct encoding_map*)mapping; 4483 int l1 = c>>11; 4484 int l2 = (c>>7) & 0xF; 4485 int l3 = c & 0x7F; 4486 int i; 4487 4488 #ifdef Py_UNICODE_WIDE 4489 if (c > 0xFFFF) { 4490 return -1; 4491 } 4492 #endif 4493 if (c == 0) 4494 return 0; 4495 /* level 1*/ 4496 i = map->level1[l1]; 4497 if (i == 0xFF) { 4498 return -1; 4499 } 4500 /* level 2*/ 4501 i = map->level23[16*i+l2]; 4502 if (i == 0xFF) { 4503 return -1; 4504 } 4505 /* level 3 */ 4506 i = map->level23[16*map->count2 + 128*i + l3]; 4507 if (i == 0) { 4508 return -1; 4509 } 4510 return i; 4511 } 4512 4513 /* Lookup the character ch in the mapping. If the character 4514 can't be found, Py_None is returned (or NULL, if another 4515 error occurred). */ 4516 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 4517 { 4518 PyObject *w = PyInt_FromLong((long)c); 4519 PyObject *x; 4520 4521 if (w == NULL) 4522 return NULL; 4523 x = PyObject_GetItem(mapping, w); 4524 Py_DECREF(w); 4525 if (x == NULL) { 4526 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4527 /* No mapping found means: mapping is undefined. 
*/ 4528 PyErr_Clear(); 4529 x = Py_None; 4530 Py_INCREF(x); 4531 return x; 4532 } else 4533 return NULL; 4534 } 4535 else if (x == Py_None) 4536 return x; 4537 else if (PyInt_Check(x)) { 4538 long value = PyInt_AS_LONG(x); 4539 if (value < 0 || value > 255) { 4540 PyErr_SetString(PyExc_TypeError, 4541 "character mapping must be in range(256)"); 4542 Py_DECREF(x); 4543 return NULL; 4544 } 4545 return x; 4546 } 4547 else if (PyString_Check(x)) 4548 return x; 4549 else { 4550 /* wrong return value */ 4551 PyErr_SetString(PyExc_TypeError, 4552 "character mapping must return integer, None or str"); 4553 Py_DECREF(x); 4554 return NULL; 4555 } 4556 } 4557 4558 static int 4559 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 4560 { 4561 Py_ssize_t outsize = PyString_GET_SIZE(*outobj); 4562 /* exponentially overallocate to minimize reallocations */ 4563 if (requiredsize < 2*outsize) 4564 requiredsize = 2*outsize; 4565 if (_PyString_Resize(outobj, requiredsize)) { 4566 return 0; 4567 } 4568 return 1; 4569 } 4570 4571 typedef enum charmapencode_result { 4572 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 4573 }charmapencode_result; 4574 /* lookup the character, put the result in the output string and adjust 4575 various state variables. Reallocate the output string if not enough 4576 space is available. Return a new reference to the object that 4577 was put in the output buffer, or Py_None, if the mapping was undefined 4578 (in which case no character was written) or NULL, if a 4579 reallocation error occurred. 
The caller must decref the result */ 4580 static 4581 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, 4582 PyObject **outobj, Py_ssize_t *outpos) 4583 { 4584 PyObject *rep; 4585 char *outstart; 4586 Py_ssize_t outsize = PyString_GET_SIZE(*outobj); 4587 4588 if (Py_TYPE(mapping) == &EncodingMapType) { 4589 int res = encoding_map_lookup(c, mapping); 4590 Py_ssize_t requiredsize = *outpos+1; 4591 if (res == -1) 4592 return enc_FAILED; 4593 if (outsize<requiredsize) 4594 if (!charmapencode_resize(outobj, outpos, requiredsize)) 4595 return enc_EXCEPTION; 4596 outstart = PyString_AS_STRING(*outobj); 4597 outstart[(*outpos)++] = (char)res; 4598 return enc_SUCCESS; 4599 } 4600 4601 rep = charmapencode_lookup(c, mapping); 4602 if (rep==NULL) 4603 return enc_EXCEPTION; 4604 else if (rep==Py_None) { 4605 Py_DECREF(rep); 4606 return enc_FAILED; 4607 } else { 4608 if (PyInt_Check(rep)) { 4609 Py_ssize_t requiredsize = *outpos+1; 4610 if (outsize<requiredsize) 4611 if (!charmapencode_resize(outobj, outpos, requiredsize)) { 4612 Py_DECREF(rep); 4613 return enc_EXCEPTION; 4614 } 4615 outstart = PyString_AS_STRING(*outobj); 4616 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep); 4617 } 4618 else { 4619 const char *repchars = PyString_AS_STRING(rep); 4620 Py_ssize_t repsize = PyString_GET_SIZE(rep); 4621 Py_ssize_t requiredsize = *outpos+repsize; 4622 if (outsize<requiredsize) 4623 if (!charmapencode_resize(outobj, outpos, requiredsize)) { 4624 Py_DECREF(rep); 4625 return enc_EXCEPTION; 4626 } 4627 outstart = PyString_AS_STRING(*outobj); 4628 memcpy(outstart + *outpos, repchars, repsize); 4629 *outpos += repsize; 4630 } 4631 } 4632 Py_DECREF(rep); 4633 return enc_SUCCESS; 4634 } 4635 4636 /* handle an error in PyUnicode_EncodeCharmap 4637 Return 0 on success, -1 on error */ 4638 static 4639 int charmap_encoding_error( 4640 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 4641 PyObject **exceptionObject, 4642 int 
*known_errorHandler, PyObject **errorHandler, const char *errors, 4643 PyObject **res, Py_ssize_t *respos) 4644 { 4645 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 4646 Py_ssize_t repsize; 4647 Py_ssize_t newpos; 4648 Py_UNICODE *uni2; 4649 /* startpos for collecting unencodable chars */ 4650 Py_ssize_t collstartpos = *inpos; 4651 Py_ssize_t collendpos = *inpos+1; 4652 Py_ssize_t collpos; 4653 char *encoding = "charmap"; 4654 char *reason = "character maps to <undefined>"; 4655 charmapencode_result x; 4656 4657 /* find all unencodable characters */ 4658 while (collendpos < size) { 4659 PyObject *rep; 4660 if (Py_TYPE(mapping) == &EncodingMapType) { 4661 int res = encoding_map_lookup(p[collendpos], mapping); 4662 if (res != -1) 4663 break; 4664 ++collendpos; 4665 continue; 4666 } 4667 4668 rep = charmapencode_lookup(p[collendpos], mapping); 4669 if (rep==NULL) 4670 return -1; 4671 else if (rep!=Py_None) { 4672 Py_DECREF(rep); 4673 break; 4674 } 4675 Py_DECREF(rep); 4676 ++collendpos; 4677 } 4678 /* cache callback name lookup 4679 * (if not done yet, i.e. 
it's the first error) */ 4680 if (*known_errorHandler==-1) { 4681 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4682 *known_errorHandler = 1; 4683 else if (!strcmp(errors, "replace")) 4684 *known_errorHandler = 2; 4685 else if (!strcmp(errors, "ignore")) 4686 *known_errorHandler = 3; 4687 else if (!strcmp(errors, "xmlcharrefreplace")) 4688 *known_errorHandler = 4; 4689 else 4690 *known_errorHandler = 0; 4691 } 4692 switch (*known_errorHandler) { 4693 case 1: /* strict */ 4694 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4695 return -1; 4696 case 2: /* replace */ 4697 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 4698 x = charmapencode_output('?', mapping, res, respos); 4699 if (x==enc_EXCEPTION) { 4700 return -1; 4701 } 4702 else if (x==enc_FAILED) { 4703 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4704 return -1; 4705 } 4706 } 4707 /* fall through */ 4708 case 3: /* ignore */ 4709 *inpos = collendpos; 4710 break; 4711 case 4: /* xmlcharrefreplace */ 4712 /* generate replacement */ 4713 for (collpos = collstartpos; collpos < collendpos;) { 4714 char buffer[2+29+1+1]; 4715 char *cp; 4716 Py_UCS4 ch = p[collpos++]; 4717 #ifndef Py_UNICODE_WIDE 4718 if ((0xD800 <= ch && ch <= 0xDBFF) && 4719 (collpos < collendpos) && 4720 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) { 4721 ch = ((((ch & 0x03FF) << 10) | 4722 ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000); 4723 } 4724 #endif 4725 sprintf(buffer, "&#%d;", (int)ch); 4726 for (cp = buffer; *cp; ++cp) { 4727 x = charmapencode_output(*cp, mapping, res, respos); 4728 if (x==enc_EXCEPTION) 4729 return -1; 4730 else if (x==enc_FAILED) { 4731 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4732 return -1; 4733 } 4734 } 4735 } 4736 *inpos = collendpos; 4737 break; 4738 default: 4739 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 4740 encoding, 
reason, p, size, exceptionObject, 4741 collstartpos, collendpos, &newpos); 4742 if (repunicode == NULL) 4743 return -1; 4744 /* generate replacement */ 4745 repsize = PyUnicode_GET_SIZE(repunicode); 4746 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 4747 x = charmapencode_output(*uni2, mapping, res, respos); 4748 if (x==enc_EXCEPTION) { 4749 return -1; 4750 } 4751 else if (x==enc_FAILED) { 4752 Py_DECREF(repunicode); 4753 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4754 return -1; 4755 } 4756 } 4757 *inpos = newpos; 4758 Py_DECREF(repunicode); 4759 } 4760 return 0; 4761 } 4762 4763 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 4764 Py_ssize_t size, 4765 PyObject *mapping, 4766 const char *errors) 4767 { 4768 /* output object */ 4769 PyObject *res = NULL; 4770 /* current input position */ 4771 Py_ssize_t inpos = 0; 4772 /* current output position */ 4773 Py_ssize_t respos = 0; 4774 PyObject *errorHandler = NULL; 4775 PyObject *exc = NULL; 4776 /* the following variable is used for caching string comparisons 4777 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 4778 * 3=ignore, 4=xmlcharrefreplace */ 4779 int known_errorHandler = -1; 4780 4781 /* Default to Latin-1 */ 4782 if (mapping == NULL) 4783 return PyUnicode_EncodeLatin1(p, size, errors); 4784 4785 /* allocate enough for a simple encoding without 4786 replacements, if we need more, we'll resize */ 4787 res = PyString_FromStringAndSize(NULL, size); 4788 if (res == NULL) 4789 goto onError; 4790 if (size == 0) 4791 return res; 4792 4793 while (inpos<size) { 4794 /* try to encode it */ 4795 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 4796 if (x==enc_EXCEPTION) /* error */ 4797 goto onError; 4798 if (x==enc_FAILED) { /* unencodable character */ 4799 if (charmap_encoding_error(p, size, &inpos, mapping, 4800 &exc, 4801 &known_errorHandler, &errorHandler, errors, 4802 &res, &respos)) { 4803 goto 
onError; 4804 } 4805 } 4806 else 4807 /* done with this character => adjust input position */ 4808 ++inpos; 4809 } 4810 4811 /* Resize if we allocated to much */ 4812 if (respos<PyString_GET_SIZE(res)) { 4813 if (_PyString_Resize(&res, respos)) 4814 goto onError; 4815 } 4816 Py_XDECREF(exc); 4817 Py_XDECREF(errorHandler); 4818 return res; 4819 4820 onError: 4821 Py_XDECREF(res); 4822 Py_XDECREF(exc); 4823 Py_XDECREF(errorHandler); 4824 return NULL; 4825 } 4826 4827 PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 4828 PyObject *mapping) 4829 { 4830 if (!PyUnicode_Check(unicode) || mapping == NULL) { 4831 PyErr_BadArgument(); 4832 return NULL; 4833 } 4834 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 4835 PyUnicode_GET_SIZE(unicode), 4836 mapping, 4837 NULL); 4838 } 4839 4840 /* create or adjust a UnicodeTranslateError */ 4841 static void make_translate_exception(PyObject **exceptionObject, 4842 const Py_UNICODE *unicode, Py_ssize_t size, 4843 Py_ssize_t startpos, Py_ssize_t endpos, 4844 const char *reason) 4845 { 4846 if (*exceptionObject == NULL) { 4847 *exceptionObject = PyUnicodeTranslateError_Create( 4848 unicode, size, startpos, endpos, reason); 4849 } 4850 else { 4851 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 4852 goto onError; 4853 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 4854 goto onError; 4855 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 4856 goto onError; 4857 return; 4858 onError: 4859 Py_CLEAR(*exceptionObject); 4860 } 4861 } 4862 4863 /* raises a UnicodeTranslateError */ 4864 static void raise_translate_exception(PyObject **exceptionObject, 4865 const Py_UNICODE *unicode, Py_ssize_t size, 4866 Py_ssize_t startpos, Py_ssize_t endpos, 4867 const char *reason) 4868 { 4869 make_translate_exception(exceptionObject, 4870 unicode, size, startpos, endpos, reason); 4871 if (*exceptionObject != NULL) 4872 PyCodec_StrictErrors(*exceptionObject); 4873 } 4874 4875 /* error 
handling callback helper: 4876 build arguments, call the callback and check the arguments, 4877 put the result into newpos and return the replacement string, which 4878 has to be freed by the caller */ 4879 static PyObject *unicode_translate_call_errorhandler(const char *errors, 4880 PyObject **errorHandler, 4881 const char *reason, 4882 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 4883 Py_ssize_t startpos, Py_ssize_t endpos, 4884 Py_ssize_t *newpos) 4885 { 4886 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple"; 4887 4888 Py_ssize_t i_newpos; 4889 PyObject *restuple; 4890 PyObject *resunicode; 4891 4892 if (*errorHandler == NULL) { 4893 *errorHandler = PyCodec_LookupError(errors); 4894 if (*errorHandler == NULL) 4895 return NULL; 4896 } 4897 4898 make_translate_exception(exceptionObject, 4899 unicode, size, startpos, endpos, reason); 4900 if (*exceptionObject == NULL) 4901 return NULL; 4902 4903 restuple = PyObject_CallFunctionObjArgs( 4904 *errorHandler, *exceptionObject, NULL); 4905 if (restuple == NULL) 4906 return NULL; 4907 if (!PyTuple_Check(restuple)) { 4908 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4909 Py_DECREF(restuple); 4910 return NULL; 4911 } 4912 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 4913 &resunicode, &i_newpos)) { 4914 Py_DECREF(restuple); 4915 return NULL; 4916 } 4917 if (i_newpos<0) 4918 *newpos = size+i_newpos; 4919 else 4920 *newpos = i_newpos; 4921 if (*newpos<0 || *newpos>size) { 4922 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 4923 Py_DECREF(restuple); 4924 return NULL; 4925 } 4926 Py_INCREF(resunicode); 4927 Py_DECREF(restuple); 4928 return resunicode; 4929 } 4930 4931 /* Lookup the character ch in the mapping and put the result in result, 4932 which must be decrefed by the caller. 
4933 Return 0 on success, -1 on error */ 4934 static 4935 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 4936 { 4937 PyObject *w = PyInt_FromLong((long)c); 4938 PyObject *x; 4939 4940 if (w == NULL) 4941 return -1; 4942 x = PyObject_GetItem(mapping, w); 4943 Py_DECREF(w); 4944 if (x == NULL) { 4945 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4946 /* No mapping found means: use 1:1 mapping. */ 4947 PyErr_Clear(); 4948 *result = NULL; 4949 return 0; 4950 } else 4951 return -1; 4952 } 4953 else if (x == Py_None) { 4954 *result = x; 4955 return 0; 4956 } 4957 else if (PyInt_Check(x)) { 4958 long value = PyInt_AS_LONG(x); 4959 long max = PyUnicode_GetMax(); 4960 if (value < 0 || value > max) { 4961 PyErr_Format(PyExc_TypeError, 4962 "character mapping must be in range(0x%lx)", max+1); 4963 Py_DECREF(x); 4964 return -1; 4965 } 4966 *result = x; 4967 return 0; 4968 } 4969 else if (PyUnicode_Check(x)) { 4970 *result = x; 4971 return 0; 4972 } 4973 else { 4974 /* wrong return value */ 4975 PyErr_SetString(PyExc_TypeError, 4976 "character mapping must return integer, None or unicode"); 4977 Py_DECREF(x); 4978 return -1; 4979 } 4980 } 4981 /* ensure that *outobj is at least requiredsize characters long, 4982 if not reallocate and adjust various state variables. 
4983 Return 0 on success, -1 on error */ 4984 static 4985 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, 4986 Py_ssize_t requiredsize) 4987 { 4988 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj); 4989 if (requiredsize > oldsize) { 4990 /* remember old output position */ 4991 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 4992 /* exponentially overallocate to minimize reallocations */ 4993 if (requiredsize < 2 * oldsize) 4994 requiredsize = 2 * oldsize; 4995 if (PyUnicode_Resize(outobj, requiredsize) < 0) 4996 return -1; 4997 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 4998 } 4999 return 0; 5000 } 5001 /* lookup the character, put the result in the output string and adjust 5002 various state variables. Return a new reference to the object that 5003 was put in the output buffer in *result, or Py_None, if the mapping was 5004 undefined (in which case no character was written). 5005 The called must decref result. 5006 Return 0 on success, -1 on error. */ 5007 static 5008 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp, 5009 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp, 5010 PyObject **res) 5011 { 5012 if (charmaptranslate_lookup(*curinp, mapping, res)) 5013 return -1; 5014 if (*res==NULL) { 5015 /* not found => default to 1:1 mapping */ 5016 *(*outp)++ = *curinp; 5017 } 5018 else if (*res==Py_None) 5019 ; 5020 else if (PyInt_Check(*res)) { 5021 /* no overflow check, because we know that the space is enough */ 5022 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res); 5023 } 5024 else if (PyUnicode_Check(*res)) { 5025 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); 5026 if (repsize==1) { 5027 /* no overflow check, because we know that the space is enough */ 5028 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 5029 } 5030 else if (repsize!=0) { 5031 /* more than one character */ 5032 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + 5033 (insize - (curinp-startinp)) + 
5034 repsize - 1; 5035 if (charmaptranslate_makespace(outobj, outp, requiredsize)) 5036 return -1; 5037 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 5038 *outp += repsize; 5039 } 5040 } 5041 else 5042 return -1; 5043 return 0; 5044 } 5045 5046 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, 5047 Py_ssize_t size, 5048 PyObject *mapping, 5049 const char *errors) 5050 { 5051 /* output object */ 5052 PyObject *res = NULL; 5053 /* pointers to the beginning and end+1 of input */ 5054 const Py_UNICODE *startp = p; 5055 const Py_UNICODE *endp = p + size; 5056 /* pointer into the output */ 5057 Py_UNICODE *str; 5058 /* current output position */ 5059 Py_ssize_t respos = 0; 5060 char *reason = "character maps to <undefined>"; 5061 PyObject *errorHandler = NULL; 5062 PyObject *exc = NULL; 5063 /* the following variable is used for caching string comparisons 5064 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 5065 * 3=ignore, 4=xmlcharrefreplace */ 5066 int known_errorHandler = -1; 5067 5068 if (mapping == NULL) { 5069 PyErr_BadArgument(); 5070 return NULL; 5071 } 5072 5073 /* allocate enough for a simple 1:1 translation without 5074 replacements, if we need more, we'll resize */ 5075 res = PyUnicode_FromUnicode(NULL, size); 5076 if (res == NULL) 5077 goto onError; 5078 if (size == 0) 5079 return res; 5080 str = PyUnicode_AS_UNICODE(res); 5081 5082 while (p<endp) { 5083 /* try to encode it */ 5084 PyObject *x = NULL; 5085 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) { 5086 Py_XDECREF(x); 5087 goto onError; 5088 } 5089 Py_XDECREF(x); 5090 if (x!=Py_None) /* it worked => adjust input pointer */ 5091 ++p; 5092 else { /* untranslatable character */ 5093 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 5094 Py_ssize_t repsize; 5095 Py_ssize_t newpos; 5096 Py_UNICODE *uni2; 5097 /* startpos for collecting untranslatable chars */ 5098 const Py_UNICODE *collstart = p; 5099 const Py_UNICODE 
*collend = p+1; 5100 const Py_UNICODE *coll; 5101 5102 /* find all untranslatable characters */ 5103 while (collend < endp) { 5104 if (charmaptranslate_lookup(*collend, mapping, &x)) 5105 goto onError; 5106 Py_XDECREF(x); 5107 if (x!=Py_None) 5108 break; 5109 ++collend; 5110 } 5111 /* cache callback name lookup 5112 * (if not done yet, i.e. it's the first error) */ 5113 if (known_errorHandler==-1) { 5114 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5115 known_errorHandler = 1; 5116 else if (!strcmp(errors, "replace")) 5117 known_errorHandler = 2; 5118 else if (!strcmp(errors, "ignore")) 5119 known_errorHandler = 3; 5120 else if (!strcmp(errors, "xmlcharrefreplace")) 5121 known_errorHandler = 4; 5122 else 5123 known_errorHandler = 0; 5124 } 5125 switch (known_errorHandler) { 5126 case 1: /* strict */ 5127 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 5128 goto onError; 5129 case 2: /* replace */ 5130 /* No need to check for space, this is a 1:1 replacement */ 5131 for (coll = collstart; coll<collend; ++coll) 5132 *str++ = '?'; 5133 /* fall through */ 5134 case 3: /* ignore */ 5135 p = collend; 5136 break; 5137 case 4: /* xmlcharrefreplace */ 5138 /* generate replacement (temporarily (mis)uses p) */ 5139 for (p = collstart; p < collend;) { 5140 char buffer[2+29+1+1]; 5141 char *cp; 5142 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend); 5143 sprintf(buffer, "&#%d;", (int)ch); 5144 if (charmaptranslate_makespace(&res, &str, 5145 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 5146 goto onError; 5147 for (cp = buffer; *cp; ++cp) 5148 *str++ = *cp; 5149 } 5150 p = collend; 5151 break; 5152 default: 5153 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 5154 reason, startp, size, &exc, 5155 collstart-startp, collend-startp, &newpos); 5156 if (repunicode == NULL) 5157 goto onError; 5158 /* generate replacement */ 5159 repsize = PyUnicode_GET_SIZE(repunicode); 5160 if 
(charmaptranslate_makespace(&res, &str, 5161 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 5162 Py_DECREF(repunicode); 5163 goto onError; 5164 } 5165 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 5166 *str++ = *uni2; 5167 p = startp + newpos; 5168 Py_DECREF(repunicode); 5169 } 5170 } 5171 } 5172 /* Resize if we allocated to much */ 5173 respos = str-PyUnicode_AS_UNICODE(res); 5174 if (respos<PyUnicode_GET_SIZE(res)) { 5175 if (PyUnicode_Resize(&res, respos) < 0) 5176 goto onError; 5177 } 5178 Py_XDECREF(exc); 5179 Py_XDECREF(errorHandler); 5180 return res; 5181 5182 onError: 5183 Py_XDECREF(res); 5184 Py_XDECREF(exc); 5185 Py_XDECREF(errorHandler); 5186 return NULL; 5187 } 5188 5189 PyObject *PyUnicode_Translate(PyObject *str, 5190 PyObject *mapping, 5191 const char *errors) 5192 { 5193 PyObject *result; 5194 5195 str = PyUnicode_FromObject(str); 5196 if (str == NULL) 5197 goto onError; 5198 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 5199 PyUnicode_GET_SIZE(str), 5200 mapping, 5201 errors); 5202 Py_DECREF(str); 5203 return result; 5204 5205 onError: 5206 Py_XDECREF(str); 5207 return NULL; 5208 } 5209 5210 /* --- Decimal Encoder ---------------------------------------------------- */ 5211 5212 int PyUnicode_EncodeDecimal(Py_UNICODE *s, 5213 Py_ssize_t length, 5214 char *output, 5215 const char *errors) 5216 { 5217 Py_UNICODE *p, *end; 5218 PyObject *errorHandler = NULL; 5219 PyObject *exc = NULL; 5220 const char *encoding = "decimal"; 5221 const char *reason = "invalid decimal Unicode string"; 5222 /* the following variable is used for caching string comparisons 5223 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 5224 int known_errorHandler = -1; 5225 5226 if (output == NULL) { 5227 PyErr_BadArgument(); 5228 return -1; 5229 } 5230 5231 p = s; 5232 end = s + length; 5233 while (p < end) { 5234 register Py_UNICODE ch = *p; 5235 int decimal; 5236 PyObject *repunicode; 5237 
Py_ssize_t repsize; 5238 Py_ssize_t newpos; 5239 Py_UNICODE *uni2; 5240 Py_UNICODE *collstart; 5241 Py_UNICODE *collend; 5242 5243 if (Py_UNICODE_ISSPACE(ch)) { 5244 *output++ = ' '; 5245 ++p; 5246 continue; 5247 } 5248 decimal = Py_UNICODE_TODECIMAL(ch); 5249 if (decimal >= 0) { 5250 *output++ = '0' + decimal; 5251 ++p; 5252 continue; 5253 } 5254 if (0 < ch && ch < 256) { 5255 *output++ = (char)ch; 5256 ++p; 5257 continue; 5258 } 5259 /* All other characters are considered unencodable */ 5260 collstart = p; 5261 for (collend = p+1; collend < end; collend++) { 5262 if ((0 < *collend && *collend < 256) || 5263 Py_UNICODE_ISSPACE(*collend) || 5264 0 <= Py_UNICODE_TODECIMAL(*collend)) 5265 break; 5266 } 5267 /* cache callback name lookup 5268 * (if not done yet, i.e. it's the first error) */ 5269 if (known_errorHandler==-1) { 5270 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5271 known_errorHandler = 1; 5272 else if (!strcmp(errors, "replace")) 5273 known_errorHandler = 2; 5274 else if (!strcmp(errors, "ignore")) 5275 known_errorHandler = 3; 5276 else if (!strcmp(errors, "xmlcharrefreplace")) 5277 known_errorHandler = 4; 5278 else 5279 known_errorHandler = 0; 5280 } 5281 switch (known_errorHandler) { 5282 case 1: /* strict */ 5283 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 5284 goto onError; 5285 case 2: /* replace */ 5286 for (p = collstart; p < collend; ++p) 5287 *output++ = '?'; 5288 /* fall through */ 5289 case 3: /* ignore */ 5290 p = collend; 5291 break; 5292 case 4: /* xmlcharrefreplace */ 5293 /* generate replacement (temporarily (mis)uses p) */ 5294 for (p = collstart; p < collend;) { 5295 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend); 5296 output += sprintf(output, "&#%d;", ch); 5297 } 5298 p = collend; 5299 break; 5300 default: 5301 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 5302 encoding, reason, s, length, &exc, 5303 collstart-s, collend-s, &newpos); 5304 if (repunicode == NULL) 5305 goto 
onError; 5306 /* generate replacement */ 5307 repsize = PyUnicode_GET_SIZE(repunicode); 5308 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 5309 Py_UNICODE ch = *uni2; 5310 if (Py_UNICODE_ISSPACE(ch)) 5311 *output++ = ' '; 5312 else { 5313 decimal = Py_UNICODE_TODECIMAL(ch); 5314 if (decimal >= 0) 5315 *output++ = '0' + decimal; 5316 else if (0 < ch && ch < 256) 5317 *output++ = (char)ch; 5318 else { 5319 Py_DECREF(repunicode); 5320 raise_encode_exception(&exc, encoding, 5321 s, length, collstart-s, collend-s, reason); 5322 goto onError; 5323 } 5324 } 5325 } 5326 p = s + newpos; 5327 Py_DECREF(repunicode); 5328 } 5329 } 5330 /* 0-terminate the output string */ 5331 *output++ = '\0'; 5332 Py_XDECREF(exc); 5333 Py_XDECREF(errorHandler); 5334 return 0; 5335 5336 onError: 5337 Py_XDECREF(exc); 5338 Py_XDECREF(errorHandler); 5339 return -1; 5340 } 5341 5342 /* --- Helpers ------------------------------------------------------------ */ 5343 5344 #include "stringlib/unicodedefs.h" 5345 #include "stringlib/fastsearch.h" 5346 5347 #include "stringlib/count.h" 5348 #include "stringlib/find.h" 5349 #include "stringlib/partition.h" 5350 #include "stringlib/split.h" 5351 5352 /* helper macro to fixup start/end slice values */ 5353 #define ADJUST_INDICES(start, end, len) \ 5354 if (end > len) \ 5355 end = len; \ 5356 else if (end < 0) { \ 5357 end += len; \ 5358 if (end < 0) \ 5359 end = 0; \ 5360 } \ 5361 if (start < 0) { \ 5362 start += len; \ 5363 if (start < 0) \ 5364 start = 0; \ 5365 } 5366 5367 Py_ssize_t PyUnicode_Count(PyObject *str, 5368 PyObject *substr, 5369 Py_ssize_t start, 5370 Py_ssize_t end) 5371 { 5372 Py_ssize_t result; 5373 PyUnicodeObject* str_obj; 5374 PyUnicodeObject* sub_obj; 5375 5376 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 5377 if (!str_obj) 5378 return -1; 5379 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 5380 if (!sub_obj) { 5381 Py_DECREF(str_obj); 5382 return -1; 5383 } 5384 5385 
ADJUST_INDICES(start, end, str_obj->length); 5386 result = stringlib_count( 5387 str_obj->str + start, end - start, sub_obj->str, sub_obj->length, 5388 PY_SSIZE_T_MAX 5389 ); 5390 5391 Py_DECREF(sub_obj); 5392 Py_DECREF(str_obj); 5393 5394 return result; 5395 } 5396 5397 Py_ssize_t PyUnicode_Find(PyObject *str, 5398 PyObject *sub, 5399 Py_ssize_t start, 5400 Py_ssize_t end, 5401 int direction) 5402 { 5403 Py_ssize_t result; 5404 5405 str = PyUnicode_FromObject(str); 5406 if (!str) 5407 return -2; 5408 sub = PyUnicode_FromObject(sub); 5409 if (!sub) { 5410 Py_DECREF(str); 5411 return -2; 5412 } 5413 5414 if (direction > 0) 5415 result = stringlib_find_slice( 5416 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 5417 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 5418 start, end 5419 ); 5420 else 5421 result = stringlib_rfind_slice( 5422 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 5423 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 5424 start, end 5425 ); 5426 5427 Py_DECREF(str); 5428 Py_DECREF(sub); 5429 5430 return result; 5431 } 5432 5433 static 5434 int tailmatch(PyUnicodeObject *self, 5435 PyUnicodeObject *substring, 5436 Py_ssize_t start, 5437 Py_ssize_t end, 5438 int direction) 5439 { 5440 if (substring->length == 0) 5441 return 1; 5442 5443 ADJUST_INDICES(start, end, self->length); 5444 end -= substring->length; 5445 if (end < start) 5446 return 0; 5447 5448 if (direction > 0) { 5449 if (Py_UNICODE_MATCH(self, end, substring)) 5450 return 1; 5451 } else { 5452 if (Py_UNICODE_MATCH(self, start, substring)) 5453 return 1; 5454 } 5455 5456 return 0; 5457 } 5458 5459 Py_ssize_t PyUnicode_Tailmatch(PyObject *str, 5460 PyObject *substr, 5461 Py_ssize_t start, 5462 Py_ssize_t end, 5463 int direction) 5464 { 5465 Py_ssize_t result; 5466 5467 str = PyUnicode_FromObject(str); 5468 if (str == NULL) 5469 return -1; 5470 substr = PyUnicode_FromObject(substr); 5471 if (substr == NULL) { 5472 Py_DECREF(str); 5473 return -1; 5474 } 5475 5476 result = 
tailmatch((PyUnicodeObject *)str, 5477 (PyUnicodeObject *)substr, 5478 start, end, direction); 5479 Py_DECREF(str); 5480 Py_DECREF(substr); 5481 return result; 5482 } 5483 5484 /* Apply fixfct filter to the Unicode object self and return a 5485 reference to the modified object */ 5486 5487 static 5488 PyObject *fixup(PyUnicodeObject *self, 5489 int (*fixfct)(PyUnicodeObject *s)) 5490 { 5491 5492 PyUnicodeObject *u; 5493 5494 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5495 if (u == NULL) 5496 return NULL; 5497 5498 Py_UNICODE_COPY(u->str, self->str, self->length); 5499 5500 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 5501 /* fixfct should return TRUE if it modified the buffer. If 5502 FALSE, return a reference to the original buffer instead 5503 (to save space, not time) */ 5504 Py_INCREF(self); 5505 Py_DECREF(u); 5506 return (PyObject*) self; 5507 } 5508 return (PyObject*) u; 5509 } 5510 5511 static 5512 int fixupper(PyUnicodeObject *self) 5513 { 5514 Py_ssize_t len = self->length; 5515 Py_UNICODE *s = self->str; 5516 int status = 0; 5517 5518 while (len-- > 0) { 5519 register Py_UNICODE ch; 5520 5521 ch = Py_UNICODE_TOUPPER(*s); 5522 if (ch != *s) { 5523 status = 1; 5524 *s = ch; 5525 } 5526 s++; 5527 } 5528 5529 return status; 5530 } 5531 5532 static 5533 int fixlower(PyUnicodeObject *self) 5534 { 5535 Py_ssize_t len = self->length; 5536 Py_UNICODE *s = self->str; 5537 int status = 0; 5538 5539 while (len-- > 0) { 5540 register Py_UNICODE ch; 5541 5542 ch = Py_UNICODE_TOLOWER(*s); 5543 if (ch != *s) { 5544 status = 1; 5545 *s = ch; 5546 } 5547 s++; 5548 } 5549 5550 return status; 5551 } 5552 5553 static 5554 int fixswapcase(PyUnicodeObject *self) 5555 { 5556 Py_ssize_t len = self->length; 5557 Py_UNICODE *s = self->str; 5558 int status = 0; 5559 5560 while (len-- > 0) { 5561 if (Py_UNICODE_ISUPPER(*s)) { 5562 *s = Py_UNICODE_TOLOWER(*s); 5563 status = 1; 5564 } else if (Py_UNICODE_ISLOWER(*s)) { 5565 *s = Py_UNICODE_TOUPPER(*s); 5566 
status = 1; 5567 } 5568 s++; 5569 } 5570 5571 return status; 5572 } 5573 5574 static 5575 int fixcapitalize(PyUnicodeObject *self) 5576 { 5577 Py_ssize_t len = self->length; 5578 Py_UNICODE *s = self->str; 5579 int status = 0; 5580 5581 if (len == 0) 5582 return 0; 5583 if (!Py_UNICODE_ISUPPER(*s)) { 5584 *s = Py_UNICODE_TOUPPER(*s); 5585 status = 1; 5586 } 5587 s++; 5588 while (--len > 0) { 5589 if (!Py_UNICODE_ISLOWER(*s)) { 5590 *s = Py_UNICODE_TOLOWER(*s); 5591 status = 1; 5592 } 5593 s++; 5594 } 5595 return status; 5596 } 5597 5598 static 5599 int fixtitle(PyUnicodeObject *self) 5600 { 5601 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5602 register Py_UNICODE *e; 5603 int previous_is_cased; 5604 5605 /* Shortcut for single character strings */ 5606 if (PyUnicode_GET_SIZE(self) == 1) { 5607 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 5608 if (*p != ch) { 5609 *p = ch; 5610 return 1; 5611 } 5612 else 5613 return 0; 5614 } 5615 5616 e = p + PyUnicode_GET_SIZE(self); 5617 previous_is_cased = 0; 5618 for (; p < e; p++) { 5619 register const Py_UNICODE ch = *p; 5620 5621 if (previous_is_cased) 5622 *p = Py_UNICODE_TOLOWER(ch); 5623 else 5624 *p = Py_UNICODE_TOTITLE(ch); 5625 5626 if (Py_UNICODE_ISLOWER(ch) || 5627 Py_UNICODE_ISUPPER(ch) || 5628 Py_UNICODE_ISTITLE(ch)) 5629 previous_is_cased = 1; 5630 else 5631 previous_is_cased = 0; 5632 } 5633 return 1; 5634 } 5635 5636 PyObject * 5637 PyUnicode_Join(PyObject *separator, PyObject *seq) 5638 { 5639 PyObject *internal_separator = NULL; 5640 const Py_UNICODE blank = ' '; 5641 const Py_UNICODE *sep = ␣ 5642 Py_ssize_t seplen = 1; 5643 PyUnicodeObject *res = NULL; /* the result */ 5644 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */ 5645 Py_ssize_t res_used; /* # used bytes */ 5646 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ 5647 PyObject *fseq; /* PySequence_Fast(seq) */ 5648 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 5649 PyObject *item; 5650 
Py_ssize_t i; 5651 5652 fseq = PySequence_Fast(seq, "can only join an iterable"); 5653 if (fseq == NULL) { 5654 return NULL; 5655 } 5656 5657 /* Grrrr. A codec may be invoked to convert str objects to 5658 * Unicode, and so it's possible to call back into Python code 5659 * during PyUnicode_FromObject(), and so it's possible for a sick 5660 * codec to change the size of fseq (if seq is a list). Therefore 5661 * we have to keep refetching the size -- can't assume seqlen 5662 * is invariant. 5663 */ 5664 seqlen = PySequence_Fast_GET_SIZE(fseq); 5665 /* If empty sequence, return u"". */ 5666 if (seqlen == 0) { 5667 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 5668 goto Done; 5669 } 5670 /* If singleton sequence with an exact Unicode, return that. */ 5671 if (seqlen == 1) { 5672 item = PySequence_Fast_GET_ITEM(fseq, 0); 5673 if (PyUnicode_CheckExact(item)) { 5674 Py_INCREF(item); 5675 res = (PyUnicodeObject *)item; 5676 goto Done; 5677 } 5678 } 5679 5680 /* At least two items to join, or one that isn't exact Unicode. */ 5681 if (seqlen > 1) { 5682 /* Set up sep and seplen -- they're needed. */ 5683 if (separator == NULL) { 5684 sep = ␣ 5685 seplen = 1; 5686 } 5687 else { 5688 internal_separator = PyUnicode_FromObject(separator); 5689 if (internal_separator == NULL) 5690 goto onError; 5691 sep = PyUnicode_AS_UNICODE(internal_separator); 5692 seplen = PyUnicode_GET_SIZE(internal_separator); 5693 /* In case PyUnicode_FromObject() mutated seq. */ 5694 seqlen = PySequence_Fast_GET_SIZE(fseq); 5695 } 5696 } 5697 5698 /* Get space. */ 5699 res = _PyUnicode_New(res_alloc); 5700 if (res == NULL) 5701 goto onError; 5702 res_p = PyUnicode_AS_UNICODE(res); 5703 res_used = 0; 5704 5705 for (i = 0; i < seqlen; ++i) { 5706 Py_ssize_t itemlen; 5707 Py_ssize_t new_res_used; 5708 5709 item = PySequence_Fast_GET_ITEM(fseq, i); 5710 /* Convert item to Unicode. */ 5711 if (! PyUnicode_Check(item) && ! 
PyString_Check(item)) { 5712 PyErr_Format(PyExc_TypeError, 5713 "sequence item %zd: expected string or Unicode," 5714 " %.80s found", 5715 i, Py_TYPE(item)->tp_name); 5716 goto onError; 5717 } 5718 item = PyUnicode_FromObject(item); 5719 if (item == NULL) 5720 goto onError; 5721 /* We own a reference to item from here on. */ 5722 5723 /* In case PyUnicode_FromObject() mutated seq. */ 5724 seqlen = PySequence_Fast_GET_SIZE(fseq); 5725 5726 /* Make sure we have enough space for the separator and the item. */ 5727 itemlen = PyUnicode_GET_SIZE(item); 5728 new_res_used = res_used + itemlen; 5729 if (new_res_used < 0) 5730 goto Overflow; 5731 if (i < seqlen - 1) { 5732 new_res_used += seplen; 5733 if (new_res_used < 0) 5734 goto Overflow; 5735 } 5736 if (new_res_used > res_alloc) { 5737 /* double allocated size until it's big enough */ 5738 do { 5739 res_alloc += res_alloc; 5740 if (res_alloc <= 0) 5741 goto Overflow; 5742 } while (new_res_used > res_alloc); 5743 if (_PyUnicode_Resize(&res, res_alloc) < 0) { 5744 Py_DECREF(item); 5745 goto onError; 5746 } 5747 res_p = PyUnicode_AS_UNICODE(res) + res_used; 5748 } 5749 5750 /* Copy item, and maybe the separator. */ 5751 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); 5752 res_p += itemlen; 5753 if (i < seqlen - 1) { 5754 Py_UNICODE_COPY(res_p, sep, seplen); 5755 res_p += seplen; 5756 } 5757 Py_DECREF(item); 5758 res_used = new_res_used; 5759 } 5760 5761 /* Shrink res to match the used area; this probably can't fail, 5762 * but it's cheap to check. 
5763 */ 5764 if (_PyUnicode_Resize(&res, res_used) < 0) 5765 goto onError; 5766 5767 Done: 5768 Py_XDECREF(internal_separator); 5769 Py_DECREF(fseq); 5770 return (PyObject *)res; 5771 5772 Overflow: 5773 PyErr_SetString(PyExc_OverflowError, 5774 "join() result is too long for a Python string"); 5775 Py_DECREF(item); 5776 /* fall through */ 5777 5778 onError: 5779 Py_XDECREF(internal_separator); 5780 Py_DECREF(fseq); 5781 Py_XDECREF(res); 5782 return NULL; 5783 } 5784 5785 static 5786 PyUnicodeObject *pad(PyUnicodeObject *self, 5787 Py_ssize_t left, 5788 Py_ssize_t right, 5789 Py_UNICODE fill) 5790 { 5791 PyUnicodeObject *u; 5792 5793 if (left < 0) 5794 left = 0; 5795 if (right < 0) 5796 right = 0; 5797 5798 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 5799 Py_INCREF(self); 5800 return self; 5801 } 5802 5803 if (left > PY_SSIZE_T_MAX - self->length || 5804 right > PY_SSIZE_T_MAX - (left + self->length)) { 5805 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 5806 return NULL; 5807 } 5808 u = _PyUnicode_New(left + self->length + right); 5809 if (u) { 5810 if (left) 5811 Py_UNICODE_FILL(u->str, fill, left); 5812 Py_UNICODE_COPY(u->str + left, self->str, self->length); 5813 if (right) 5814 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 5815 } 5816 5817 return u; 5818 } 5819 5820 PyObject *PyUnicode_Splitlines(PyObject *string, int keepends) 5821 { 5822 PyObject *list; 5823 5824 string = PyUnicode_FromObject(string); 5825 if (string == NULL) 5826 return NULL; 5827 5828 list = stringlib_splitlines( 5829 (PyObject*) string, PyUnicode_AS_UNICODE(string), 5830 PyUnicode_GET_SIZE(string), keepends); 5831 5832 Py_DECREF(string); 5833 return list; 5834 } 5835 5836 static 5837 PyObject *split(PyUnicodeObject *self, 5838 PyUnicodeObject *substring, 5839 Py_ssize_t maxcount) 5840 { 5841 if (maxcount < 0) 5842 maxcount = PY_SSIZE_T_MAX; 5843 5844 if (substring == NULL) 5845 return stringlib_split_whitespace( 5846 (PyObject*) self, 
self->str, self->length, maxcount 5847 ); 5848 5849 return stringlib_split( 5850 (PyObject*) self, self->str, self->length, 5851 substring->str, substring->length, 5852 maxcount 5853 ); 5854 } 5855 5856 static 5857 PyObject *rsplit(PyUnicodeObject *self, 5858 PyUnicodeObject *substring, 5859 Py_ssize_t maxcount) 5860 { 5861 if (maxcount < 0) 5862 maxcount = PY_SSIZE_T_MAX; 5863 5864 if (substring == NULL) 5865 return stringlib_rsplit_whitespace( 5866 (PyObject*) self, self->str, self->length, maxcount 5867 ); 5868 5869 return stringlib_rsplit( 5870 (PyObject*) self, self->str, self->length, 5871 substring->str, substring->length, 5872 maxcount 5873 ); 5874 } 5875 5876 static 5877 PyObject *replace(PyUnicodeObject *self, 5878 PyUnicodeObject *str1, 5879 PyUnicodeObject *str2, 5880 Py_ssize_t maxcount) 5881 { 5882 PyUnicodeObject *u; 5883 5884 if (maxcount < 0) 5885 maxcount = PY_SSIZE_T_MAX; 5886 else if (maxcount == 0 || self->length == 0) 5887 goto nothing; 5888 5889 if (str1->length == str2->length) { 5890 Py_ssize_t i; 5891 /* same length */ 5892 if (str1->length == 0) 5893 goto nothing; 5894 if (str1->length == 1) { 5895 /* replace characters */ 5896 Py_UNICODE u1, u2; 5897 if (!findchar(self->str, self->length, str1->str[0])) 5898 goto nothing; 5899 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5900 if (!u) 5901 return NULL; 5902 Py_UNICODE_COPY(u->str, self->str, self->length); 5903 u1 = str1->str[0]; 5904 u2 = str2->str[0]; 5905 for (i = 0; i < u->length; i++) 5906 if (u->str[i] == u1) { 5907 if (--maxcount < 0) 5908 break; 5909 u->str[i] = u2; 5910 } 5911 } else { 5912 i = stringlib_find( 5913 self->str, self->length, str1->str, str1->length, 0 5914 ); 5915 if (i < 0) 5916 goto nothing; 5917 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5918 if (!u) 5919 return NULL; 5920 Py_UNICODE_COPY(u->str, self->str, self->length); 5921 5922 /* change everything in-place, starting with this one */ 5923 Py_UNICODE_COPY(u->str+i, 
str2->str, str2->length); 5924 i += str1->length; 5925 5926 while ( --maxcount > 0) { 5927 i = stringlib_find(self->str+i, self->length-i, 5928 str1->str, str1->length, 5929 i); 5930 if (i == -1) 5931 break; 5932 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 5933 i += str1->length; 5934 } 5935 } 5936 } else { 5937 5938 Py_ssize_t n, i, j; 5939 Py_ssize_t product, new_size, delta; 5940 Py_UNICODE *p; 5941 5942 /* replace strings */ 5943 n = stringlib_count(self->str, self->length, str1->str, str1->length, 5944 maxcount); 5945 if (n == 0) 5946 goto nothing; 5947 /* new_size = self->length + n * (str2->length - str1->length)); */ 5948 delta = (str2->length - str1->length); 5949 if (delta == 0) { 5950 new_size = self->length; 5951 } else { 5952 product = n * (str2->length - str1->length); 5953 if ((product / (str2->length - str1->length)) != n) { 5954 PyErr_SetString(PyExc_OverflowError, 5955 "replace string is too long"); 5956 return NULL; 5957 } 5958 new_size = self->length + product; 5959 if (new_size < 0) { 5960 PyErr_SetString(PyExc_OverflowError, 5961 "replace string is too long"); 5962 return NULL; 5963 } 5964 } 5965 u = _PyUnicode_New(new_size); 5966 if (!u) 5967 return NULL; 5968 i = 0; 5969 p = u->str; 5970 if (str1->length > 0) { 5971 while (n-- > 0) { 5972 /* look for next match */ 5973 j = stringlib_find(self->str+i, self->length-i, 5974 str1->str, str1->length, 5975 i); 5976 if (j == -1) 5977 break; 5978 else if (j > i) { 5979 /* copy unchanged part [i:j] */ 5980 Py_UNICODE_COPY(p, self->str+i, j-i); 5981 p += j - i; 5982 } 5983 /* copy substitution string */ 5984 if (str2->length > 0) { 5985 Py_UNICODE_COPY(p, str2->str, str2->length); 5986 p += str2->length; 5987 } 5988 i = j + str1->length; 5989 } 5990 if (i < self->length) 5991 /* copy tail [i:] */ 5992 Py_UNICODE_COPY(p, self->str+i, self->length-i); 5993 } else { 5994 /* interleave */ 5995 while (n > 0) { 5996 Py_UNICODE_COPY(p, str2->str, str2->length); 5997 p += str2->length; 5998 if (--n <= 
0) 5999 break; 6000 *p++ = self->str[i++]; 6001 } 6002 Py_UNICODE_COPY(p, self->str+i, self->length-i); 6003 } 6004 } 6005 return (PyObject *) u; 6006 6007 nothing: 6008 /* nothing to replace; return original string (when possible) */ 6009 if (PyUnicode_CheckExact(self)) { 6010 Py_INCREF(self); 6011 return (PyObject *) self; 6012 } 6013 return PyUnicode_FromUnicode(self->str, self->length); 6014 } 6015 6016 /* --- Unicode Object Methods --------------------------------------------- */ 6017 6018 PyDoc_STRVAR(title__doc__, 6019 "S.title() -> unicode\n\ 6020 \n\ 6021 Return a titlecased version of S, i.e. words start with title case\n\ 6022 characters, all remaining cased characters have lower case."); 6023 6024 static PyObject* 6025 unicode_title(PyUnicodeObject *self) 6026 { 6027 return fixup(self, fixtitle); 6028 } 6029 6030 PyDoc_STRVAR(capitalize__doc__, 6031 "S.capitalize() -> unicode\n\ 6032 \n\ 6033 Return a capitalized version of S, i.e. make the first character\n\ 6034 have upper case and the rest lower case."); 6035 6036 static PyObject* 6037 unicode_capitalize(PyUnicodeObject *self) 6038 { 6039 return fixup(self, fixcapitalize); 6040 } 6041 6042 #if 0 6043 PyDoc_STRVAR(capwords__doc__, 6044 "S.capwords() -> unicode\n\ 6045 \n\ 6046 Apply .capitalize() to all words in S and return the result with\n\ 6047 normalized whitespace (all whitespace strings are replaced by ' ')."); 6048 6049 static PyObject* 6050 unicode_capwords(PyUnicodeObject *self) 6051 { 6052 PyObject *list; 6053 PyObject *item; 6054 Py_ssize_t i; 6055 6056 /* Split into words */ 6057 list = split(self, NULL, -1); 6058 if (!list) 6059 return NULL; 6060 6061 /* Capitalize each word */ 6062 for (i = 0; i < PyList_GET_SIZE(list); i++) { 6063 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 6064 fixcapitalize); 6065 if (item == NULL) 6066 goto onError; 6067 Py_DECREF(PyList_GET_ITEM(list, i)); 6068 PyList_SET_ITEM(list, i, item); 6069 } 6070 6071 /* Join the words to form a new string */ 
6072 item = PyUnicode_Join(NULL, list); 6073 6074 onError: 6075 Py_DECREF(list); 6076 return (PyObject *)item; 6077 } 6078 #endif 6079 6080 /* Argument converter. Coerces to a single unicode character */ 6081 6082 static int 6083 convert_uc(PyObject *obj, void *addr) 6084 { 6085 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; 6086 PyObject *uniobj; 6087 Py_UNICODE *unistr; 6088 6089 uniobj = PyUnicode_FromObject(obj); 6090 if (uniobj == NULL) { 6091 PyErr_SetString(PyExc_TypeError, 6092 "The fill character cannot be converted to Unicode"); 6093 return 0; 6094 } 6095 if (PyUnicode_GET_SIZE(uniobj) != 1) { 6096 PyErr_SetString(PyExc_TypeError, 6097 "The fill character must be exactly one character long"); 6098 Py_DECREF(uniobj); 6099 return 0; 6100 } 6101 unistr = PyUnicode_AS_UNICODE(uniobj); 6102 *fillcharloc = unistr[0]; 6103 Py_DECREF(uniobj); 6104 return 1; 6105 } 6106 6107 PyDoc_STRVAR(center__doc__, 6108 "S.center(width[, fillchar]) -> unicode\n\ 6109 \n\ 6110 Return S centered in a Unicode string of length width. Padding is\n\ 6111 done using the specified fill character (default is a space)"); 6112 6113 static PyObject * 6114 unicode_center(PyUnicodeObject *self, PyObject *args) 6115 { 6116 Py_ssize_t marg, left; 6117 Py_ssize_t width; 6118 Py_UNICODE fillchar = ' '; 6119 6120 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 6121 return NULL; 6122 6123 if (self->length >= width && PyUnicode_CheckExact(self)) { 6124 Py_INCREF(self); 6125 return (PyObject*) self; 6126 } 6127 6128 marg = width - self->length; 6129 left = marg / 2 + (marg & width & 1); 6130 6131 return (PyObject*) pad(self, left, marg - left, fillchar); 6132 } 6133 6134 #if 0 6135 6136 /* This code should go into some future Unicode collation support 6137 module. The basic comparison should compare ordinals on a naive 6138 basis (this is what Java does and thus Jython too). 
*/ 6139 6140 /* speedy UTF-16 code point order comparison */ 6141 /* gleaned from: */ 6142 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 6143 6144 static short utf16Fixup[32] = 6145 { 6146 0, 0, 0, 0, 0, 0, 0, 0, 6147 0, 0, 0, 0, 0, 0, 0, 0, 6148 0, 0, 0, 0, 0, 0, 0, 0, 6149 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 6150 }; 6151 6152 static int 6153 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 6154 { 6155 Py_ssize_t len1, len2; 6156 6157 Py_UNICODE *s1 = str1->str; 6158 Py_UNICODE *s2 = str2->str; 6159 6160 len1 = str1->length; 6161 len2 = str2->length; 6162 6163 while (len1 > 0 && len2 > 0) { 6164 Py_UNICODE c1, c2; 6165 6166 c1 = *s1++; 6167 c2 = *s2++; 6168 6169 if (c1 > (1<<11) * 26) 6170 c1 += utf16Fixup[c1>>11]; 6171 if (c2 > (1<<11) * 26) 6172 c2 += utf16Fixup[c2>>11]; 6173 /* now c1 and c2 are in UTF-32-compatible order */ 6174 6175 if (c1 != c2) 6176 return (c1 < c2) ? -1 : 1; 6177 6178 len1--; len2--; 6179 } 6180 6181 return (len1 < len2) ? -1 : (len1 != len2); 6182 } 6183 6184 #else 6185 6186 static int 6187 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 6188 { 6189 register Py_ssize_t len1, len2; 6190 6191 Py_UNICODE *s1 = str1->str; 6192 Py_UNICODE *s2 = str2->str; 6193 6194 len1 = str1->length; 6195 len2 = str2->length; 6196 6197 while (len1 > 0 && len2 > 0) { 6198 Py_UNICODE c1, c2; 6199 6200 c1 = *s1++; 6201 c2 = *s2++; 6202 6203 if (c1 != c2) 6204 return (c1 < c2) ? -1 : 1; 6205 6206 len1--; len2--; 6207 } 6208 6209 return (len1 < len2) ? 
-1 : (len1 != len2); 6210 } 6211 6212 #endif 6213 6214 int PyUnicode_Compare(PyObject *left, 6215 PyObject *right) 6216 { 6217 PyUnicodeObject *u = NULL, *v = NULL; 6218 int result; 6219 6220 /* Coerce the two arguments */ 6221 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 6222 if (u == NULL) 6223 goto onError; 6224 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 6225 if (v == NULL) 6226 goto onError; 6227 6228 /* Shortcut for empty or interned objects */ 6229 if (v == u) { 6230 Py_DECREF(u); 6231 Py_DECREF(v); 6232 return 0; 6233 } 6234 6235 result = unicode_compare(u, v); 6236 6237 Py_DECREF(u); 6238 Py_DECREF(v); 6239 return result; 6240 6241 onError: 6242 Py_XDECREF(u); 6243 Py_XDECREF(v); 6244 return -1; 6245 } 6246 6247 PyObject *PyUnicode_RichCompare(PyObject *left, 6248 PyObject *right, 6249 int op) 6250 { 6251 int result; 6252 6253 result = PyUnicode_Compare(left, right); 6254 if (result == -1 && PyErr_Occurred()) 6255 goto onError; 6256 6257 /* Convert the return value to a Boolean */ 6258 switch (op) { 6259 case Py_EQ: 6260 result = (result == 0); 6261 break; 6262 case Py_NE: 6263 result = (result != 0); 6264 break; 6265 case Py_LE: 6266 result = (result <= 0); 6267 break; 6268 case Py_GE: 6269 result = (result >= 0); 6270 break; 6271 case Py_LT: 6272 result = (result == -1); 6273 break; 6274 case Py_GT: 6275 result = (result == 1); 6276 break; 6277 } 6278 return PyBool_FromLong(result); 6279 6280 onError: 6281 6282 /* Standard case 6283 6284 Type errors mean that PyUnicode_FromObject() could not convert 6285 one of the arguments (usually the right hand side) to Unicode, 6286 ie. we can't handle the comparison request. However, it is 6287 possible that the other object knows a comparison method, which 6288 is why we return Py_NotImplemented to give the other object a 6289 chance. 
6290 6291 */ 6292 if (PyErr_ExceptionMatches(PyExc_TypeError)) { 6293 PyErr_Clear(); 6294 Py_INCREF(Py_NotImplemented); 6295 return Py_NotImplemented; 6296 } 6297 if (op != Py_EQ && op != Py_NE) 6298 return NULL; 6299 6300 /* Equality comparison. 6301 6302 This is a special case: we silence any PyExc_UnicodeDecodeError 6303 and instead turn it into a PyErr_UnicodeWarning. 6304 6305 */ 6306 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) 6307 return NULL; 6308 PyErr_Clear(); 6309 if (PyErr_Warn(PyExc_UnicodeWarning, 6310 (op == Py_EQ) ? 6311 "Unicode equal comparison " 6312 "failed to convert both arguments to Unicode - " 6313 "interpreting them as being unequal" : 6314 "Unicode unequal comparison " 6315 "failed to convert both arguments to Unicode - " 6316 "interpreting them as being unequal" 6317 ) < 0) 6318 return NULL; 6319 result = (op == Py_NE); 6320 return PyBool_FromLong(result); 6321 } 6322 6323 int PyUnicode_Contains(PyObject *container, 6324 PyObject *element) 6325 { 6326 PyObject *str, *sub; 6327 int result; 6328 6329 /* Coerce the two arguments */ 6330 sub = PyUnicode_FromObject(element); 6331 if (!sub) { 6332 return -1; 6333 } 6334 6335 str = PyUnicode_FromObject(container); 6336 if (!str) { 6337 Py_DECREF(sub); 6338 return -1; 6339 } 6340 6341 result = stringlib_contains_obj(str, sub); 6342 6343 Py_DECREF(str); 6344 Py_DECREF(sub); 6345 6346 return result; 6347 } 6348 6349 /* Concat to string or Unicode object giving a new Unicode object. 
*/ 6350 6351 PyObject *PyUnicode_Concat(PyObject *left, 6352 PyObject *right) 6353 { 6354 PyUnicodeObject *u = NULL, *v = NULL, *w; 6355 6356 /* Coerce the two arguments */ 6357 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 6358 if (u == NULL) 6359 goto onError; 6360 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 6361 if (v == NULL) 6362 goto onError; 6363 6364 /* Shortcuts */ 6365 if (v == unicode_empty) { 6366 Py_DECREF(v); 6367 return (PyObject *)u; 6368 } 6369 if (u == unicode_empty) { 6370 Py_DECREF(u); 6371 return (PyObject *)v; 6372 } 6373 6374 /* Concat the two Unicode strings */ 6375 w = _PyUnicode_New(u->length + v->length); 6376 if (w == NULL) 6377 goto onError; 6378 Py_UNICODE_COPY(w->str, u->str, u->length); 6379 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 6380 6381 Py_DECREF(u); 6382 Py_DECREF(v); 6383 return (PyObject *)w; 6384 6385 onError: 6386 Py_XDECREF(u); 6387 Py_XDECREF(v); 6388 return NULL; 6389 } 6390 6391 PyDoc_STRVAR(count__doc__, 6392 "S.count(sub[, start[, end]]) -> int\n\ 6393 \n\ 6394 Return the number of non-overlapping occurrences of substring sub in\n\ 6395 Unicode string S[start:end]. 
Optional arguments start and end are\n\ 6396 interpreted as in slice notation."); 6397 6398 static PyObject * 6399 unicode_count(PyUnicodeObject *self, PyObject *args) 6400 { 6401 PyUnicodeObject *substring; 6402 Py_ssize_t start = 0; 6403 Py_ssize_t end = PY_SSIZE_T_MAX; 6404 PyObject *result; 6405 6406 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 6407 &start, &end)) 6408 return NULL; 6409 6410 ADJUST_INDICES(start, end, self->length); 6411 result = PyInt_FromSsize_t( 6412 stringlib_count(self->str + start, end - start, 6413 substring->str, substring->length, 6414 PY_SSIZE_T_MAX) 6415 ); 6416 6417 Py_DECREF(substring); 6418 6419 return result; 6420 } 6421 6422 PyDoc_STRVAR(encode__doc__, 6423 "S.encode([encoding[,errors]]) -> string or unicode\n\ 6424 \n\ 6425 Encodes S using the codec registered for encoding. encoding defaults\n\ 6426 to the default encoding. errors may be given to set a different error\n\ 6427 handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 6428 a UnicodeEncodeError. 
Other possible values are 'ignore', 'replace' and\n\ 6429 'xmlcharrefreplace' as well as any other name registered with\n\ 6430 codecs.register_error that can handle UnicodeEncodeErrors."); 6431 6432 static PyObject * 6433 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) 6434 { 6435 static char *kwlist[] = {"encoding", "errors", 0}; 6436 char *encoding = NULL; 6437 char *errors = NULL; 6438 PyObject *v; 6439 6440 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 6441 kwlist, &encoding, &errors)) 6442 return NULL; 6443 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors); 6444 if (v == NULL) 6445 goto onError; 6446 if (!PyString_Check(v) && !PyUnicode_Check(v)) { 6447 PyErr_Format(PyExc_TypeError, 6448 "encoder did not return a string/unicode object " 6449 "(type=%.400s)", 6450 Py_TYPE(v)->tp_name); 6451 Py_DECREF(v); 6452 return NULL; 6453 } 6454 return v; 6455 6456 onError: 6457 return NULL; 6458 } 6459 6460 PyDoc_STRVAR(decode__doc__, 6461 "S.decode([encoding[,errors]]) -> string or unicode\n\ 6462 \n\ 6463 Decodes S using the codec registered for encoding. encoding defaults\n\ 6464 to the default encoding. errors may be given to set a different error\n\ 6465 handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 6466 a UnicodeDecodeError. 
Other possible values are 'ignore' and 'replace'\n\ 6467 as well as any other name registered with codecs.register_error that is\n\ 6468 able to handle UnicodeDecodeErrors."); 6469 6470 static PyObject * 6471 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) 6472 { 6473 static char *kwlist[] = {"encoding", "errors", 0}; 6474 char *encoding = NULL; 6475 char *errors = NULL; 6476 PyObject *v; 6477 6478 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode", 6479 kwlist, &encoding, &errors)) 6480 return NULL; 6481 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors); 6482 if (v == NULL) 6483 goto onError; 6484 if (!PyString_Check(v) && !PyUnicode_Check(v)) { 6485 PyErr_Format(PyExc_TypeError, 6486 "decoder did not return a string/unicode object " 6487 "(type=%.400s)", 6488 Py_TYPE(v)->tp_name); 6489 Py_DECREF(v); 6490 return NULL; 6491 } 6492 return v; 6493 6494 onError: 6495 return NULL; 6496 } 6497 6498 PyDoc_STRVAR(expandtabs__doc__, 6499 "S.expandtabs([tabsize]) -> unicode\n\ 6500 \n\ 6501 Return a copy of S where all tab characters are expanded using spaces.\n\ 6502 If tabsize is not given, a tab size of 8 characters is assumed."); 6503 6504 static PyObject* 6505 unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 6506 { 6507 Py_UNICODE *e; 6508 Py_UNICODE *p; 6509 Py_UNICODE *q; 6510 Py_UNICODE *qe; 6511 Py_ssize_t i, j, incr; 6512 PyUnicodeObject *u; 6513 int tabsize = 8; 6514 6515 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 6516 return NULL; 6517 6518 /* First pass: determine size of output string */ 6519 i = 0; /* chars up to and including most recent \n or \r */ 6520 j = 0; /* chars since most recent \n or \r (use in tab calculations) */ 6521 e = self->str + self->length; /* end of input */ 6522 for (p = self->str; p < e; p++) 6523 if (*p == '\t') { 6524 if (tabsize > 0) { 6525 incr = tabsize - (j % tabsize); /* cannot overflow */ 6526 if (j > PY_SSIZE_T_MAX - incr) 6527 goto overflow1; 6528 j += 
incr; 6529 } 6530 } 6531 else { 6532 if (j > PY_SSIZE_T_MAX - 1) 6533 goto overflow1; 6534 j++; 6535 if (*p == '\n' || *p == '\r') { 6536 if (i > PY_SSIZE_T_MAX - j) 6537 goto overflow1; 6538 i += j; 6539 j = 0; 6540 } 6541 } 6542 6543 if (i > PY_SSIZE_T_MAX - j) 6544 goto overflow1; 6545 6546 /* Second pass: create output string and fill it */ 6547 u = _PyUnicode_New(i + j); 6548 if (!u) 6549 return NULL; 6550 6551 j = 0; /* same as in first pass */ 6552 q = u->str; /* next output char */ 6553 qe = u->str + u->length; /* end of output */ 6554 6555 for (p = self->str; p < e; p++) 6556 if (*p == '\t') { 6557 if (tabsize > 0) { 6558 i = tabsize - (j % tabsize); 6559 j += i; 6560 while (i--) { 6561 if (q >= qe) 6562 goto overflow2; 6563 *q++ = ' '; 6564 } 6565 } 6566 } 6567 else { 6568 if (q >= qe) 6569 goto overflow2; 6570 *q++ = *p; 6571 j++; 6572 if (*p == '\n' || *p == '\r') 6573 j = 0; 6574 } 6575 6576 return (PyObject*) u; 6577 6578 overflow2: 6579 Py_DECREF(u); 6580 overflow1: 6581 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 6582 return NULL; 6583 } 6584 6585 PyDoc_STRVAR(find__doc__, 6586 "S.find(sub [,start [,end]]) -> int\n\ 6587 \n\ 6588 Return the lowest index in S where substring sub is found,\n\ 6589 such that sub is contained within S[start:end]. 
Optional\n\ 6590 arguments start and end are interpreted as in slice notation.\n\ 6591 \n\ 6592 Return -1 on failure."); 6593 6594 static PyObject * 6595 unicode_find(PyUnicodeObject *self, PyObject *args) 6596 { 6597 PyUnicodeObject *substring; 6598 Py_ssize_t start; 6599 Py_ssize_t end; 6600 Py_ssize_t result; 6601 6602 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 6603 &start, &end)) 6604 return NULL; 6605 6606 result = stringlib_find_slice( 6607 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6608 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6609 start, end 6610 ); 6611 6612 Py_DECREF(substring); 6613 6614 return PyInt_FromSsize_t(result); 6615 } 6616 6617 static PyObject * 6618 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) 6619 { 6620 if (index < 0 || index >= self->length) { 6621 PyErr_SetString(PyExc_IndexError, "string index out of range"); 6622 return NULL; 6623 } 6624 6625 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 6626 } 6627 6628 static long 6629 unicode_hash(PyUnicodeObject *self) 6630 { 6631 /* Since Unicode objects compare equal to their ASCII string 6632 counterparts, they should use the individual character values 6633 as basis for their hash value. This is needed to assure that 6634 strings and Unicode objects behave in the same way as 6635 dictionary keys. 
*/ 6636 6637 register Py_ssize_t len; 6638 register Py_UNICODE *p; 6639 register long x; 6640 6641 #ifdef Py_DEBUG 6642 assert(_Py_HashSecret_Initialized); 6643 #endif 6644 if (self->hash != -1) 6645 return self->hash; 6646 len = PyUnicode_GET_SIZE(self); 6647 /* 6648 We make the hash of the empty string be 0, rather than using 6649 (prefix ^ suffix), since this slightly obfuscates the hash secret 6650 */ 6651 if (len == 0) { 6652 self->hash = 0; 6653 return 0; 6654 } 6655 p = PyUnicode_AS_UNICODE(self); 6656 x = _Py_HashSecret.prefix; 6657 x ^= *p << 7; 6658 while (--len >= 0) 6659 x = (1000003*x) ^ *p++; 6660 x ^= PyUnicode_GET_SIZE(self); 6661 x ^= _Py_HashSecret.suffix; 6662 if (x == -1) 6663 x = -2; 6664 self->hash = x; 6665 return x; 6666 } 6667 6668 PyDoc_STRVAR(index__doc__, 6669 "S.index(sub [,start [,end]]) -> int\n\ 6670 \n\ 6671 Like S.find() but raise ValueError when the substring is not found."); 6672 6673 static PyObject * 6674 unicode_index(PyUnicodeObject *self, PyObject *args) 6675 { 6676 Py_ssize_t result; 6677 PyUnicodeObject *substring; 6678 Py_ssize_t start; 6679 Py_ssize_t end; 6680 6681 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 6682 &start, &end)) 6683 return NULL; 6684 6685 result = stringlib_find_slice( 6686 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6687 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6688 start, end 6689 ); 6690 6691 Py_DECREF(substring); 6692 6693 if (result < 0) { 6694 PyErr_SetString(PyExc_ValueError, "substring not found"); 6695 return NULL; 6696 } 6697 6698 return PyInt_FromSsize_t(result); 6699 } 6700 6701 PyDoc_STRVAR(islower__doc__, 6702 "S.islower() -> bool\n\ 6703 \n\ 6704 Return True if all cased characters in S are lowercase and there is\n\ 6705 at least one cased character in S, False otherwise."); 6706 6707 static PyObject* 6708 unicode_islower(PyUnicodeObject *self) 6709 { 6710 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6711 register const 
Py_UNICODE *e; 6712 int cased; 6713 6714 /* Shortcut for single character strings */ 6715 if (PyUnicode_GET_SIZE(self) == 1) 6716 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 6717 6718 /* Special case for empty strings */ 6719 if (PyUnicode_GET_SIZE(self) == 0) 6720 return PyBool_FromLong(0); 6721 6722 e = p + PyUnicode_GET_SIZE(self); 6723 cased = 0; 6724 for (; p < e; p++) { 6725 register const Py_UNICODE ch = *p; 6726 6727 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 6728 return PyBool_FromLong(0); 6729 else if (!cased && Py_UNICODE_ISLOWER(ch)) 6730 cased = 1; 6731 } 6732 return PyBool_FromLong(cased); 6733 } 6734 6735 PyDoc_STRVAR(isupper__doc__, 6736 "S.isupper() -> bool\n\ 6737 \n\ 6738 Return True if all cased characters in S are uppercase and there is\n\ 6739 at least one cased character in S, False otherwise."); 6740 6741 static PyObject* 6742 unicode_isupper(PyUnicodeObject *self) 6743 { 6744 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6745 register const Py_UNICODE *e; 6746 int cased; 6747 6748 /* Shortcut for single character strings */ 6749 if (PyUnicode_GET_SIZE(self) == 1) 6750 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 6751 6752 /* Special case for empty strings */ 6753 if (PyUnicode_GET_SIZE(self) == 0) 6754 return PyBool_FromLong(0); 6755 6756 e = p + PyUnicode_GET_SIZE(self); 6757 cased = 0; 6758 for (; p < e; p++) { 6759 register const Py_UNICODE ch = *p; 6760 6761 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 6762 return PyBool_FromLong(0); 6763 else if (!cased && Py_UNICODE_ISUPPER(ch)) 6764 cased = 1; 6765 } 6766 return PyBool_FromLong(cased); 6767 } 6768 6769 PyDoc_STRVAR(istitle__doc__, 6770 "S.istitle() -> bool\n\ 6771 \n\ 6772 Return True if S is a titlecased string and there is at least one\n\ 6773 character in S, i.e. 
upper- and titlecase characters may only\n\ 6774 follow uncased characters and lowercase characters only cased ones.\n\ 6775 Return False otherwise."); 6776 6777 static PyObject* 6778 unicode_istitle(PyUnicodeObject *self) 6779 { 6780 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6781 register const Py_UNICODE *e; 6782 int cased, previous_is_cased; 6783 6784 /* Shortcut for single character strings */ 6785 if (PyUnicode_GET_SIZE(self) == 1) 6786 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 6787 (Py_UNICODE_ISUPPER(*p) != 0)); 6788 6789 /* Special case for empty strings */ 6790 if (PyUnicode_GET_SIZE(self) == 0) 6791 return PyBool_FromLong(0); 6792 6793 e = p + PyUnicode_GET_SIZE(self); 6794 cased = 0; 6795 previous_is_cased = 0; 6796 for (; p < e; p++) { 6797 register const Py_UNICODE ch = *p; 6798 6799 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 6800 if (previous_is_cased) 6801 return PyBool_FromLong(0); 6802 previous_is_cased = 1; 6803 cased = 1; 6804 } 6805 else if (Py_UNICODE_ISLOWER(ch)) { 6806 if (!previous_is_cased) 6807 return PyBool_FromLong(0); 6808 previous_is_cased = 1; 6809 cased = 1; 6810 } 6811 else 6812 previous_is_cased = 0; 6813 } 6814 return PyBool_FromLong(cased); 6815 } 6816 6817 PyDoc_STRVAR(isspace__doc__, 6818 "S.isspace() -> bool\n\ 6819 \n\ 6820 Return True if all characters in S are whitespace\n\ 6821 and there is at least one character in S, False otherwise."); 6822 6823 static PyObject* 6824 unicode_isspace(PyUnicodeObject *self) 6825 { 6826 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6827 register const Py_UNICODE *e; 6828 6829 /* Shortcut for single character strings */ 6830 if (PyUnicode_GET_SIZE(self) == 1 && 6831 Py_UNICODE_ISSPACE(*p)) 6832 return PyBool_FromLong(1); 6833 6834 /* Special case for empty strings */ 6835 if (PyUnicode_GET_SIZE(self) == 0) 6836 return PyBool_FromLong(0); 6837 6838 e = p + PyUnicode_GET_SIZE(self); 6839 for (; p < e; p++) { 6840 if 
(!Py_UNICODE_ISSPACE(*p)) 6841 return PyBool_FromLong(0); 6842 } 6843 return PyBool_FromLong(1); 6844 } 6845 6846 PyDoc_STRVAR(isalpha__doc__, 6847 "S.isalpha() -> bool\n\ 6848 \n\ 6849 Return True if all characters in S are alphabetic\n\ 6850 and there is at least one character in S, False otherwise."); 6851 6852 static PyObject* 6853 unicode_isalpha(PyUnicodeObject *self) 6854 { 6855 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6856 register const Py_UNICODE *e; 6857 6858 /* Shortcut for single character strings */ 6859 if (PyUnicode_GET_SIZE(self) == 1 && 6860 Py_UNICODE_ISALPHA(*p)) 6861 return PyBool_FromLong(1); 6862 6863 /* Special case for empty strings */ 6864 if (PyUnicode_GET_SIZE(self) == 0) 6865 return PyBool_FromLong(0); 6866 6867 e = p + PyUnicode_GET_SIZE(self); 6868 for (; p < e; p++) { 6869 if (!Py_UNICODE_ISALPHA(*p)) 6870 return PyBool_FromLong(0); 6871 } 6872 return PyBool_FromLong(1); 6873 } 6874 6875 PyDoc_STRVAR(isalnum__doc__, 6876 "S.isalnum() -> bool\n\ 6877 \n\ 6878 Return True if all characters in S are alphanumeric\n\ 6879 and there is at least one character in S, False otherwise."); 6880 6881 static PyObject* 6882 unicode_isalnum(PyUnicodeObject *self) 6883 { 6884 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6885 register const Py_UNICODE *e; 6886 6887 /* Shortcut for single character strings */ 6888 if (PyUnicode_GET_SIZE(self) == 1 && 6889 Py_UNICODE_ISALNUM(*p)) 6890 return PyBool_FromLong(1); 6891 6892 /* Special case for empty strings */ 6893 if (PyUnicode_GET_SIZE(self) == 0) 6894 return PyBool_FromLong(0); 6895 6896 e = p + PyUnicode_GET_SIZE(self); 6897 for (; p < e; p++) { 6898 if (!Py_UNICODE_ISALNUM(*p)) 6899 return PyBool_FromLong(0); 6900 } 6901 return PyBool_FromLong(1); 6902 } 6903 6904 PyDoc_STRVAR(isdecimal__doc__, 6905 "S.isdecimal() -> bool\n\ 6906 \n\ 6907 Return True if there are only decimal characters in S,\n\ 6908 False otherwise."); 6909 6910 static PyObject* 6911 
unicode_isdecimal(PyUnicodeObject *self) 6912 { 6913 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6914 register const Py_UNICODE *e; 6915 6916 /* Shortcut for single character strings */ 6917 if (PyUnicode_GET_SIZE(self) == 1 && 6918 Py_UNICODE_ISDECIMAL(*p)) 6919 return PyBool_FromLong(1); 6920 6921 /* Special case for empty strings */ 6922 if (PyUnicode_GET_SIZE(self) == 0) 6923 return PyBool_FromLong(0); 6924 6925 e = p + PyUnicode_GET_SIZE(self); 6926 for (; p < e; p++) { 6927 if (!Py_UNICODE_ISDECIMAL(*p)) 6928 return PyBool_FromLong(0); 6929 } 6930 return PyBool_FromLong(1); 6931 } 6932 6933 PyDoc_STRVAR(isdigit__doc__, 6934 "S.isdigit() -> bool\n\ 6935 \n\ 6936 Return True if all characters in S are digits\n\ 6937 and there is at least one character in S, False otherwise."); 6938 6939 static PyObject* 6940 unicode_isdigit(PyUnicodeObject *self) 6941 { 6942 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6943 register const Py_UNICODE *e; 6944 6945 /* Shortcut for single character strings */ 6946 if (PyUnicode_GET_SIZE(self) == 1 && 6947 Py_UNICODE_ISDIGIT(*p)) 6948 return PyBool_FromLong(1); 6949 6950 /* Special case for empty strings */ 6951 if (PyUnicode_GET_SIZE(self) == 0) 6952 return PyBool_FromLong(0); 6953 6954 e = p + PyUnicode_GET_SIZE(self); 6955 for (; p < e; p++) { 6956 if (!Py_UNICODE_ISDIGIT(*p)) 6957 return PyBool_FromLong(0); 6958 } 6959 return PyBool_FromLong(1); 6960 } 6961 6962 PyDoc_STRVAR(isnumeric__doc__, 6963 "S.isnumeric() -> bool\n\ 6964 \n\ 6965 Return True if there are only numeric characters in S,\n\ 6966 False otherwise."); 6967 6968 static PyObject* 6969 unicode_isnumeric(PyUnicodeObject *self) 6970 { 6971 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6972 register const Py_UNICODE *e; 6973 6974 /* Shortcut for single character strings */ 6975 if (PyUnicode_GET_SIZE(self) == 1 && 6976 Py_UNICODE_ISNUMERIC(*p)) 6977 return PyBool_FromLong(1); 6978 6979 /* Special case for empty strings */ 
6980 if (PyUnicode_GET_SIZE(self) == 0) 6981 return PyBool_FromLong(0); 6982 6983 e = p + PyUnicode_GET_SIZE(self); 6984 for (; p < e; p++) { 6985 if (!Py_UNICODE_ISNUMERIC(*p)) 6986 return PyBool_FromLong(0); 6987 } 6988 return PyBool_FromLong(1); 6989 } 6990 6991 PyDoc_STRVAR(join__doc__, 6992 "S.join(iterable) -> unicode\n\ 6993 \n\ 6994 Return a string which is the concatenation of the strings in the\n\ 6995 iterable. The separator between elements is S."); 6996 6997 static PyObject* 6998 unicode_join(PyObject *self, PyObject *data) 6999 { 7000 return PyUnicode_Join(self, data); 7001 } 7002 7003 static Py_ssize_t 7004 unicode_length(PyUnicodeObject *self) 7005 { 7006 return self->length; 7007 } 7008 7009 PyDoc_STRVAR(ljust__doc__, 7010 "S.ljust(width[, fillchar]) -> int\n\ 7011 \n\ 7012 Return S left-justified in a Unicode string of length width. Padding is\n\ 7013 done using the specified fill character (default is a space)."); 7014 7015 static PyObject * 7016 unicode_ljust(PyUnicodeObject *self, PyObject *args) 7017 { 7018 Py_ssize_t width; 7019 Py_UNICODE fillchar = ' '; 7020 7021 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 7022 return NULL; 7023 7024 if (self->length >= width && PyUnicode_CheckExact(self)) { 7025 Py_INCREF(self); 7026 return (PyObject*) self; 7027 } 7028 7029 return (PyObject*) pad(self, 0, width - self->length, fillchar); 7030 } 7031 7032 PyDoc_STRVAR(lower__doc__, 7033 "S.lower() -> unicode\n\ 7034 \n\ 7035 Return a copy of the string S converted to lowercase."); 7036 7037 static PyObject* 7038 unicode_lower(PyUnicodeObject *self) 7039 { 7040 return fixup(self, fixlower); 7041 } 7042 7043 #define LEFTSTRIP 0 7044 #define RIGHTSTRIP 1 7045 #define BOTHSTRIP 2 7046 7047 /* Arrays indexed by above */ 7048 static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 7049 7050 #define STRIPNAME(i) (stripformat[i]+3) 7051 7052 /* externally visible for str.strip(unicode) */ 7053 PyObject * 7054 
/* Strip whitespace (as defined by Py_UNICODE_ISSPACE) from self.

   striptype selects the side(s): LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP.
   When nothing is removed and self is an exact unicode instance, self is
   returned with an extra reference; otherwise a new unicode object
   holding the remaining characters is returned. */
static PyObject *
do_strip(PyUnicodeObject *self, int striptype)
{
    Py_UNICODE *text = PyUnicode_AS_UNICODE(self);
    const Py_ssize_t size = PyUnicode_GET_SIZE(self);
    Py_ssize_t first = 0;    /* index of the first character kept */
    Py_ssize_t stop = size;  /* one past the last character kept */

    if (striptype != RIGHTSTRIP) {
        for (; first < size && Py_UNICODE_ISSPACE(text[first]); first++)
            ;
    }

    if (striptype != LEFTSTRIP) {
        /* never move below first, so an all-whitespace string ends up
           with stop == first */
        while (stop > first && Py_UNICODE_ISSPACE(text[stop - 1]))
            stop--;
    }

    if (first == 0 && stop == size && PyUnicode_CheckExact(self)) {
        Py_INCREF(self);
        return (PyObject*)self;
    }
    return PyUnicode_FromUnicode(text + first, stop - first);
}
NULL; 7134 res = _PyUnicode_XStrip(self, striptype, sep); 7135 Py_DECREF(sep); 7136 return res; 7137 } 7138 else { 7139 PyErr_Format(PyExc_TypeError, 7140 "%s arg must be None, unicode or str", 7141 STRIPNAME(striptype)); 7142 return NULL; 7143 } 7144 } 7145 7146 return do_strip(self, striptype); 7147 } 7148 7149 7150 PyDoc_STRVAR(strip__doc__, 7151 "S.strip([chars]) -> unicode\n\ 7152 \n\ 7153 Return a copy of the string S with leading and trailing\n\ 7154 whitespace removed.\n\ 7155 If chars is given and not None, remove characters in chars instead.\n\ 7156 If chars is a str, it will be converted to unicode before stripping"); 7157 7158 static PyObject * 7159 unicode_strip(PyUnicodeObject *self, PyObject *args) 7160 { 7161 if (PyTuple_GET_SIZE(args) == 0) 7162 return do_strip(self, BOTHSTRIP); /* Common case */ 7163 else 7164 return do_argstrip(self, BOTHSTRIP, args); 7165 } 7166 7167 7168 PyDoc_STRVAR(lstrip__doc__, 7169 "S.lstrip([chars]) -> unicode\n\ 7170 \n\ 7171 Return a copy of the string S with leading whitespace removed.\n\ 7172 If chars is given and not None, remove characters in chars instead.\n\ 7173 If chars is a str, it will be converted to unicode before stripping"); 7174 7175 static PyObject * 7176 unicode_lstrip(PyUnicodeObject *self, PyObject *args) 7177 { 7178 if (PyTuple_GET_SIZE(args) == 0) 7179 return do_strip(self, LEFTSTRIP); /* Common case */ 7180 else 7181 return do_argstrip(self, LEFTSTRIP, args); 7182 } 7183 7184 7185 PyDoc_STRVAR(rstrip__doc__, 7186 "S.rstrip([chars]) -> unicode\n\ 7187 \n\ 7188 Return a copy of the string S with trailing whitespace removed.\n\ 7189 If chars is given and not None, remove characters in chars instead.\n\ 7190 If chars is a str, it will be converted to unicode before stripping"); 7191 7192 static PyObject * 7193 unicode_rstrip(PyUnicodeObject *self, PyObject *args) 7194 { 7195 if (PyTuple_GET_SIZE(args) == 0) 7196 return do_strip(self, RIGHTSTRIP); /* Common case */ 7197 else 7198 return 
do_argstrip(self, RIGHTSTRIP, args); 7199 } 7200 7201 7202 static PyObject* 7203 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 7204 { 7205 PyUnicodeObject *u; 7206 Py_UNICODE *p; 7207 Py_ssize_t nchars; 7208 size_t nbytes; 7209 7210 if (len < 0) 7211 len = 0; 7212 7213 if (len == 1 && PyUnicode_CheckExact(str)) { 7214 /* no repeat, return original string */ 7215 Py_INCREF(str); 7216 return (PyObject*) str; 7217 } 7218 7219 /* ensure # of chars needed doesn't overflow int and # of bytes 7220 * needed doesn't overflow size_t 7221 */ 7222 nchars = len * str->length; 7223 if (len && nchars / len != str->length) { 7224 PyErr_SetString(PyExc_OverflowError, 7225 "repeated string is too long"); 7226 return NULL; 7227 } 7228 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 7229 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 7230 PyErr_SetString(PyExc_OverflowError, 7231 "repeated string is too long"); 7232 return NULL; 7233 } 7234 u = _PyUnicode_New(nchars); 7235 if (!u) 7236 return NULL; 7237 7238 p = u->str; 7239 7240 if (str->length == 1 && len > 0) { 7241 Py_UNICODE_FILL(p, str->str[0], len); 7242 } else { 7243 Py_ssize_t done = 0; /* number of characters copied this far */ 7244 if (done < nchars) { 7245 Py_UNICODE_COPY(p, str->str, str->length); 7246 done = str->length; 7247 } 7248 while (done < nchars) { 7249 Py_ssize_t n = (done <= nchars-done) ? 
done : nchars-done; 7250 Py_UNICODE_COPY(p+done, p, n); 7251 done += n; 7252 } 7253 } 7254 7255 return (PyObject*) u; 7256 } 7257 7258 PyObject *PyUnicode_Replace(PyObject *obj, 7259 PyObject *subobj, 7260 PyObject *replobj, 7261 Py_ssize_t maxcount) 7262 { 7263 PyObject *self; 7264 PyObject *str1; 7265 PyObject *str2; 7266 PyObject *result; 7267 7268 self = PyUnicode_FromObject(obj); 7269 if (self == NULL) 7270 return NULL; 7271 str1 = PyUnicode_FromObject(subobj); 7272 if (str1 == NULL) { 7273 Py_DECREF(self); 7274 return NULL; 7275 } 7276 str2 = PyUnicode_FromObject(replobj); 7277 if (str2 == NULL) { 7278 Py_DECREF(self); 7279 Py_DECREF(str1); 7280 return NULL; 7281 } 7282 result = replace((PyUnicodeObject *)self, 7283 (PyUnicodeObject *)str1, 7284 (PyUnicodeObject *)str2, 7285 maxcount); 7286 Py_DECREF(self); 7287 Py_DECREF(str1); 7288 Py_DECREF(str2); 7289 return result; 7290 } 7291 7292 PyDoc_STRVAR(replace__doc__, 7293 "S.replace(old, new[, count]) -> unicode\n\ 7294 \n\ 7295 Return a copy of S with all occurrences of substring\n\ 7296 old replaced by new. 
If the optional argument count is\n\ 7297 given, only the first count occurrences are replaced."); 7298 7299 static PyObject* 7300 unicode_replace(PyUnicodeObject *self, PyObject *args) 7301 { 7302 PyUnicodeObject *str1; 7303 PyUnicodeObject *str2; 7304 Py_ssize_t maxcount = -1; 7305 PyObject *result; 7306 7307 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 7308 return NULL; 7309 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 7310 if (str1 == NULL) 7311 return NULL; 7312 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 7313 if (str2 == NULL) { 7314 Py_DECREF(str1); 7315 return NULL; 7316 } 7317 7318 result = replace(self, str1, str2, maxcount); 7319 7320 Py_DECREF(str1); 7321 Py_DECREF(str2); 7322 return result; 7323 } 7324 7325 static 7326 PyObject *unicode_repr(PyObject *unicode) 7327 { 7328 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode), 7329 PyUnicode_GET_SIZE(unicode), 7330 1); 7331 } 7332 7333 PyDoc_STRVAR(rfind__doc__, 7334 "S.rfind(sub [,start [,end]]) -> int\n\ 7335 \n\ 7336 Return the highest index in S where substring sub is found,\n\ 7337 such that sub is contained within S[start:end]. 
Optional\n\ 7338 arguments start and end are interpreted as in slice notation.\n\ 7339 \n\ 7340 Return -1 on failure."); 7341 7342 static PyObject * 7343 unicode_rfind(PyUnicodeObject *self, PyObject *args) 7344 { 7345 PyUnicodeObject *substring; 7346 Py_ssize_t start; 7347 Py_ssize_t end; 7348 Py_ssize_t result; 7349 7350 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 7351 &start, &end)) 7352 return NULL; 7353 7354 result = stringlib_rfind_slice( 7355 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7356 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7357 start, end 7358 ); 7359 7360 Py_DECREF(substring); 7361 7362 return PyInt_FromSsize_t(result); 7363 } 7364 7365 PyDoc_STRVAR(rindex__doc__, 7366 "S.rindex(sub [,start [,end]]) -> int\n\ 7367 \n\ 7368 Like S.rfind() but raise ValueError when the substring is not found."); 7369 7370 static PyObject * 7371 unicode_rindex(PyUnicodeObject *self, PyObject *args) 7372 { 7373 PyUnicodeObject *substring; 7374 Py_ssize_t start; 7375 Py_ssize_t end; 7376 Py_ssize_t result; 7377 7378 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 7379 &start, &end)) 7380 return NULL; 7381 7382 result = stringlib_rfind_slice( 7383 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7384 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7385 start, end 7386 ); 7387 7388 Py_DECREF(substring); 7389 7390 if (result < 0) { 7391 PyErr_SetString(PyExc_ValueError, "substring not found"); 7392 return NULL; 7393 } 7394 return PyInt_FromSsize_t(result); 7395 } 7396 7397 PyDoc_STRVAR(rjust__doc__, 7398 "S.rjust(width[, fillchar]) -> unicode\n\ 7399 \n\ 7400 Return S right-justified in a Unicode string of length width. 
Padding is\n\ 7401 done using the specified fill character (default is a space)."); 7402 7403 static PyObject * 7404 unicode_rjust(PyUnicodeObject *self, PyObject *args) 7405 { 7406 Py_ssize_t width; 7407 Py_UNICODE fillchar = ' '; 7408 7409 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 7410 return NULL; 7411 7412 if (self->length >= width && PyUnicode_CheckExact(self)) { 7413 Py_INCREF(self); 7414 return (PyObject*) self; 7415 } 7416 7417 return (PyObject*) pad(self, width - self->length, 0, fillchar); 7418 } 7419 7420 static PyObject* 7421 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end) 7422 { 7423 /* standard clamping */ 7424 if (start < 0) 7425 start = 0; 7426 if (end < 0) 7427 end = 0; 7428 if (end > self->length) 7429 end = self->length; 7430 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) { 7431 /* full slice, return original string */ 7432 Py_INCREF(self); 7433 return (PyObject*) self; 7434 } 7435 if (start > end) 7436 start = end; 7437 /* copy slice */ 7438 return (PyObject*) PyUnicode_FromUnicode(self->str + start, 7439 end - start); 7440 } 7441 7442 PyObject *PyUnicode_Split(PyObject *s, 7443 PyObject *sep, 7444 Py_ssize_t maxsplit) 7445 { 7446 PyObject *result; 7447 7448 s = PyUnicode_FromObject(s); 7449 if (s == NULL) 7450 return NULL; 7451 if (sep != NULL) { 7452 sep = PyUnicode_FromObject(sep); 7453 if (sep == NULL) { 7454 Py_DECREF(s); 7455 return NULL; 7456 } 7457 } 7458 7459 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 7460 7461 Py_DECREF(s); 7462 Py_XDECREF(sep); 7463 return result; 7464 } 7465 7466 PyDoc_STRVAR(split__doc__, 7467 "S.split([sep [,maxsplit]]) -> list of strings\n\ 7468 \n\ 7469 Return a list of the words in S, using sep as the\n\ 7470 delimiter string. If maxsplit is given, at most maxsplit\n\ 7471 splits are done. 
If sep is not specified or is None, any\n\ 7472 whitespace string is a separator and empty strings are\n\ 7473 removed from the result."); 7474 7475 static PyObject* 7476 unicode_split(PyUnicodeObject *self, PyObject *args) 7477 { 7478 PyObject *substring = Py_None; 7479 Py_ssize_t maxcount = -1; 7480 7481 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 7482 return NULL; 7483 7484 if (substring == Py_None) 7485 return split(self, NULL, maxcount); 7486 else if (PyUnicode_Check(substring)) 7487 return split(self, (PyUnicodeObject *)substring, maxcount); 7488 else 7489 return PyUnicode_Split((PyObject *)self, substring, maxcount); 7490 } 7491 7492 PyObject * 7493 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 7494 { 7495 PyObject* str_obj; 7496 PyObject* sep_obj; 7497 PyObject* out; 7498 7499 str_obj = PyUnicode_FromObject(str_in); 7500 if (!str_obj) 7501 return NULL; 7502 sep_obj = PyUnicode_FromObject(sep_in); 7503 if (!sep_obj) { 7504 Py_DECREF(str_obj); 7505 return NULL; 7506 } 7507 7508 out = stringlib_partition( 7509 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 7510 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 7511 ); 7512 7513 Py_DECREF(sep_obj); 7514 Py_DECREF(str_obj); 7515 7516 return out; 7517 } 7518 7519 7520 PyObject * 7521 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 7522 { 7523 PyObject* str_obj; 7524 PyObject* sep_obj; 7525 PyObject* out; 7526 7527 str_obj = PyUnicode_FromObject(str_in); 7528 if (!str_obj) 7529 return NULL; 7530 sep_obj = PyUnicode_FromObject(sep_in); 7531 if (!sep_obj) { 7532 Py_DECREF(str_obj); 7533 return NULL; 7534 } 7535 7536 out = stringlib_rpartition( 7537 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 7538 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 7539 ); 7540 7541 Py_DECREF(sep_obj); 7542 Py_DECREF(str_obj); 7543 7544 return out; 7545 } 7546 7547 PyDoc_STRVAR(partition__doc__, 7548 
"S.partition(sep) -> (head, sep, tail)\n\ 7549 \n\ 7550 Search for the separator sep in S, and return the part before it,\n\ 7551 the separator itself, and the part after it. If the separator is not\n\ 7552 found, return S and two empty strings."); 7553 7554 static PyObject* 7555 unicode_partition(PyUnicodeObject *self, PyObject *separator) 7556 { 7557 return PyUnicode_Partition((PyObject *)self, separator); 7558 } 7559 7560 PyDoc_STRVAR(rpartition__doc__, 7561 "S.rpartition(sep) -> (head, sep, tail)\n\ 7562 \n\ 7563 Search for the separator sep in S, starting at the end of S, and return\n\ 7564 the part before it, the separator itself, and the part after it. If the\n\ 7565 separator is not found, return two empty strings and S."); 7566 7567 static PyObject* 7568 unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 7569 { 7570 return PyUnicode_RPartition((PyObject *)self, separator); 7571 } 7572 7573 PyObject *PyUnicode_RSplit(PyObject *s, 7574 PyObject *sep, 7575 Py_ssize_t maxsplit) 7576 { 7577 PyObject *result; 7578 7579 s = PyUnicode_FromObject(s); 7580 if (s == NULL) 7581 return NULL; 7582 if (sep != NULL) { 7583 sep = PyUnicode_FromObject(sep); 7584 if (sep == NULL) { 7585 Py_DECREF(s); 7586 return NULL; 7587 } 7588 } 7589 7590 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 7591 7592 Py_DECREF(s); 7593 Py_XDECREF(sep); 7594 return result; 7595 } 7596 7597 PyDoc_STRVAR(rsplit__doc__, 7598 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\ 7599 \n\ 7600 Return a list of the words in S, using sep as the\n\ 7601 delimiter string, starting at the end of the string and\n\ 7602 working to the front. If maxsplit is given, at most maxsplit\n\ 7603 splits are done. 
If sep is not specified, any whitespace string\n\ 7604 is a separator."); 7605 7606 static PyObject* 7607 unicode_rsplit(PyUnicodeObject *self, PyObject *args) 7608 { 7609 PyObject *substring = Py_None; 7610 Py_ssize_t maxcount = -1; 7611 7612 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 7613 return NULL; 7614 7615 if (substring == Py_None) 7616 return rsplit(self, NULL, maxcount); 7617 else if (PyUnicode_Check(substring)) 7618 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 7619 else 7620 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 7621 } 7622 7623 PyDoc_STRVAR(splitlines__doc__, 7624 "S.splitlines(keepends=False) -> list of strings\n\ 7625 \n\ 7626 Return a list of the lines in S, breaking at line boundaries.\n\ 7627 Line breaks are not included in the resulting list unless keepends\n\ 7628 is given and true."); 7629 7630 static PyObject* 7631 unicode_splitlines(PyUnicodeObject *self, PyObject *args) 7632 { 7633 int keepends = 0; 7634 7635 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 7636 return NULL; 7637 7638 return PyUnicode_Splitlines((PyObject *)self, keepends); 7639 } 7640 7641 static 7642 PyObject *unicode_str(PyUnicodeObject *self) 7643 { 7644 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL); 7645 } 7646 7647 PyDoc_STRVAR(swapcase__doc__, 7648 "S.swapcase() -> unicode\n\ 7649 \n\ 7650 Return a copy of S with uppercase characters converted to lowercase\n\ 7651 and vice versa."); 7652 7653 static PyObject* 7654 unicode_swapcase(PyUnicodeObject *self) 7655 { 7656 return fixup(self, fixswapcase); 7657 } 7658 7659 PyDoc_STRVAR(translate__doc__, 7660 "S.translate(table) -> unicode\n\ 7661 \n\ 7662 Return a copy of the string S, where all characters have been mapped\n\ 7663 through the given translation table, which must be a mapping of\n\ 7664 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\ 7665 Unmapped characters are left untouched. 
Characters mapped to None\n\ 7666 are deleted."); 7667 7668 static PyObject* 7669 unicode_translate(PyUnicodeObject *self, PyObject *table) 7670 { 7671 return PyUnicode_TranslateCharmap(self->str, 7672 self->length, 7673 table, 7674 "ignore"); 7675 } 7676 7677 PyDoc_STRVAR(upper__doc__, 7678 "S.upper() -> unicode\n\ 7679 \n\ 7680 Return a copy of S converted to uppercase."); 7681 7682 static PyObject* 7683 unicode_upper(PyUnicodeObject *self) 7684 { 7685 return fixup(self, fixupper); 7686 } 7687 7688 PyDoc_STRVAR(zfill__doc__, 7689 "S.zfill(width) -> unicode\n\ 7690 \n\ 7691 Pad a numeric string S with zeros on the left, to fill a field\n\ 7692 of the specified width. The string S is never truncated."); 7693 7694 static PyObject * 7695 unicode_zfill(PyUnicodeObject *self, PyObject *args) 7696 { 7697 Py_ssize_t fill; 7698 PyUnicodeObject *u; 7699 7700 Py_ssize_t width; 7701 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 7702 return NULL; 7703 7704 if (self->length >= width) { 7705 if (PyUnicode_CheckExact(self)) { 7706 Py_INCREF(self); 7707 return (PyObject*) self; 7708 } 7709 else 7710 return PyUnicode_FromUnicode( 7711 PyUnicode_AS_UNICODE(self), 7712 PyUnicode_GET_SIZE(self) 7713 ); 7714 } 7715 7716 fill = width - self->length; 7717 7718 u = pad(self, fill, 0, '0'); 7719 7720 if (u == NULL) 7721 return NULL; 7722 7723 if (u->str[fill] == '+' || u->str[fill] == '-') { 7724 /* move sign to beginning of string */ 7725 u->str[0] = u->str[fill]; 7726 u->str[fill] = '0'; 7727 } 7728 7729 return (PyObject*) u; 7730 } 7731 7732 #if 0 7733 static PyObject* 7734 free_listsize(PyUnicodeObject *self) 7735 { 7736 return PyInt_FromLong(numfree); 7737 } 7738 #endif 7739 7740 PyDoc_STRVAR(startswith__doc__, 7741 "S.startswith(prefix[, start[, end]]) -> bool\n\ 7742 \n\ 7743 Return True if S starts with the specified prefix, False otherwise.\n\ 7744 With optional start, test S beginning at that position.\n\ 7745 With optional end, stop comparing S at that position.\n\ 7746 
prefix can also be a tuple of strings to try."); 7747 7748 static PyObject * 7749 unicode_startswith(PyUnicodeObject *self, 7750 PyObject *args) 7751 { 7752 PyObject *subobj; 7753 PyUnicodeObject *substring; 7754 Py_ssize_t start = 0; 7755 Py_ssize_t end = PY_SSIZE_T_MAX; 7756 int result; 7757 7758 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 7759 return NULL; 7760 if (PyTuple_Check(subobj)) { 7761 Py_ssize_t i; 7762 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 7763 substring = (PyUnicodeObject *)PyUnicode_FromObject( 7764 PyTuple_GET_ITEM(subobj, i)); 7765 if (substring == NULL) 7766 return NULL; 7767 result = tailmatch(self, substring, start, end, -1); 7768 Py_DECREF(substring); 7769 if (result) { 7770 Py_RETURN_TRUE; 7771 } 7772 } 7773 /* nothing matched */ 7774 Py_RETURN_FALSE; 7775 } 7776 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 7777 if (substring == NULL) { 7778 if (PyErr_ExceptionMatches(PyExc_TypeError)) 7779 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, " 7780 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name); 7781 return NULL; 7782 } 7783 result = tailmatch(self, substring, start, end, -1); 7784 Py_DECREF(substring); 7785 return PyBool_FromLong(result); 7786 } 7787 7788 7789 PyDoc_STRVAR(endswith__doc__, 7790 "S.endswith(suffix[, start[, end]]) -> bool\n\ 7791 \n\ 7792 Return True if S ends with the specified suffix, False otherwise.\n\ 7793 With optional start, test S beginning at that position.\n\ 7794 With optional end, stop comparing S at that position.\n\ 7795 suffix can also be a tuple of strings to try."); 7796 7797 static PyObject * 7798 unicode_endswith(PyUnicodeObject *self, 7799 PyObject *args) 7800 { 7801 PyObject *subobj; 7802 PyUnicodeObject *substring; 7803 Py_ssize_t start = 0; 7804 Py_ssize_t end = PY_SSIZE_T_MAX; 7805 int result; 7806 7807 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 7808 return NULL; 7809 if 
(PyTuple_Check(subobj)) { 7810 Py_ssize_t i; 7811 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 7812 substring = (PyUnicodeObject *)PyUnicode_FromObject( 7813 PyTuple_GET_ITEM(subobj, i)); 7814 if (substring == NULL) 7815 return NULL; 7816 result = tailmatch(self, substring, start, end, +1); 7817 Py_DECREF(substring); 7818 if (result) { 7819 Py_RETURN_TRUE; 7820 } 7821 } 7822 Py_RETURN_FALSE; 7823 } 7824 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 7825 if (substring == NULL) { 7826 if (PyErr_ExceptionMatches(PyExc_TypeError)) 7827 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, " 7828 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name); 7829 return NULL; 7830 } 7831 result = tailmatch(self, substring, start, end, +1); 7832 Py_DECREF(substring); 7833 return PyBool_FromLong(result); 7834 } 7835 7836 7837 /* Implements do_string_format, which is unicode because of stringlib */ 7838 #include "stringlib/string_format.h" 7839 7840 PyDoc_STRVAR(format__doc__, 7841 "S.format(*args, **kwargs) -> unicode\n\ 7842 \n\ 7843 Return a formatted version of S, using substitutions from args and kwargs.\n\ 7844 The substitutions are identified by braces ('{' and '}')."); 7845 7846 static PyObject * 7847 unicode__format__(PyObject *self, PyObject *args) 7848 { 7849 PyObject *format_spec; 7850 PyObject *result = NULL; 7851 PyObject *tmp = NULL; 7852 7853 /* If 2.x, convert format_spec to the same type as value */ 7854 /* This is to allow things like u''.format('') */ 7855 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec)) 7856 goto done; 7857 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) { 7858 PyErr_Format(PyExc_TypeError, "__format__ arg must be str " 7859 "or unicode, not %s", Py_TYPE(format_spec)->tp_name); 7860 goto done; 7861 } 7862 tmp = PyObject_Unicode(format_spec); 7863 if (tmp == NULL) 7864 goto done; 7865 format_spec = tmp; 7866 7867 result = _PyUnicode_FormatAdvanced(self, 7868 
PyUnicode_AS_UNICODE(format_spec), 7869 PyUnicode_GET_SIZE(format_spec)); 7870 done: 7871 Py_XDECREF(tmp); 7872 return result; 7873 } 7874 7875 PyDoc_STRVAR(p_format__doc__, 7876 "S.__format__(format_spec) -> unicode\n\ 7877 \n\ 7878 Return a formatted version of S as described by format_spec."); 7879 7880 static PyObject * 7881 unicode__sizeof__(PyUnicodeObject *v) 7882 { 7883 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) + 7884 sizeof(Py_UNICODE) * (v->length + 1)); 7885 } 7886 7887 PyDoc_STRVAR(sizeof__doc__, 7888 "S.__sizeof__() -> size of S in memory, in bytes\n\ 7889 \n\ 7890 "); 7891 7892 static PyObject * 7893 unicode_getnewargs(PyUnicodeObject *v) 7894 { 7895 return Py_BuildValue("(u#)", v->str, v->length); 7896 } 7897 7898 7899 static PyMethodDef unicode_methods[] = { 7900 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 7901 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 7902 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 7903 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 7904 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 7905 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 7906 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 7907 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 7908 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 7909 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 7910 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 7911 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 7912 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 7913 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 7914 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 7915 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, 
lstrip__doc__}, 7916 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__}, 7917 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */ 7918 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 7919 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 7920 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 7921 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 7922 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 7923 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 7924 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 7925 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 7926 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 7927 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 7928 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 7929 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 7930 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 7931 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 7932 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 7933 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 7934 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 7935 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 7936 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 7937 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 7938 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 7939 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 7940 {"format", (PyCFunction) do_string_format, METH_VARARGS | 
METH_KEYWORDS, format__doc__}, 7941 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 7942 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS}, 7943 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS}, 7944 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 7945 #if 0 7946 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 7947 #endif 7948 7949 #if 0 7950 /* This one is just used for debugging the implementation. */ 7951 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS}, 7952 #endif 7953 7954 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 7955 {NULL, NULL} 7956 }; 7957 7958 static PyObject * 7959 unicode_mod(PyObject *v, PyObject *w) 7960 { 7961 if (!PyUnicode_Check(v)) { 7962 Py_INCREF(Py_NotImplemented); 7963 return Py_NotImplemented; 7964 } 7965 return PyUnicode_Format(v, w); 7966 } 7967 7968 static PyNumberMethods unicode_as_number = { 7969 0, /*nb_add*/ 7970 0, /*nb_subtract*/ 7971 0, /*nb_multiply*/ 7972 0, /*nb_divide*/ 7973 unicode_mod, /*nb_remainder*/ 7974 }; 7975 7976 static PySequenceMethods unicode_as_sequence = { 7977 (lenfunc) unicode_length, /* sq_length */ 7978 PyUnicode_Concat, /* sq_concat */ 7979 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 7980 (ssizeargfunc) unicode_getitem, /* sq_item */ 7981 (ssizessizeargfunc) unicode_slice, /* sq_slice */ 7982 0, /* sq_ass_item */ 7983 0, /* sq_ass_slice */ 7984 PyUnicode_Contains, /* sq_contains */ 7985 }; 7986 7987 static PyObject* 7988 unicode_subscript(PyUnicodeObject* self, PyObject* item) 7989 { 7990 if (PyIndex_Check(item)) { 7991 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 7992 if (i == -1 && PyErr_Occurred()) 7993 return NULL; 7994 if (i < 0) 7995 i += PyUnicode_GET_SIZE(self); 7996 return unicode_getitem(self, i); 7997 } else if (PySlice_Check(item)) { 7998 Py_ssize_t start, stop, step, slicelength, cur, i; 7999 Py_UNICODE* 
source_buf; 8000 Py_UNICODE* result_buf; 8001 PyObject* result; 8002 8003 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self), 8004 &start, &stop, &step, &slicelength) < 0) { 8005 return NULL; 8006 } 8007 8008 if (slicelength <= 0) { 8009 return PyUnicode_FromUnicode(NULL, 0); 8010 } else if (start == 0 && step == 1 && slicelength == self->length && 8011 PyUnicode_CheckExact(self)) { 8012 Py_INCREF(self); 8013 return (PyObject *)self; 8014 } else if (step == 1) { 8015 return PyUnicode_FromUnicode(self->str + start, slicelength); 8016 } else { 8017 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); 8018 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* 8019 sizeof(Py_UNICODE)); 8020 8021 if (result_buf == NULL) 8022 return PyErr_NoMemory(); 8023 8024 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 8025 result_buf[i] = source_buf[cur]; 8026 } 8027 8028 result = PyUnicode_FromUnicode(result_buf, slicelength); 8029 PyObject_FREE(result_buf); 8030 return result; 8031 } 8032 } else { 8033 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 8034 return NULL; 8035 } 8036 } 8037 8038 static PyMappingMethods unicode_as_mapping = { 8039 (lenfunc)unicode_length, /* mp_length */ 8040 (binaryfunc)unicode_subscript, /* mp_subscript */ 8041 (objobjargproc)0, /* mp_ass_subscript */ 8042 }; 8043 8044 static Py_ssize_t 8045 unicode_buffer_getreadbuf(PyUnicodeObject *self, 8046 Py_ssize_t index, 8047 const void **ptr) 8048 { 8049 if (index != 0) { 8050 PyErr_SetString(PyExc_SystemError, 8051 "accessing non-existent unicode segment"); 8052 return -1; 8053 } 8054 *ptr = (void *) self->str; 8055 return PyUnicode_GET_DATA_SIZE(self); 8056 } 8057 8058 static Py_ssize_t 8059 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index, 8060 const void **ptr) 8061 { 8062 PyErr_SetString(PyExc_TypeError, 8063 "cannot use unicode as modifiable buffer"); 8064 return -1; 8065 } 8066 8067 static int 8068 
unicode_buffer_getsegcount(PyUnicodeObject *self, 8069 Py_ssize_t *lenp) 8070 { 8071 if (lenp) 8072 *lenp = PyUnicode_GET_DATA_SIZE(self); 8073 return 1; 8074 } 8075 8076 static Py_ssize_t 8077 unicode_buffer_getcharbuf(PyUnicodeObject *self, 8078 Py_ssize_t index, 8079 const void **ptr) 8080 { 8081 PyObject *str; 8082 8083 if (index != 0) { 8084 PyErr_SetString(PyExc_SystemError, 8085 "accessing non-existent unicode segment"); 8086 return -1; 8087 } 8088 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL); 8089 if (str == NULL) 8090 return -1; 8091 *ptr = (void *) PyString_AS_STRING(str); 8092 return PyString_GET_SIZE(str); 8093 } 8094 8095 /* Helpers for PyUnicode_Format() */ 8096 8097 static PyObject * 8098 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 8099 { 8100 Py_ssize_t argidx = *p_argidx; 8101 if (argidx < arglen) { 8102 (*p_argidx)++; 8103 if (arglen < 0) 8104 return args; 8105 else 8106 return PyTuple_GetItem(args, argidx); 8107 } 8108 PyErr_SetString(PyExc_TypeError, 8109 "not enough arguments for format string"); 8110 return NULL; 8111 } 8112 8113 #define F_LJUST (1<<0) 8114 #define F_SIGN (1<<1) 8115 #define F_BLANK (1<<2) 8116 #define F_ALT (1<<3) 8117 #define F_ZERO (1<<4) 8118 8119 static Py_ssize_t 8120 strtounicode(Py_UNICODE *buffer, const char *charbuffer) 8121 { 8122 register Py_ssize_t i; 8123 Py_ssize_t len = strlen(charbuffer); 8124 for (i = len - 1; i >= 0; i--) 8125 buffer[i] = (Py_UNICODE) charbuffer[i]; 8126 8127 return len; 8128 } 8129 8130 static int 8131 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x) 8132 { 8133 Py_ssize_t result; 8134 8135 PyOS_snprintf((char *)buffer, len, format, x); 8136 result = strtounicode(buffer, (char *)buffer); 8137 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 8138 } 8139 8140 /* XXX To save some code duplication, formatfloat/long/int could have been 8141 shared with stringobject.c, converting from 8-bit to Unicode after the 8142 formatting 
is done. */ 8143 8144 /* Returns a new reference to a PyUnicode object, or NULL on failure. */ 8145 8146 static PyObject * 8147 formatfloat(PyObject *v, int flags, int prec, int type) 8148 { 8149 char *p; 8150 PyObject *result; 8151 double x; 8152 8153 x = PyFloat_AsDouble(v); 8154 if (x == -1.0 && PyErr_Occurred()) 8155 return NULL; 8156 8157 if (prec < 0) 8158 prec = 6; 8159 8160 p = PyOS_double_to_string(x, type, prec, 8161 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); 8162 if (p == NULL) 8163 return NULL; 8164 result = PyUnicode_FromStringAndSize(p, strlen(p)); 8165 PyMem_Free(p); 8166 return result; 8167 } 8168 8169 static PyObject* 8170 formatlong(PyObject *val, int flags, int prec, int type) 8171 { 8172 char *buf; 8173 int i, len; 8174 PyObject *str; /* temporary string object. */ 8175 PyUnicodeObject *result; 8176 8177 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); 8178 if (!str) 8179 return NULL; 8180 result = _PyUnicode_New(len); 8181 if (!result) { 8182 Py_DECREF(str); 8183 return NULL; 8184 } 8185 for (i = 0; i < len; i++) 8186 result->str[i] = buf[i]; 8187 result->str[len] = 0; 8188 Py_DECREF(str); 8189 return (PyObject*)result; 8190 } 8191 8192 static int 8193 formatint(Py_UNICODE *buf, 8194 size_t buflen, 8195 int flags, 8196 int prec, 8197 int type, 8198 PyObject *v) 8199 { 8200 /* fmt = '%#.' + `prec` + 'l' + `type` 8201 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 8202 * + 1 + 1 8203 * = 24 8204 */ 8205 char fmt[64]; /* plenty big enough! 
*/ 8206 char *sign; 8207 long x; 8208 8209 x = PyInt_AsLong(v); 8210 if (x == -1 && PyErr_Occurred()) 8211 return -1; 8212 if (x < 0 && type == 'u') { 8213 type = 'd'; 8214 } 8215 if (x < 0 && (type == 'x' || type == 'X' || type == 'o')) 8216 sign = "-"; 8217 else 8218 sign = ""; 8219 if (prec < 0) 8220 prec = 1; 8221 8222 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal)) 8223 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11 8224 */ 8225 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) { 8226 PyErr_SetString(PyExc_OverflowError, 8227 "formatted integer is too long (precision too large?)"); 8228 return -1; 8229 } 8230 8231 if ((flags & F_ALT) && 8232 (type == 'x' || type == 'X')) { 8233 /* When converting under %#x or %#X, there are a number 8234 * of issues that cause pain: 8235 * - when 0 is being converted, the C standard leaves off 8236 * the '0x' or '0X', which is inconsistent with other 8237 * %#x/%#X conversions and inconsistent with Python's 8238 * hex() function 8239 * - there are platforms that violate the standard and 8240 * convert 0 with the '0x' or '0X' 8241 * (Metrowerks, Compaq Tru64) 8242 * - there are platforms that give '0x' when converting 8243 * under %#X, but convert 0 in accordance with the 8244 * standard (OS/2 EMX) 8245 * 8246 * We can achieve the desired consistency by inserting our 8247 * own '0x' or '0X' prefix, and substituting %x/%X in place 8248 * of %#x/%#X. 8249 * 8250 * Note that this is the same approach as used in 8251 * formatint() in stringobject.c 8252 */ 8253 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c", 8254 sign, type, prec, type); 8255 } 8256 else { 8257 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c", 8258 sign, (flags&F_ALT) ? 
"#" : "", 8259 prec, type); 8260 } 8261 if (sign[0]) 8262 return longtounicode(buf, buflen, fmt, -x); 8263 else 8264 return longtounicode(buf, buflen, fmt, x); 8265 } 8266 8267 static int 8268 formatchar(Py_UNICODE *buf, 8269 size_t buflen, 8270 PyObject *v) 8271 { 8272 PyObject *unistr; 8273 char *str; 8274 /* presume that the buffer is at least 2 characters long */ 8275 if (PyUnicode_Check(v)) { 8276 if (PyUnicode_GET_SIZE(v) != 1) 8277 goto onError; 8278 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 8279 } 8280 8281 else if (PyString_Check(v)) { 8282 if (PyString_GET_SIZE(v) != 1) 8283 goto onError; 8284 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail 8285 with a UnicodeDecodeError if 'char' is not decodable with the 8286 default encoding (usually ASCII, but it might be something else) */ 8287 str = PyString_AS_STRING(v); 8288 if ((unsigned char)str[0] > 0x7F) { 8289 /* the char is not ASCII; try to decode the string using the 8290 default encoding and return -1 to let the UnicodeDecodeError 8291 be raised if the string can't be decoded */ 8292 unistr = PyUnicode_Decode(str, 1, NULL, "strict"); 8293 if (unistr == NULL) 8294 return -1; 8295 buf[0] = PyUnicode_AS_UNICODE(unistr)[0]; 8296 Py_DECREF(unistr); 8297 } 8298 else 8299 buf[0] = (Py_UNICODE)str[0]; 8300 } 8301 8302 else { 8303 /* Integer input truncated to a character */ 8304 long x; 8305 x = PyInt_AsLong(v); 8306 if (x == -1 && PyErr_Occurred()) 8307 goto onError; 8308 #ifdef Py_UNICODE_WIDE 8309 if (x < 0 || x > 0x10ffff) { 8310 PyErr_SetString(PyExc_OverflowError, 8311 "%c arg not in range(0x110000) " 8312 "(wide Python build)"); 8313 return -1; 8314 } 8315 #else 8316 if (x < 0 || x > 0xffff) { 8317 PyErr_SetString(PyExc_OverflowError, 8318 "%c arg not in range(0x10000) " 8319 "(narrow Python build)"); 8320 return -1; 8321 } 8322 #endif 8323 buf[0] = (Py_UNICODE) x; 8324 } 8325 buf[1] = '\0'; 8326 return 1; 8327 8328 onError: 8329 PyErr_SetString(PyExc_TypeError, 8330 "%c requires int or 
char"); 8331 return -1; 8332 } 8333 8334 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 8335 8336 FORMATBUFLEN is the length of the buffer in which the ints & 8337 chars are formatted. XXX This is a magic number. Each formatting 8338 routine does bounds checking to ensure no overflow, but a better 8339 solution may be to malloc a buffer of appropriate size for each 8340 format. For now, the current solution is sufficient. 8341 */ 8342 #define FORMATBUFLEN (size_t)120 8343 8344 PyObject *PyUnicode_Format(PyObject *format, 8345 PyObject *args) 8346 { 8347 Py_UNICODE *fmt, *res; 8348 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; 8349 int args_owned = 0; 8350 PyUnicodeObject *result = NULL; 8351 PyObject *dict = NULL; 8352 PyObject *uformat; 8353 8354 if (format == NULL || args == NULL) { 8355 PyErr_BadInternalCall(); 8356 return NULL; 8357 } 8358 uformat = PyUnicode_FromObject(format); 8359 if (uformat == NULL) 8360 return NULL; 8361 fmt = PyUnicode_AS_UNICODE(uformat); 8362 fmtcnt = PyUnicode_GET_SIZE(uformat); 8363 8364 reslen = rescnt = fmtcnt + 100; 8365 result = _PyUnicode_New(reslen); 8366 if (result == NULL) 8367 goto onError; 8368 res = PyUnicode_AS_UNICODE(result); 8369 8370 if (PyTuple_Check(args)) { 8371 arglen = PyTuple_Size(args); 8372 argidx = 0; 8373 } 8374 else { 8375 arglen = -1; 8376 argidx = -2; 8377 } 8378 if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript && 8379 !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type)) 8380 dict = args; 8381 8382 while (--fmtcnt >= 0) { 8383 if (*fmt != '%') { 8384 if (--rescnt < 0) { 8385 rescnt = fmtcnt + 100; 8386 reslen += rescnt; 8387 if (_PyUnicode_Resize(&result, reslen) < 0) 8388 goto onError; 8389 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 8390 --rescnt; 8391 } 8392 *res++ = *fmt++; 8393 } 8394 else { 8395 /* Got a format specifier */ 8396 int flags = 0; 8397 Py_ssize_t width = -1; 8398 int prec = -1; 8399 Py_UNICODE c = 
'\0'; 8400 Py_UNICODE fill; 8401 int isnumok; 8402 PyObject *v = NULL; 8403 PyObject *temp = NULL; 8404 Py_UNICODE *pbuf; 8405 Py_UNICODE sign; 8406 Py_ssize_t len; 8407 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */ 8408 8409 fmt++; 8410 if (*fmt == '(') { 8411 Py_UNICODE *keystart; 8412 Py_ssize_t keylen; 8413 PyObject *key; 8414 int pcount = 1; 8415 8416 if (dict == NULL) { 8417 PyErr_SetString(PyExc_TypeError, 8418 "format requires a mapping"); 8419 goto onError; 8420 } 8421 ++fmt; 8422 --fmtcnt; 8423 keystart = fmt; 8424 /* Skip over balanced parentheses */ 8425 while (pcount > 0 && --fmtcnt >= 0) { 8426 if (*fmt == ')') 8427 --pcount; 8428 else if (*fmt == '(') 8429 ++pcount; 8430 fmt++; 8431 } 8432 keylen = fmt - keystart - 1; 8433 if (fmtcnt < 0 || pcount > 0) { 8434 PyErr_SetString(PyExc_ValueError, 8435 "incomplete format key"); 8436 goto onError; 8437 } 8438 #if 0 8439 /* keys are converted to strings using UTF-8 and 8440 then looked up since Python uses strings to hold 8441 variables names etc. in its namespaces and we 8442 wouldn't want to break common idioms. 
*/ 8443 key = PyUnicode_EncodeUTF8(keystart, 8444 keylen, 8445 NULL); 8446 #else 8447 key = PyUnicode_FromUnicode(keystart, keylen); 8448 #endif 8449 if (key == NULL) 8450 goto onError; 8451 if (args_owned) { 8452 Py_DECREF(args); 8453 args_owned = 0; 8454 } 8455 args = PyObject_GetItem(dict, key); 8456 Py_DECREF(key); 8457 if (args == NULL) { 8458 goto onError; 8459 } 8460 args_owned = 1; 8461 arglen = -1; 8462 argidx = -2; 8463 } 8464 while (--fmtcnt >= 0) { 8465 switch (c = *fmt++) { 8466 case '-': flags |= F_LJUST; continue; 8467 case '+': flags |= F_SIGN; continue; 8468 case ' ': flags |= F_BLANK; continue; 8469 case '#': flags |= F_ALT; continue; 8470 case '0': flags |= F_ZERO; continue; 8471 } 8472 break; 8473 } 8474 if (c == '*') { 8475 v = getnextarg(args, arglen, &argidx); 8476 if (v == NULL) 8477 goto onError; 8478 if (!PyInt_Check(v)) { 8479 PyErr_SetString(PyExc_TypeError, 8480 "* wants int"); 8481 goto onError; 8482 } 8483 width = PyInt_AsSsize_t(v); 8484 if (width == -1 && PyErr_Occurred()) 8485 goto onError; 8486 if (width < 0) { 8487 flags |= F_LJUST; 8488 width = -width; 8489 } 8490 if (--fmtcnt >= 0) 8491 c = *fmt++; 8492 } 8493 else if (c >= '0' && c <= '9') { 8494 width = c - '0'; 8495 while (--fmtcnt >= 0) { 8496 c = *fmt++; 8497 if (c < '0' || c > '9') 8498 break; 8499 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) { 8500 PyErr_SetString(PyExc_ValueError, 8501 "width too big"); 8502 goto onError; 8503 } 8504 width = width*10 + (c - '0'); 8505 } 8506 } 8507 if (c == '.') { 8508 prec = 0; 8509 if (--fmtcnt >= 0) 8510 c = *fmt++; 8511 if (c == '*') { 8512 v = getnextarg(args, arglen, &argidx); 8513 if (v == NULL) 8514 goto onError; 8515 if (!PyInt_Check(v)) { 8516 PyErr_SetString(PyExc_TypeError, 8517 "* wants int"); 8518 goto onError; 8519 } 8520 prec = _PyInt_AsInt(v); 8521 if (prec == -1 && PyErr_Occurred()) 8522 goto onError; 8523 if (prec < 0) 8524 prec = 0; 8525 if (--fmtcnt >= 0) 8526 c = *fmt++; 8527 } 8528 else if (c >= '0' && c <= 
'9') { 8529 prec = c - '0'; 8530 while (--fmtcnt >= 0) { 8531 c = *fmt++; 8532 if (c < '0' || c > '9') 8533 break; 8534 if (prec > (INT_MAX - ((int)c - '0')) / 10) { 8535 PyErr_SetString(PyExc_ValueError, 8536 "prec too big"); 8537 goto onError; 8538 } 8539 prec = prec*10 + (c - '0'); 8540 } 8541 } 8542 } /* prec */ 8543 if (fmtcnt >= 0) { 8544 if (c == 'h' || c == 'l' || c == 'L') { 8545 if (--fmtcnt >= 0) 8546 c = *fmt++; 8547 } 8548 } 8549 if (fmtcnt < 0) { 8550 PyErr_SetString(PyExc_ValueError, 8551 "incomplete format"); 8552 goto onError; 8553 } 8554 if (c != '%') { 8555 v = getnextarg(args, arglen, &argidx); 8556 if (v == NULL) 8557 goto onError; 8558 } 8559 sign = 0; 8560 fill = ' '; 8561 switch (c) { 8562 8563 case '%': 8564 pbuf = formatbuf; 8565 /* presume that buffer length is at least 1 */ 8566 pbuf[0] = '%'; 8567 len = 1; 8568 break; 8569 8570 case 's': 8571 case 'r': 8572 if (PyUnicode_CheckExact(v) && c == 's') { 8573 temp = v; 8574 Py_INCREF(temp); 8575 } 8576 else { 8577 PyObject *unicode; 8578 if (c == 's') 8579 temp = PyObject_Unicode(v); 8580 else 8581 temp = PyObject_Repr(v); 8582 if (temp == NULL) 8583 goto onError; 8584 if (PyUnicode_Check(temp)) 8585 /* nothing to do */; 8586 else if (PyString_Check(temp)) { 8587 /* convert to string to Unicode */ 8588 unicode = PyUnicode_Decode(PyString_AS_STRING(temp), 8589 PyString_GET_SIZE(temp), 8590 NULL, 8591 "strict"); 8592 Py_DECREF(temp); 8593 temp = unicode; 8594 if (temp == NULL) 8595 goto onError; 8596 } 8597 else { 8598 Py_DECREF(temp); 8599 PyErr_SetString(PyExc_TypeError, 8600 "%s argument has non-string str()"); 8601 goto onError; 8602 } 8603 } 8604 pbuf = PyUnicode_AS_UNICODE(temp); 8605 len = PyUnicode_GET_SIZE(temp); 8606 if (prec >= 0 && len > prec) 8607 len = prec; 8608 break; 8609 8610 case 'i': 8611 case 'd': 8612 case 'u': 8613 case 'o': 8614 case 'x': 8615 case 'X': 8616 if (c == 'i') 8617 c = 'd'; 8618 isnumok = 0; 8619 if (PyNumber_Check(v)) { 8620 PyObject *iobj=NULL; 8621 8622 
if (PyInt_Check(v) || (PyLong_Check(v))) { 8623 iobj = v; 8624 Py_INCREF(iobj); 8625 } 8626 else { 8627 iobj = PyNumber_Int(v); 8628 if (iobj==NULL) iobj = PyNumber_Long(v); 8629 } 8630 if (iobj!=NULL) { 8631 if (PyInt_Check(iobj)) { 8632 isnumok = 1; 8633 pbuf = formatbuf; 8634 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 8635 flags, prec, c, iobj); 8636 Py_DECREF(iobj); 8637 if (len < 0) 8638 goto onError; 8639 sign = 1; 8640 } 8641 else if (PyLong_Check(iobj)) { 8642 isnumok = 1; 8643 temp = formatlong(iobj, flags, prec, c); 8644 Py_DECREF(iobj); 8645 if (!temp) 8646 goto onError; 8647 pbuf = PyUnicode_AS_UNICODE(temp); 8648 len = PyUnicode_GET_SIZE(temp); 8649 sign = 1; 8650 } 8651 else { 8652 Py_DECREF(iobj); 8653 } 8654 } 8655 } 8656 if (!isnumok) { 8657 PyErr_Format(PyExc_TypeError, 8658 "%%%c format: a number is required, " 8659 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 8660 goto onError; 8661 } 8662 if (flags & F_ZERO) 8663 fill = '0'; 8664 break; 8665 8666 case 'e': 8667 case 'E': 8668 case 'f': 8669 case 'F': 8670 case 'g': 8671 case 'G': 8672 temp = formatfloat(v, flags, prec, c); 8673 if (temp == NULL) 8674 goto onError; 8675 pbuf = PyUnicode_AS_UNICODE(temp); 8676 len = PyUnicode_GET_SIZE(temp); 8677 sign = 1; 8678 if (flags & F_ZERO) 8679 fill = '0'; 8680 break; 8681 8682 case 'c': 8683 pbuf = formatbuf; 8684 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 8685 if (len < 0) 8686 goto onError; 8687 break; 8688 8689 default: 8690 PyErr_Format(PyExc_ValueError, 8691 "unsupported format character '%c' (0x%x) " 8692 "at index %zd", 8693 (31<=c && c<=126) ? 
(char)c : '?', 8694 (int)c, 8695 (Py_ssize_t)(fmt - 1 - 8696 PyUnicode_AS_UNICODE(uformat))); 8697 goto onError; 8698 } 8699 if (sign) { 8700 if (*pbuf == '-' || *pbuf == '+') { 8701 sign = *pbuf++; 8702 len--; 8703 } 8704 else if (flags & F_SIGN) 8705 sign = '+'; 8706 else if (flags & F_BLANK) 8707 sign = ' '; 8708 else 8709 sign = 0; 8710 } 8711 if (width < len) 8712 width = len; 8713 if (rescnt - (sign != 0) < width) { 8714 reslen -= rescnt; 8715 rescnt = width + fmtcnt + 100; 8716 reslen += rescnt; 8717 if (reslen < 0) { 8718 Py_XDECREF(temp); 8719 PyErr_NoMemory(); 8720 goto onError; 8721 } 8722 if (_PyUnicode_Resize(&result, reslen) < 0) { 8723 Py_XDECREF(temp); 8724 goto onError; 8725 } 8726 res = PyUnicode_AS_UNICODE(result) 8727 + reslen - rescnt; 8728 } 8729 if (sign) { 8730 if (fill != ' ') 8731 *res++ = sign; 8732 rescnt--; 8733 if (width > len) 8734 width--; 8735 } 8736 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 8737 assert(pbuf[0] == '0'); 8738 assert(pbuf[1] == c); 8739 if (fill != ' ') { 8740 *res++ = *pbuf++; 8741 *res++ = *pbuf++; 8742 } 8743 rescnt -= 2; 8744 width -= 2; 8745 if (width < 0) 8746 width = 0; 8747 len -= 2; 8748 } 8749 if (width > len && !(flags & F_LJUST)) { 8750 do { 8751 --rescnt; 8752 *res++ = fill; 8753 } while (--width > len); 8754 } 8755 if (fill == ' ') { 8756 if (sign) 8757 *res++ = sign; 8758 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 8759 assert(pbuf[0] == '0'); 8760 assert(pbuf[1] == c); 8761 *res++ = *pbuf++; 8762 *res++ = *pbuf++; 8763 } 8764 } 8765 Py_UNICODE_COPY(res, pbuf, len); 8766 res += len; 8767 rescnt -= len; 8768 while (--width >= len) { 8769 --rescnt; 8770 *res++ = ' '; 8771 } 8772 if (dict && (argidx < arglen) && c != '%') { 8773 PyErr_SetString(PyExc_TypeError, 8774 "not all arguments converted during string formatting"); 8775 Py_XDECREF(temp); 8776 goto onError; 8777 } 8778 Py_XDECREF(temp); 8779 } /* '%' */ 8780 } /* until end */ 8781 if (argidx < arglen && !dict) { 8782 
PyErr_SetString(PyExc_TypeError, 8783 "not all arguments converted during string formatting"); 8784 goto onError; 8785 } 8786 8787 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) 8788 goto onError; 8789 if (args_owned) { 8790 Py_DECREF(args); 8791 } 8792 Py_DECREF(uformat); 8793 return (PyObject *)result; 8794 8795 onError: 8796 Py_XDECREF(result); 8797 Py_DECREF(uformat); 8798 if (args_owned) { 8799 Py_DECREF(args); 8800 } 8801 return NULL; 8802 } 8803 8804 static PyBufferProcs unicode_as_buffer = { 8805 (readbufferproc) unicode_buffer_getreadbuf, 8806 (writebufferproc) unicode_buffer_getwritebuf, 8807 (segcountproc) unicode_buffer_getsegcount, 8808 (charbufferproc) unicode_buffer_getcharbuf, 8809 }; 8810 8811 static PyObject * 8812 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 8813 8814 static PyObject * 8815 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 8816 { 8817 PyObject *x = NULL; 8818 static char *kwlist[] = {"string", "encoding", "errors", 0}; 8819 char *encoding = NULL; 8820 char *errors = NULL; 8821 8822 if (type != &PyUnicode_Type) 8823 return unicode_subtype_new(type, args, kwds); 8824 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode", 8825 kwlist, &x, &encoding, &errors)) 8826 return NULL; 8827 if (x == NULL) 8828 return (PyObject *)_PyUnicode_New(0); 8829 if (encoding == NULL && errors == NULL) 8830 return PyObject_Unicode(x); 8831 else 8832 return PyUnicode_FromEncodedObject(x, encoding, errors); 8833 } 8834 8835 static PyObject * 8836 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 8837 { 8838 PyUnicodeObject *tmp, *pnew; 8839 Py_ssize_t n; 8840 8841 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 8842 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 8843 if (tmp == NULL) 8844 return NULL; 8845 assert(PyUnicode_Check(tmp)); 8846 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 8847 if (pnew == NULL) { 8848 Py_DECREF(tmp); 8849 
return NULL; 8850 } 8851 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1)); 8852 if (pnew->str == NULL) { 8853 _Py_ForgetReference((PyObject *)pnew); 8854 PyObject_Del(pnew); 8855 Py_DECREF(tmp); 8856 return PyErr_NoMemory(); 8857 } 8858 Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 8859 pnew->length = n; 8860 pnew->hash = tmp->hash; 8861 Py_DECREF(tmp); 8862 return (PyObject *)pnew; 8863 } 8864 8865 PyDoc_STRVAR(unicode_doc, 8866 "unicode(object='') -> unicode object\n\ 8867 unicode(string[, encoding[, errors]]) -> unicode object\n\ 8868 \n\ 8869 Create a new Unicode object from the given encoded string.\n\ 8870 encoding defaults to the current default string encoding.\n\ 8871 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 8872 8873 PyTypeObject PyUnicode_Type = { 8874 PyVarObject_HEAD_INIT(&PyType_Type, 0) 8875 "unicode", /* tp_name */ 8876 sizeof(PyUnicodeObject), /* tp_size */ 8877 0, /* tp_itemsize */ 8878 /* Slots */ 8879 (destructor)unicode_dealloc, /* tp_dealloc */ 8880 0, /* tp_print */ 8881 0, /* tp_getattr */ 8882 0, /* tp_setattr */ 8883 0, /* tp_compare */ 8884 unicode_repr, /* tp_repr */ 8885 &unicode_as_number, /* tp_as_number */ 8886 &unicode_as_sequence, /* tp_as_sequence */ 8887 &unicode_as_mapping, /* tp_as_mapping */ 8888 (hashfunc) unicode_hash, /* tp_hash*/ 8889 0, /* tp_call*/ 8890 (reprfunc) unicode_str, /* tp_str */ 8891 PyObject_GenericGetAttr, /* tp_getattro */ 8892 0, /* tp_setattro */ 8893 &unicode_as_buffer, /* tp_as_buffer */ 8894 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES | 8895 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 8896 unicode_doc, /* tp_doc */ 8897 0, /* tp_traverse */ 8898 0, /* tp_clear */ 8899 PyUnicode_RichCompare, /* tp_richcompare */ 8900 0, /* tp_weaklistoffset */ 8901 0, /* tp_iter */ 8902 0, /* tp_iternext */ 8903 unicode_methods, /* tp_methods */ 8904 0, /* tp_members */ 8905 0, /* tp_getset */ 8906 &PyBaseString_Type, /* tp_base */ 8907 0, /* 
tp_dict */ 8908 0, /* tp_descr_get */ 8909 0, /* tp_descr_set */ 8910 0, /* tp_dictoffset */ 8911 0, /* tp_init */ 8912 0, /* tp_alloc */ 8913 unicode_new, /* tp_new */ 8914 PyObject_Del, /* tp_free */ 8915 }; 8916 8917 /* Initialize the Unicode implementation */ 8918 8919 void _PyUnicode_Init(void) 8920 { 8921 /* XXX - move this array to unicodectype.c ? */ 8922 Py_UNICODE linebreak[] = { 8923 0x000A, /* LINE FEED */ 8924 0x000D, /* CARRIAGE RETURN */ 8925 0x001C, /* FILE SEPARATOR */ 8926 0x001D, /* GROUP SEPARATOR */ 8927 0x001E, /* RECORD SEPARATOR */ 8928 0x0085, /* NEXT LINE */ 8929 0x2028, /* LINE SEPARATOR */ 8930 0x2029, /* PARAGRAPH SEPARATOR */ 8931 }; 8932 8933 /* Init the implementation */ 8934 if (!unicode_empty) { 8935 unicode_empty = _PyUnicode_New(0); 8936 if (!unicode_empty) 8937 return; 8938 } 8939 8940 if (PyType_Ready(&PyUnicode_Type) < 0) 8941 Py_FatalError("Can't initialize 'unicode'"); 8942 8943 /* initialize the linebreak bloom filter */ 8944 bloom_linebreak = make_bloom_mask( 8945 linebreak, sizeof(linebreak) / sizeof(linebreak[0]) 8946 ); 8947 8948 PyType_Ready(&EncodingMapType); 8949 8950 if (PyType_Ready(&PyFieldNameIter_Type) < 0) 8951 Py_FatalError("Can't initialize field name iterator type"); 8952 8953 if (PyType_Ready(&PyFormatterIter_Type) < 0) 8954 Py_FatalError("Can't initialize formatter iter type"); 8955 } 8956 8957 /* Finalize the Unicode implementation */ 8958 8959 int 8960 PyUnicode_ClearFreeList(void) 8961 { 8962 int freelist_size = numfree; 8963 PyUnicodeObject *u; 8964 8965 for (u = free_list; u != NULL;) { 8966 PyUnicodeObject *v = u; 8967 u = *(PyUnicodeObject **)u; 8968 if (v->str) 8969 PyObject_DEL(v->str); 8970 Py_XDECREF(v->defenc); 8971 PyObject_Del(v); 8972 numfree--; 8973 } 8974 free_list = NULL; 8975 assert(numfree == 0); 8976 return freelist_size; 8977 } 8978 8979 void 8980 _PyUnicode_Fini(void) 8981 { 8982 int i; 8983 8984 Py_CLEAR(unicode_empty); 8985 8986 for (i = 0; i < 256; i++) 8987 
Py_CLEAR(unicode_latin1[i]); 8988 8989 (void)PyUnicode_ClearFreeList(); 8990 } 8991 8992 #ifdef __cplusplus 8993 } 8994 #endif 8995